diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,64468 +1,162884 @@ { - "best_metric": 0.16217102110385895, - "best_model_checkpoint": "wyluilipe/deabuse/checkpoint-9000", - "epoch": 2.742032471437162, - "eval_steps": 120, - "global_step": 9120, + "best_metric": 0.07404118776321411, + "best_model_checkpoint": "wyluilipe/deabuse/checkpoint-22000", + "epoch": 2.9115552348382643, + "eval_steps": 400, + "global_step": 23200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 13.701467514038086, + "grad_norm": 728.5859375, "learning_rate": 2e-05, - "loss": 6.6206, + "loss": 15.4752, "step": 1 }, { "epoch": 0.0, - "grad_norm": 39.5434684753418, - "learning_rate": 1.9997995389395612e-05, - "loss": 4.445, + "grad_norm": 352.8587341308594, + "learning_rate": 1.9999163284943315e-05, + "loss": 15.063, "step": 2 }, { "epoch": 0.0, - "grad_norm": 83.92737579345703, - "learning_rate": 1.9995990778791222e-05, - "loss": 4.277, + "grad_norm": 518.958251953125, + "learning_rate": 1.9998326569886626e-05, + "loss": 11.8864, "step": 3 }, { "epoch": 0.0, - "grad_norm": 19.87366485595703, - "learning_rate": 1.9993986168186832e-05, - "loss": 4.9229, + "grad_norm": 1024.3834228515625, + "learning_rate": 1.999748985482994e-05, + "loss": 11.9749, "step": 4 }, { "epoch": 0.0, - "grad_norm": 22.43378448486328, - "learning_rate": 1.9991981557582442e-05, - "loss": 4.7554, + "grad_norm": 325.7301330566406, + "learning_rate": 1.9996653139773253e-05, + "loss": 13.9641, "step": 5 }, { "epoch": 0.0, - "grad_norm": 35.35092544555664, - "learning_rate": 1.9989976946978052e-05, - "loss": 4.1126, + "grad_norm": 313.9541931152344, + "learning_rate": 1.9995816424716566e-05, + "loss": 10.2194, "step": 6 }, { "epoch": 0.0, - "grad_norm": 133.4622344970703, - "learning_rate": 1.998797233637366e-05, - "loss": 4.5409, + "grad_norm": 358.2284240722656, + "learning_rate": 1.9994979709659877e-05, + "loss": 11.5145, "step": 7 }, { "epoch": 0.0, - "grad_norm": 20.33665657043457, - "learning_rate": 1.9985967725769272e-05, - "loss": 4.0595, + "grad_norm": 365.29931640625, + "learning_rate": 1.999414299460319e-05, + "loss": 10.7059, "step": 8 }, { "epoch": 0.0, - "grad_norm": 85.67984008789062, - "learning_rate": 1.9983963115164882e-05, - "loss": 4.9032, + "grad_norm": 490.46905517578125, + "learning_rate": 1.9993306279546504e-05, + "loss": 11.62, "step": 9 }, { "epoch": 0.0, - "grad_norm": 34.769309997558594, - "learning_rate": 1.998195850456049e-05, - "loss": 5.5253, + "grad_norm": 13291.681640625, + "learning_rate": 1.9992469564489814e-05, + "loss": 12.8367, "step": 10 }, { "epoch": 0.0, - "grad_norm": 40.53438949584961, - "learning_rate": 1.9979953893956102e-05, - "loss": 4.3671, + "grad_norm": 1022.1242065429688, + "learning_rate": 1.9991632849433128e-05, + "loss": 10.0496, "step": 11 }, { "epoch": 0.0, - "grad_norm": 21.633525848388672, - "learning_rate": 1.997794928335171e-05, - "loss": 3.9831, + "grad_norm": 192.495361328125, + "learning_rate": 1.999079613437644e-05, + "loss": 9.0021, "step": 12 }, { "epoch": 0.0, - "grad_norm": 20.7197322845459, - "learning_rate": 1.997594467274732e-05, - "loss": 4.4466, + "grad_norm": 211.52745056152344, + "learning_rate": 1.9989959419319752e-05, + "loss": 9.4134, "step": 13 }, { "epoch": 0.0, - "grad_norm": 110.01347351074219, - "learning_rate": 1.9973940062142933e-05, - "loss": 3.8518, + "grad_norm": 213.8238525390625, + "learning_rate": 1.9989122704263066e-05, + "loss": 10.3802, "step": 14 }, { "epoch": 0.0, - "grad_norm": 19.917098999023438, - "learning_rate": 1.997193545153854e-05, - "loss": 3.7986, + "grad_norm": 220.8242645263672, + "learning_rate": 1.9988285989206376e-05, + "loss": 8.2106, "step": 15 }, { "epoch": 0.0, - "grad_norm": 31.238054275512695, - "learning_rate": 1.996993084093415e-05, - "loss": 3.9456, + "grad_norm": 288.3450622558594, + "learning_rate": 1.998744927414969e-05, + "loss": 9.2149, "step": 16 }, { - "epoch": 0.01, - "grad_norm": 20.226728439331055, - "learning_rate": 1.996792623032976e-05, - "loss": 4.1858, + "epoch": 0.0, + "grad_norm": 99.93714141845703, + "learning_rate": 1.9986612559093003e-05, + "loss": 7.4131, "step": 17 }, { - "epoch": 0.01, - "grad_norm": 28.916229248046875, - "learning_rate": 1.996592161972537e-05, - "loss": 4.2707, + "epoch": 0.0, + "grad_norm": 1925.28076171875, + "learning_rate": 1.9985775844036313e-05, + "loss": 9.6749, "step": 18 }, { - "epoch": 0.01, - "grad_norm": 20.103069305419922, - "learning_rate": 1.996391700912098e-05, - "loss": 5.2733, + "epoch": 0.0, + "grad_norm": 159.0907440185547, + "learning_rate": 1.9984939128979627e-05, + "loss": 7.9085, "step": 19 }, { - "epoch": 0.01, - "grad_norm": 30.481670379638672, - "learning_rate": 1.996191239851659e-05, - "loss": 3.6844, + "epoch": 0.0, + "grad_norm": 196.09454345703125, + "learning_rate": 1.998410241392294e-05, + "loss": 8.3921, "step": 20 }, { - "epoch": 0.01, - "grad_norm": 27.793643951416016, - "learning_rate": 1.99599077879122e-05, - "loss": 4.3403, + "epoch": 0.0, + "grad_norm": 104.76849365234375, + "learning_rate": 1.998326569886625e-05, + "loss": 7.0599, "step": 21 }, { - "epoch": 0.01, - "grad_norm": 32.221065521240234, - "learning_rate": 1.995790317730781e-05, - "loss": 3.9117, + "epoch": 0.0, + "grad_norm": 121.42261505126953, + "learning_rate": 1.9982428983809565e-05, + "loss": 9.2175, "step": 22 }, { - "epoch": 0.01, - "grad_norm": 23.525253295898438, - "learning_rate": 1.995589856670342e-05, - "loss": 3.804, + "epoch": 0.0, + "grad_norm": 118.76298522949219, + "learning_rate": 1.9981592268752878e-05, + "loss": 7.1647, "step": 23 }, { - "epoch": 0.01, - "grad_norm": 16.033878326416016, - "learning_rate": 1.995389395609903e-05, - "loss": 5.166, + "epoch": 0.0, + "grad_norm": 176.77235412597656, + "learning_rate": 1.998075555369619e-05, + "loss": 6.6496, "step": 24 }, { - "epoch": 0.01, - "grad_norm": 22.915647506713867, - "learning_rate": 1.995188934549464e-05, - "loss": 3.5634, + "epoch": 0.0, + "grad_norm": 58.71125030517578, + "learning_rate": 1.9979918838639502e-05, + "loss": 6.4695, "step": 25 }, { - "epoch": 0.01, - "grad_norm": 18.16558837890625, - "learning_rate": 1.994988473489025e-05, - "loss": 4.9719, + "epoch": 0.0, + "grad_norm": 664.5093383789062, + "learning_rate": 1.9979082123582816e-05, + "loss": 10.4393, "step": 26 }, { - "epoch": 0.01, - "grad_norm": 31.655603408813477, - "learning_rate": 1.994788012428586e-05, - "loss": 5.1661, + "epoch": 0.0, + "grad_norm": 114.95386505126953, + "learning_rate": 1.997824540852613e-05, + "loss": 7.0759, "step": 27 }, { - "epoch": 0.01, - "grad_norm": 18.09580421447754, - "learning_rate": 1.994587551368147e-05, - "loss": 4.4041, + "epoch": 0.0, + "grad_norm": 45.126991271972656, + "learning_rate": 1.997740869346944e-05, + "loss": 6.3129, "step": 28 }, { - "epoch": 0.01, - "grad_norm": 32.707950592041016, - "learning_rate": 1.9943870903077077e-05, - "loss": 4.5414, + "epoch": 0.0, + "grad_norm": 374.5563659667969, + "learning_rate": 1.9976571978412753e-05, + "loss": 5.6168, "step": 29 }, { - "epoch": 0.01, - "grad_norm": 53.21809005737305, - "learning_rate": 1.994186629247269e-05, - "loss": 4.2779, + "epoch": 0.0, + "grad_norm": 1181.4608154296875, + "learning_rate": 1.9975735263356067e-05, + "loss": 9.343, "step": 30 }, { - "epoch": 0.01, - "grad_norm": 36.61603927612305, - "learning_rate": 1.9939861681868297e-05, - "loss": 4.0579, + "epoch": 0.0, + "grad_norm": 283.9852294921875, + "learning_rate": 1.9974898548299377e-05, + "loss": 7.2791, "step": 31 }, { - "epoch": 0.01, - "grad_norm": 21.24146842956543, - "learning_rate": 1.9937857071263907e-05, - "loss": 4.4486, + "epoch": 0.0, + "grad_norm": 164.09121704101562, + "learning_rate": 1.997406183324269e-05, + "loss": 7.4634, "step": 32 }, { - "epoch": 0.01, - "grad_norm": 45.33989715576172, - "learning_rate": 1.993585246065952e-05, - "loss": 4.7946, + "epoch": 0.0, + "grad_norm": 409.6220703125, + "learning_rate": 1.9973225118186005e-05, + "loss": 6.7892, "step": 33 }, { - "epoch": 0.01, - "grad_norm": 36.568145751953125, - "learning_rate": 1.9933847850055128e-05, - "loss": 4.6392, + "epoch": 0.0, + "grad_norm": 1974.9354248046875, + "learning_rate": 1.9972388403129318e-05, + "loss": 8.4132, "step": 34 }, { - "epoch": 0.01, - "grad_norm": 19.171186447143555, - "learning_rate": 1.9931843239450738e-05, - "loss": 4.0399, + "epoch": 0.0, + "grad_norm": 105.84317016601562, + "learning_rate": 1.997155168807263e-05, + "loss": 6.9896, "step": 35 }, { - "epoch": 0.01, - "grad_norm": 28.191804885864258, - "learning_rate": 1.9929838628846348e-05, - "loss": 4.9894, + "epoch": 0.0, + "grad_norm": 74.51609802246094, + "learning_rate": 1.9970714973015942e-05, + "loss": 7.7109, "step": 36 }, { - "epoch": 0.01, - "grad_norm": 56.99246597290039, - "learning_rate": 1.9927834018241958e-05, - "loss": 3.7354, + "epoch": 0.0, + "grad_norm": 491.32171630859375, + "learning_rate": 1.9969878257959256e-05, + "loss": 7.9236, "step": 37 }, { - "epoch": 0.01, - "grad_norm": 25.940950393676758, - "learning_rate": 1.9925829407637568e-05, - "loss": 3.4817, + "epoch": 0.0, + "grad_norm": 97.33172607421875, + "learning_rate": 1.9969041542902566e-05, + "loss": 5.2768, "step": 38 }, { - "epoch": 0.01, - "grad_norm": 38.012001037597656, - "learning_rate": 1.9923824797033178e-05, - "loss": 3.5887, + "epoch": 0.0, + "grad_norm": 187.27645874023438, + "learning_rate": 1.996820482784588e-05, + "loss": 5.6723, "step": 39 }, { "epoch": 0.01, - "grad_norm": 23.125755310058594, - "learning_rate": 1.9921820186428788e-05, - "loss": 4.8167, + "grad_norm": 143.09710693359375, + "learning_rate": 1.9967368112789193e-05, + "loss": 7.2103, "step": 40 }, { "epoch": 0.01, - "grad_norm": 25.253414154052734, - "learning_rate": 1.9919815575824398e-05, - "loss": 3.8167, + "grad_norm": 127.5011978149414, + "learning_rate": 1.9966531397732504e-05, + "loss": 6.2016, "step": 41 }, { "epoch": 0.01, - "grad_norm": 18.642539978027344, - "learning_rate": 1.991781096522001e-05, - "loss": 3.5483, + "grad_norm": 258.0473937988281, + "learning_rate": 1.9965694682675817e-05, + "loss": 6.897, "step": 42 }, { "epoch": 0.01, - "grad_norm": 102.15142059326172, - "learning_rate": 1.9915806354615615e-05, - "loss": 5.1262, + "grad_norm": 223.92303466796875, + "learning_rate": 1.9964857967619127e-05, + "loss": 10.1903, "step": 43 }, { "epoch": 0.01, - "grad_norm": 24.280908584594727, - "learning_rate": 1.991380174401123e-05, - "loss": 4.2309, + "grad_norm": 69.66583251953125, + "learning_rate": 1.996402125256244e-05, + "loss": 6.5244, "step": 44 }, { "epoch": 0.01, - "grad_norm": 18.596893310546875, - "learning_rate": 1.991179713340684e-05, - "loss": 3.3356, + "grad_norm": 292.11639404296875, + "learning_rate": 1.9963184537505755e-05, + "loss": 7.3886, "step": 45 }, { "epoch": 0.01, - "grad_norm": 21.592117309570312, - "learning_rate": 1.9909792522802445e-05, - "loss": 4.8101, + "grad_norm": 113.60519409179688, + "learning_rate": 1.9962347822449065e-05, + "loss": 6.6999, "step": 46 }, { "epoch": 0.01, - "grad_norm": 19.77571678161621, - "learning_rate": 1.990778791219806e-05, - "loss": 3.4436, + "grad_norm": 213.7831268310547, + "learning_rate": 1.996151110739238e-05, + "loss": 8.2679, "step": 47 }, { "epoch": 0.01, - "grad_norm": 47.20704650878906, - "learning_rate": 1.9905783301593665e-05, - "loss": 4.077, + "grad_norm": 60.55680847167969, + "learning_rate": 1.9960674392335692e-05, + "loss": 7.006, "step": 48 }, { "epoch": 0.01, - "grad_norm": 21.739601135253906, - "learning_rate": 1.9903778690989276e-05, - "loss": 4.5314, + "grad_norm": 70.07861328125, + "learning_rate": 1.9959837677279003e-05, + "loss": 7.2405, "step": 49 }, { - "epoch": 0.02, - "grad_norm": 22.48163604736328, - "learning_rate": 1.9901774080384886e-05, - "loss": 4.1253, + "epoch": 0.01, + "grad_norm": 77.42964172363281, + "learning_rate": 1.9959000962222316e-05, + "loss": 5.8274, "step": 50 }, { - "epoch": 0.02, - "grad_norm": 26.019960403442383, - "learning_rate": 1.9899769469780496e-05, - "loss": 4.4319, + "epoch": 0.01, + "grad_norm": 97.56826782226562, + "learning_rate": 1.995816424716563e-05, + "loss": 6.531, "step": 51 }, { - "epoch": 0.02, - "grad_norm": 21.49262046813965, - "learning_rate": 1.9897764859176106e-05, - "loss": 4.7591, + "epoch": 0.01, + "grad_norm": 378.0497131347656, + "learning_rate": 1.995732753210894e-05, + "loss": 6.7833, "step": 52 }, { - "epoch": 0.02, - "grad_norm": 21.852209091186523, - "learning_rate": 1.9895760248571716e-05, - "loss": 4.5679, + "epoch": 0.01, + "grad_norm": 121.92767333984375, + "learning_rate": 1.9956490817052254e-05, + "loss": 6.7799, "step": 53 }, { - "epoch": 0.02, - "grad_norm": 17.225950241088867, - "learning_rate": 1.9893755637967326e-05, - "loss": 3.5482, + "epoch": 0.01, + "grad_norm": 85.76437377929688, + "learning_rate": 1.9955654101995567e-05, + "loss": 5.3537, "step": 54 }, { - "epoch": 0.02, - "grad_norm": 17.815263748168945, - "learning_rate": 1.9891751027362936e-05, - "loss": 3.6293, + "epoch": 0.01, + "grad_norm": 40.82673645019531, + "learning_rate": 1.995481738693888e-05, + "loss": 6.3451, "step": 55 }, { - "epoch": 0.02, - "grad_norm": 19.188623428344727, - "learning_rate": 1.9889746416758546e-05, - "loss": 4.1118, + "epoch": 0.01, + "grad_norm": 82.32181549072266, + "learning_rate": 1.995398067188219e-05, + "loss": 6.1245, "step": 56 }, { - "epoch": 0.02, - "grad_norm": 21.920475006103516, - "learning_rate": 1.9887741806154156e-05, - "loss": 3.8108, + "epoch": 0.01, + "grad_norm": 84.50050354003906, + "learning_rate": 1.9953143956825505e-05, + "loss": 6.2817, "step": 57 }, { - "epoch": 0.02, - "grad_norm": 24.557518005371094, - "learning_rate": 1.9885737195549766e-05, - "loss": 3.8317, + "epoch": 0.01, + "grad_norm": 89.99356842041016, + "learning_rate": 1.995230724176882e-05, + "loss": 7.0404, "step": 58 }, { - "epoch": 0.02, - "grad_norm": 90.77320098876953, - "learning_rate": 1.9883732584945376e-05, - "loss": 5.1687, + "epoch": 0.01, + "grad_norm": 40.455955505371094, + "learning_rate": 1.995147052671213e-05, + "loss": 5.1922, "step": 59 }, { - "epoch": 0.02, - "grad_norm": 25.83659553527832, - "learning_rate": 1.9881727974340986e-05, - "loss": 5.0576, + "epoch": 0.01, + "grad_norm": 99.8443603515625, + "learning_rate": 1.9950633811655443e-05, + "loss": 7.4731, "step": 60 }, { - "epoch": 0.02, - "grad_norm": 19.337127685546875, - "learning_rate": 1.9879723363736597e-05, - "loss": 3.364, + "epoch": 0.01, + "grad_norm": 67.06603240966797, + "learning_rate": 1.9949797096598756e-05, + "loss": 7.7414, "step": 61 }, { - "epoch": 0.02, - "grad_norm": 19.83724594116211, - "learning_rate": 1.9877718753132203e-05, - "loss": 3.5426, + "epoch": 0.01, + "grad_norm": 35.99038314819336, + "learning_rate": 1.994896038154207e-05, + "loss": 5.406, "step": 62 }, { - "epoch": 0.02, - "grad_norm": 14.76127815246582, - "learning_rate": 1.9875714142527817e-05, - "loss": 3.8213, + "epoch": 0.01, + "grad_norm": 74.65876770019531, + "learning_rate": 1.994812366648538e-05, + "loss": 6.6748, "step": 63 }, { - "epoch": 0.02, - "grad_norm": 21.111013412475586, - "learning_rate": 1.9873709531923427e-05, - "loss": 3.4596, + "epoch": 0.01, + "grad_norm": 49.295806884765625, + "learning_rate": 1.9947286951428694e-05, + "loss": 5.6246, "step": 64 }, { - "epoch": 0.02, - "grad_norm": 63.46014404296875, - "learning_rate": 1.9871704921319033e-05, - "loss": 4.4058, + "epoch": 0.01, + "grad_norm": 1596.51708984375, + "learning_rate": 1.9946450236372007e-05, + "loss": 6.601, "step": 65 }, { - "epoch": 0.02, - "grad_norm": 19.21253776550293, - "learning_rate": 1.9869700310714647e-05, - "loss": 3.8774, + "epoch": 0.01, + "grad_norm": 43.59678649902344, + "learning_rate": 1.9945613521315318e-05, + "loss": 5.112, "step": 66 }, { - "epoch": 0.02, - "grad_norm": 23.148107528686523, - "learning_rate": 1.9867695700110254e-05, - "loss": 3.9923, + "epoch": 0.01, + "grad_norm": 112.09934997558594, + "learning_rate": 1.994477680625863e-05, + "loss": 5.9346, "step": 67 }, { - "epoch": 0.02, - "grad_norm": 25.991294860839844, - "learning_rate": 1.9865691089505864e-05, - "loss": 4.5205, + "epoch": 0.01, + "grad_norm": 44.990169525146484, + "learning_rate": 1.994394009120194e-05, + "loss": 3.9908, "step": 68 }, { - "epoch": 0.02, - "grad_norm": 18.949478149414062, - "learning_rate": 1.9863686478901477e-05, - "loss": 4.678, + "epoch": 0.01, + "grad_norm": 43.052791595458984, + "learning_rate": 1.9943103376145255e-05, + "loss": 4.6972, "step": 69 }, { - "epoch": 0.02, - "grad_norm": 17.614856719970703, - "learning_rate": 1.9861681868297084e-05, - "loss": 3.758, + "epoch": 0.01, + "grad_norm": 66.66222381591797, + "learning_rate": 1.994226666108857e-05, + "loss": 4.5402, "step": 70 }, { - "epoch": 0.02, - "grad_norm": 20.783035278320312, - "learning_rate": 1.9859677257692694e-05, - "loss": 4.5238, + "epoch": 0.01, + "grad_norm": 433.1274108886719, + "learning_rate": 1.994142994603188e-05, + "loss": 6.3487, "step": 71 }, { - "epoch": 0.02, - "grad_norm": 27.744810104370117, - "learning_rate": 1.9857672647088304e-05, - "loss": 4.7011, + "epoch": 0.01, + "grad_norm": 35.271053314208984, + "learning_rate": 1.9940593230975193e-05, + "loss": 3.7392, "step": 72 }, { - "epoch": 0.02, - "grad_norm": 22.72287368774414, - "learning_rate": 1.9855668036483914e-05, - "loss": 3.6715, + "epoch": 0.01, + "grad_norm": 55.31417465209961, + "learning_rate": 1.9939756515918503e-05, + "loss": 5.8701, "step": 73 }, { - "epoch": 0.02, - "grad_norm": 23.866741180419922, - "learning_rate": 1.9853663425879524e-05, - "loss": 5.4669, + "epoch": 0.01, + "grad_norm": 434.6205139160156, + "learning_rate": 1.9938919800861817e-05, + "loss": 6.7362, "step": 74 }, { - "epoch": 0.02, - "grad_norm": 16.936256408691406, - "learning_rate": 1.9851658815275134e-05, - "loss": 3.3014, + "epoch": 0.01, + "grad_norm": 47.17160415649414, + "learning_rate": 1.993808308580513e-05, + "loss": 4.5174, "step": 75 }, { - "epoch": 0.02, - "grad_norm": 22.439456939697266, - "learning_rate": 1.9849654204670744e-05, - "loss": 3.202, + "epoch": 0.01, + "grad_norm": 320.0890197753906, + "learning_rate": 1.9937246370748444e-05, + "loss": 6.1796, "step": 76 }, { - "epoch": 0.02, - "grad_norm": 16.06572723388672, - "learning_rate": 1.9847649594066354e-05, - "loss": 3.3253, + "epoch": 0.01, + "grad_norm": 50.07407760620117, + "learning_rate": 1.9936409655691754e-05, + "loss": 3.736, "step": 77 }, { - "epoch": 0.02, - "grad_norm": 29.42819595336914, - "learning_rate": 1.9845644983461965e-05, - "loss": 4.1956, + "epoch": 0.01, + "grad_norm": 53.458953857421875, + "learning_rate": 1.9935572940635068e-05, + "loss": 5.0562, "step": 78 }, { - "epoch": 0.02, - "grad_norm": 20.668291091918945, - "learning_rate": 1.9843640372857575e-05, - "loss": 4.4557, + "epoch": 0.01, + "grad_norm": 67.62255096435547, + "learning_rate": 1.993473622557838e-05, + "loss": 6.4006, "step": 79 }, { - "epoch": 0.02, - "grad_norm": 21.241540908813477, - "learning_rate": 1.9841635762253185e-05, - "loss": 2.7819, + "epoch": 0.01, + "grad_norm": 144.18785095214844, + "learning_rate": 1.9933899510521692e-05, + "loss": 4.4056, "step": 80 }, { - "epoch": 0.02, - "grad_norm": 39.1732292175293, - "learning_rate": 1.9839631151648795e-05, - "loss": 4.044, + "epoch": 0.01, + "grad_norm": 207.25999450683594, + "learning_rate": 1.9933062795465005e-05, + "loss": 7.6115, "step": 81 }, { - "epoch": 0.02, - "grad_norm": 17.961164474487305, - "learning_rate": 1.9837626541044405e-05, - "loss": 4.0641, + "epoch": 0.01, + "grad_norm": 152.46791076660156, + "learning_rate": 1.993222608040832e-05, + "loss": 6.7522, "step": 82 }, { - "epoch": 0.02, - "grad_norm": 23.27247428894043, - "learning_rate": 1.9835621930440015e-05, - "loss": 4.1457, + "epoch": 0.01, + "grad_norm": 50.106224060058594, + "learning_rate": 1.9931389365351633e-05, + "loss": 5.4387, "step": 83 }, { - "epoch": 0.03, - "grad_norm": 33.87324523925781, - "learning_rate": 1.983361731983562e-05, - "loss": 4.4233, + "epoch": 0.01, + "grad_norm": 46.966224670410156, + "learning_rate": 1.9930552650294943e-05, + "loss": 4.5016, "step": 84 }, { - "epoch": 0.03, - "grad_norm": 27.01508903503418, - "learning_rate": 1.9831612709231235e-05, - "loss": 3.5391, + "epoch": 0.01, + "grad_norm": 46.80181121826172, + "learning_rate": 1.9929715935238257e-05, + "loss": 6.9584, "step": 85 }, { - "epoch": 0.03, - "grad_norm": 18.453147888183594, - "learning_rate": 1.9829608098626842e-05, - "loss": 3.8098, + "epoch": 0.01, + "grad_norm": 167.78543090820312, + "learning_rate": 1.992887922018157e-05, + "loss": 5.6751, "step": 86 }, { - "epoch": 0.03, - "grad_norm": 551.41259765625, - "learning_rate": 1.9827603488022452e-05, - "loss": 3.3489, + "epoch": 0.01, + "grad_norm": 222.9422149658203, + "learning_rate": 1.992804250512488e-05, + "loss": 5.5146, "step": 87 }, { - "epoch": 0.03, - "grad_norm": 33.0008544921875, - "learning_rate": 1.9825598877418065e-05, - "loss": 3.8738, + "epoch": 0.01, + "grad_norm": 46.92573165893555, + "learning_rate": 1.9927205790068194e-05, + "loss": 4.6874, "step": 88 }, { - "epoch": 0.03, - "grad_norm": 80.79717254638672, - "learning_rate": 1.9823594266813672e-05, - "loss": 3.9937, + "epoch": 0.01, + "grad_norm": 246.22035217285156, + "learning_rate": 1.9926369075011508e-05, + "loss": 5.9676, "step": 89 }, { - "epoch": 0.03, - "grad_norm": 20.0161075592041, - "learning_rate": 1.9821589656209282e-05, - "loss": 3.2797, + "epoch": 0.01, + "grad_norm": 57.03669738769531, + "learning_rate": 1.992553235995482e-05, + "loss": 5.1982, "step": 90 }, { - "epoch": 0.03, - "grad_norm": 25.17633056640625, - "learning_rate": 1.9819585045604892e-05, - "loss": 3.3241, + "epoch": 0.01, + "grad_norm": 51.081966400146484, + "learning_rate": 1.9924695644898132e-05, + "loss": 3.6639, "step": 91 }, { - "epoch": 0.03, - "grad_norm": 50.05910873413086, - "learning_rate": 1.9817580435000502e-05, - "loss": 5.4063, + "epoch": 0.01, + "grad_norm": 31.671546936035156, + "learning_rate": 1.9923858929841445e-05, + "loss": 3.5103, "step": 92 }, { - "epoch": 0.03, - "grad_norm": 19.56489372253418, - "learning_rate": 1.9815575824396112e-05, - "loss": 3.3133, + "epoch": 0.01, + "grad_norm": 47.267677307128906, + "learning_rate": 1.992302221478476e-05, + "loss": 3.8276, "step": 93 }, { - "epoch": 0.03, - "grad_norm": 66.2906265258789, - "learning_rate": 1.9813571213791723e-05, - "loss": 4.3678, + "epoch": 0.01, + "grad_norm": 74.63932800292969, + "learning_rate": 1.992218549972807e-05, + "loss": 6.8494, "step": 94 }, { - "epoch": 0.03, - "grad_norm": 17.404077529907227, - "learning_rate": 1.9811566603187333e-05, - "loss": 3.7368, + "epoch": 0.01, + "grad_norm": 68.33775329589844, + "learning_rate": 1.9921348784671383e-05, + "loss": 5.1167, "step": 95 }, { - "epoch": 0.03, - "grad_norm": 17.154563903808594, - "learning_rate": 1.9809561992582943e-05, - "loss": 3.6814, + "epoch": 0.01, + "grad_norm": 40.34266662597656, + "learning_rate": 1.9920512069614693e-05, + "loss": 4.5064, "step": 96 }, { - "epoch": 0.03, - "grad_norm": 21.113162994384766, - "learning_rate": 1.9807557381978553e-05, - "loss": 4.3332, + "epoch": 0.01, + "grad_norm": 219.28070068359375, + "learning_rate": 1.9919675354558007e-05, + "loss": 3.9197, "step": 97 }, { - "epoch": 0.03, - "grad_norm": 35.284725189208984, - "learning_rate": 1.9805552771374163e-05, - "loss": 3.2572, + "epoch": 0.01, + "grad_norm": 162.0477752685547, + "learning_rate": 1.991883863950132e-05, + "loss": 5.7777, "step": 98 }, { - "epoch": 0.03, - "grad_norm": 19.46085548400879, - "learning_rate": 1.9803548160769773e-05, - "loss": 3.5422, + "epoch": 0.01, + "grad_norm": 35.64801788330078, + "learning_rate": 1.991800192444463e-05, + "loss": 5.6309, "step": 99 }, { - "epoch": 0.03, - "grad_norm": 24.549236297607422, - "learning_rate": 1.9801543550165383e-05, - "loss": 3.071, + "epoch": 0.01, + "grad_norm": 164.5700225830078, + "learning_rate": 1.9917165209387944e-05, + "loss": 5.6857, "step": 100 }, { - "epoch": 0.03, - "grad_norm": 24.21219253540039, - "learning_rate": 1.9799538939560993e-05, - "loss": 3.7529, + "epoch": 0.01, + "grad_norm": 42.44116973876953, + "learning_rate": 1.9916328494331255e-05, + "loss": 5.1935, "step": 101 }, { - "epoch": 0.03, - "grad_norm": 19.04281234741211, - "learning_rate": 1.9797534328956603e-05, - "loss": 3.5407, + "epoch": 0.01, + "grad_norm": 30.379470825195312, + "learning_rate": 1.991549177927457e-05, + "loss": 4.7206, "step": 102 }, { - "epoch": 0.03, - "grad_norm": 46.35712814331055, - "learning_rate": 1.979552971835221e-05, - "loss": 3.0938, + "epoch": 0.01, + "grad_norm": 56.13416290283203, + "learning_rate": 1.9914655064217882e-05, + "loss": 5.3201, "step": 103 }, { - "epoch": 0.03, - "grad_norm": 32.1992073059082, - "learning_rate": 1.9793525107747823e-05, - "loss": 4.1555, + "epoch": 0.01, + "grad_norm": 57.6418571472168, + "learning_rate": 1.9913818349161196e-05, + "loss": 3.8999, "step": 104 }, { - "epoch": 0.03, - "grad_norm": 18.388561248779297, - "learning_rate": 1.979152049714343e-05, - "loss": 3.25, + "epoch": 0.01, + "grad_norm": 70.00899505615234, + "learning_rate": 1.9912981634104506e-05, + "loss": 4.046, "step": 105 }, { - "epoch": 0.03, - "grad_norm": 25.422645568847656, - "learning_rate": 1.978951588653904e-05, - "loss": 3.7947, + "epoch": 0.01, + "grad_norm": 41.35757064819336, + "learning_rate": 1.991214491904782e-05, + "loss": 6.1811, "step": 106 }, { - "epoch": 0.03, - "grad_norm": 20.530521392822266, - "learning_rate": 1.9787511275934654e-05, - "loss": 3.3529, + "epoch": 0.01, + "grad_norm": 92.74214172363281, + "learning_rate": 1.9911308203991133e-05, + "loss": 4.667, "step": 107 }, { - "epoch": 0.03, - "grad_norm": 40.206138610839844, - "learning_rate": 1.978550666533026e-05, - "loss": 4.6079, + "epoch": 0.01, + "grad_norm": 64.18307495117188, + "learning_rate": 1.9910471488934443e-05, + "loss": 6.3902, "step": 108 }, { - "epoch": 0.03, - "grad_norm": 43.84805679321289, - "learning_rate": 1.978350205472587e-05, - "loss": 3.6345, + "epoch": 0.01, + "grad_norm": 38.1926383972168, + "learning_rate": 1.9909634773877757e-05, + "loss": 6.1301, "step": 109 }, { - "epoch": 0.03, - "grad_norm": 26.29327392578125, - "learning_rate": 1.978149744412148e-05, - "loss": 4.9385, + "epoch": 0.01, + "grad_norm": 89.6794662475586, + "learning_rate": 1.990879805882107e-05, + "loss": 4.9284, "step": 110 }, { - "epoch": 0.03, - "grad_norm": 24.576719284057617, - "learning_rate": 1.977949283351709e-05, - "loss": 3.7024, + "epoch": 0.01, + "grad_norm": 52.6729621887207, + "learning_rate": 1.9907961343764384e-05, + "loss": 5.0203, "step": 111 }, { - "epoch": 0.03, - "grad_norm": 21.797935485839844, - "learning_rate": 1.97774882229127e-05, - "loss": 4.0716, + "epoch": 0.01, + "grad_norm": 96.23528289794922, + "learning_rate": 1.9907124628707695e-05, + "loss": 3.5231, "step": 112 }, { - "epoch": 0.03, - "grad_norm": 51.30803298950195, - "learning_rate": 1.977548361230831e-05, - "loss": 4.4803, + "epoch": 0.01, + "grad_norm": 51.27971649169922, + "learning_rate": 1.990628791365101e-05, + "loss": 3.9009, "step": 113 }, { - "epoch": 0.03, - "grad_norm": 17.133638381958008, - "learning_rate": 1.977347900170392e-05, - "loss": 2.996, + "epoch": 0.01, + "grad_norm": 55.0560302734375, + "learning_rate": 1.9905451198594322e-05, + "loss": 5.9776, "step": 114 }, { - "epoch": 0.03, - "grad_norm": 23.30716323852539, - "learning_rate": 1.977147439109953e-05, - "loss": 3.6316, + "epoch": 0.01, + "grad_norm": 28.32353401184082, + "learning_rate": 1.9904614483537632e-05, + "loss": 5.6077, "step": 115 }, { - "epoch": 0.03, - "grad_norm": 21.08536720275879, - "learning_rate": 1.976946978049514e-05, - "loss": 3.2101, + "epoch": 0.01, + "grad_norm": 31.120115280151367, + "learning_rate": 1.9903777768480946e-05, + "loss": 4.6517, "step": 116 }, { - "epoch": 0.04, - "grad_norm": 40.39435577392578, - "learning_rate": 1.9767465169890748e-05, - "loss": 3.9427, + "epoch": 0.01, + "grad_norm": 41.79515838623047, + "learning_rate": 1.990294105342426e-05, + "loss": 3.8836, "step": 117 }, { - "epoch": 0.04, - "grad_norm": 20.827363967895508, - "learning_rate": 1.976546055928636e-05, - "loss": 3.7911, + "epoch": 0.01, + "grad_norm": 47.09407043457031, + "learning_rate": 1.9902104338367573e-05, + "loss": 5.2293, "step": 118 }, { - "epoch": 0.04, - "grad_norm": 31.05581283569336, - "learning_rate": 1.976345594868197e-05, - "loss": 4.0686, + "epoch": 0.01, + "grad_norm": 56.67197799682617, + "learning_rate": 1.9901267623310883e-05, + "loss": 5.4069, "step": 119 }, { - "epoch": 0.04, - "grad_norm": 75.66057586669922, - "learning_rate": 1.976145133807758e-05, - "loss": 3.659, - "step": 120 - }, - { - "epoch": 0.04, - "eval_loss": 1.6635291576385498, - "eval_runtime": 43.6414, - "eval_samples_per_second": 33.89, - "eval_steps_per_second": 33.89, + "epoch": 0.02, + "grad_norm": 58.246856689453125, + "learning_rate": 1.9900430908254197e-05, + "loss": 5.2024, "step": 120 }, { - "epoch": 0.04, - "grad_norm": 14.022979736328125, - "learning_rate": 1.975944672747319e-05, - "loss": 3.0417, + "epoch": 0.02, + "grad_norm": 54.86061477661133, + "learning_rate": 1.9899594193197507e-05, + "loss": 3.8366, "step": 121 }, { - "epoch": 0.04, - "grad_norm": 38.61516189575195, - "learning_rate": 1.9757442116868798e-05, - "loss": 4.3155, + "epoch": 0.02, + "grad_norm": 70.79507446289062, + "learning_rate": 1.989875747814082e-05, + "loss": 4.7176, "step": 122 }, { - "epoch": 0.04, - "grad_norm": 19.140295028686523, - "learning_rate": 1.975543750626441e-05, - "loss": 4.6368, + "epoch": 0.02, + "grad_norm": 155.59671020507812, + "learning_rate": 1.9897920763084135e-05, + "loss": 4.8716, "step": 123 }, { - "epoch": 0.04, - "grad_norm": 24.628950119018555, - "learning_rate": 1.975343289566002e-05, - "loss": 4.131, + "epoch": 0.02, + "grad_norm": 60.553348541259766, + "learning_rate": 1.9897084048027445e-05, + "loss": 6.3457, "step": 124 }, { - "epoch": 0.04, - "grad_norm": 32.278045654296875, - "learning_rate": 1.975142828505563e-05, - "loss": 4.4181, + "epoch": 0.02, + "grad_norm": 29.441686630249023, + "learning_rate": 1.989624733297076e-05, + "loss": 7.1413, "step": 125 }, { - "epoch": 0.04, - "grad_norm": 25.219432830810547, - "learning_rate": 1.9749423674451242e-05, - "loss": 3.8382, + "epoch": 0.02, + "grad_norm": 77.06675720214844, + "learning_rate": 1.989541061791407e-05, + "loss": 5.7867, "step": 126 }, { - "epoch": 0.04, - "grad_norm": 20.332189559936523, - "learning_rate": 1.974741906384685e-05, - "loss": 2.7729, + "epoch": 0.02, + "grad_norm": 84.96646881103516, + "learning_rate": 1.9894573902857382e-05, + "loss": 5.201, "step": 127 }, { - "epoch": 0.04, - "grad_norm": 24.03636932373047, - "learning_rate": 1.974541445324246e-05, - "loss": 3.4281, + "epoch": 0.02, + "grad_norm": 27.29684066772461, + "learning_rate": 1.9893737187800696e-05, + "loss": 4.5, "step": 128 }, { - "epoch": 0.04, - "grad_norm": 48.14682388305664, - "learning_rate": 1.974340984263807e-05, - "loss": 3.3327, + "epoch": 0.02, + "grad_norm": 69.64246368408203, + "learning_rate": 1.9892900472744006e-05, + "loss": 5.7104, "step": 129 }, { - "epoch": 0.04, - "grad_norm": 22.612560272216797, - "learning_rate": 1.974140523203368e-05, - "loss": 2.8426, + "epoch": 0.02, + "grad_norm": 26.5671443939209, + "learning_rate": 1.989206375768732e-05, + "loss": 7.3485, "step": 130 }, { - "epoch": 0.04, - "grad_norm": 18.185649871826172, - "learning_rate": 1.973940062142929e-05, - "loss": 3.8089, + "epoch": 0.02, + "grad_norm": 28.373462677001953, + "learning_rate": 1.9891227042630634e-05, + "loss": 4.5569, "step": 131 }, { - "epoch": 0.04, - "grad_norm": 39.714744567871094, - "learning_rate": 1.97373960108249e-05, - "loss": 3.5672, + "epoch": 0.02, + "grad_norm": 32.1534309387207, + "learning_rate": 1.9890390327573947e-05, + "loss": 4.7846, "step": 132 }, { - "epoch": 0.04, - "grad_norm": 63.07310485839844, - "learning_rate": 1.973539140022051e-05, - "loss": 3.1297, + "epoch": 0.02, + "grad_norm": 51.85831832885742, + "learning_rate": 1.9889553612517258e-05, + "loss": 4.2173, "step": 133 }, { - "epoch": 0.04, - "grad_norm": 22.210826873779297, - "learning_rate": 1.973338678961612e-05, - "loss": 5.2622, + "epoch": 0.02, + "grad_norm": 25.660507202148438, + "learning_rate": 1.988871689746057e-05, + "loss": 3.8976, "step": 134 }, { - "epoch": 0.04, - "grad_norm": 15.51650619506836, - "learning_rate": 1.973138217901173e-05, - "loss": 3.0659, + "epoch": 0.02, + "grad_norm": 33.85279083251953, + "learning_rate": 1.9887880182403885e-05, + "loss": 4.4822, "step": 135 }, { - "epoch": 0.04, - "grad_norm": 20.847930908203125, - "learning_rate": 1.9729377568407336e-05, - "loss": 3.1732, + "epoch": 0.02, + "grad_norm": 43.09807205200195, + "learning_rate": 1.9887043467347195e-05, + "loss": 3.6526, "step": 136 }, { - "epoch": 0.04, - "grad_norm": 18.530948638916016, - "learning_rate": 1.972737295780295e-05, - "loss": 2.7667, + "epoch": 0.02, + "grad_norm": 36.18055725097656, + "learning_rate": 1.988620675229051e-05, + "loss": 4.2724, "step": 137 }, { - "epoch": 0.04, - "grad_norm": 47.08240509033203, - "learning_rate": 1.972536834719856e-05, - "loss": 2.5596, + "epoch": 0.02, + "grad_norm": 82.61565399169922, + "learning_rate": 1.9885370037233822e-05, + "loss": 4.4369, "step": 138 }, { - "epoch": 0.04, - "grad_norm": 26.97058868408203, - "learning_rate": 1.9723363736594166e-05, - "loss": 3.129, + "epoch": 0.02, + "grad_norm": 37.68891906738281, + "learning_rate": 1.9884533322177136e-05, + "loss": 5.8417, "step": 139 }, { - "epoch": 0.04, - "grad_norm": 20.2777099609375, - "learning_rate": 1.972135912598978e-05, - "loss": 3.4919, + "epoch": 0.02, + "grad_norm": 28.7679443359375, + "learning_rate": 1.9883696607120446e-05, + "loss": 6.1488, "step": 140 }, { - "epoch": 0.04, - "grad_norm": 21.72832679748535, - "learning_rate": 1.9719354515385386e-05, - "loss": 4.1341, + "epoch": 0.02, + "grad_norm": 55.896026611328125, + "learning_rate": 1.988285989206376e-05, + "loss": 5.2888, "step": 141 }, { - "epoch": 0.04, - "grad_norm": 17.30972671508789, - "learning_rate": 1.9717349904780996e-05, - "loss": 3.2444, + "epoch": 0.02, + "grad_norm": 28.051551818847656, + "learning_rate": 1.9882023177007074e-05, + "loss": 3.9439, "step": 142 }, { - "epoch": 0.04, - "grad_norm": 49.03208923339844, - "learning_rate": 1.971534529417661e-05, - "loss": 4.314, + "epoch": 0.02, + "grad_norm": 40.94978713989258, + "learning_rate": 1.9881186461950384e-05, + "loss": 3.9018, "step": 143 }, { - "epoch": 0.04, - "grad_norm": 26.420650482177734, - "learning_rate": 1.9713340683572217e-05, - "loss": 3.5411, + "epoch": 0.02, + "grad_norm": 37.6737060546875, + "learning_rate": 1.9880349746893698e-05, + "loss": 4.2534, "step": 144 }, { - "epoch": 0.04, - "grad_norm": 25.890357971191406, - "learning_rate": 1.9711336072967827e-05, - "loss": 3.741, + "epoch": 0.02, + "grad_norm": 31.931015014648438, + "learning_rate": 1.987951303183701e-05, + "loss": 5.6937, "step": 145 }, { - "epoch": 0.04, - "grad_norm": 22.66683578491211, - "learning_rate": 1.9709331462363437e-05, - "loss": 3.116, + "epoch": 0.02, + "grad_norm": 30.532413482666016, + "learning_rate": 1.9878676316780325e-05, + "loss": 3.8673, "step": 146 }, { - "epoch": 0.04, - "grad_norm": 31.934288024902344, - "learning_rate": 1.9707326851759047e-05, - "loss": 3.8739, + "epoch": 0.02, + "grad_norm": 29.377347946166992, + "learning_rate": 1.9877839601723635e-05, + "loss": 5.2521, "step": 147 }, { - "epoch": 0.04, - "grad_norm": 29.55501937866211, - "learning_rate": 1.9705322241154657e-05, - "loss": 3.7167, + "epoch": 0.02, + "grad_norm": 38.87949752807617, + "learning_rate": 1.987700288666695e-05, + "loss": 3.4218, "step": 148 }, { - "epoch": 0.04, - "grad_norm": 21.13062286376953, - "learning_rate": 1.9703317630550267e-05, - "loss": 3.0495, + "epoch": 0.02, + "grad_norm": 21.628009796142578, + "learning_rate": 1.987616617161026e-05, + "loss": 2.7079, "step": 149 }, { - "epoch": 0.05, - "grad_norm": 27.444517135620117, - "learning_rate": 1.9701313019945877e-05, - "loss": 3.2852, + "epoch": 0.02, + "grad_norm": 31.379051208496094, + "learning_rate": 1.9875329456553573e-05, + "loss": 5.888, "step": 150 }, { - "epoch": 0.05, - "grad_norm": 19.37506675720215, - "learning_rate": 1.9699308409341487e-05, - "loss": 5.2696, + "epoch": 0.02, + "grad_norm": 28.817869186401367, + "learning_rate": 1.9874492741496886e-05, + "loss": 3.8517, "step": 151 }, { - "epoch": 0.05, - "grad_norm": 49.605247497558594, - "learning_rate": 1.9697303798737097e-05, - "loss": 3.7375, + "epoch": 0.02, + "grad_norm": 24.719228744506836, + "learning_rate": 1.9873656026440197e-05, + "loss": 4.9138, "step": 152 }, { - "epoch": 0.05, - "grad_norm": 19.60191535949707, - "learning_rate": 1.9695299188132707e-05, - "loss": 3.6209, + "epoch": 0.02, + "grad_norm": 42.05404281616211, + "learning_rate": 1.987281931138351e-05, + "loss": 3.521, "step": 153 }, { - "epoch": 0.05, - "grad_norm": 18.86824607849121, - "learning_rate": 1.9693294577528317e-05, - "loss": 3.1842, + "epoch": 0.02, + "grad_norm": 28.471454620361328, + "learning_rate": 1.987198259632682e-05, + "loss": 4.5766, "step": 154 }, { - "epoch": 0.05, - "grad_norm": 30.93740463256836, - "learning_rate": 1.9691289966923928e-05, - "loss": 3.555, + "epoch": 0.02, + "grad_norm": 92.56484985351562, + "learning_rate": 1.9871145881270134e-05, + "loss": 6.5103, "step": 155 }, { - "epoch": 0.05, - "grad_norm": 24.22440528869629, - "learning_rate": 1.9689285356319538e-05, - "loss": 3.5906, + "epoch": 0.02, + "grad_norm": 28.869182586669922, + "learning_rate": 1.9870309166213448e-05, + "loss": 4.8127, "step": 156 }, { - "epoch": 0.05, - "grad_norm": 43.011539459228516, - "learning_rate": 1.9687280745715148e-05, - "loss": 4.5245, + "epoch": 0.02, + "grad_norm": 96.62332153320312, + "learning_rate": 1.9869472451156758e-05, + "loss": 5.0915, "step": 157 }, { - "epoch": 0.05, - "grad_norm": 24.332199096679688, - "learning_rate": 1.9685276135110754e-05, - "loss": 3.9531, + "epoch": 0.02, + "grad_norm": 83.90554809570312, + "learning_rate": 1.9868635736100072e-05, + "loss": 4.9871, "step": 158 }, { - "epoch": 0.05, - "grad_norm": 37.94855880737305, - "learning_rate": 1.9683271524506368e-05, - "loss": 4.7006, + "epoch": 0.02, + "grad_norm": 27.655275344848633, + "learning_rate": 1.9867799021043385e-05, + "loss": 1.8241, "step": 159 }, { - "epoch": 0.05, - "grad_norm": 20.99969482421875, - "learning_rate": 1.9681266913901975e-05, - "loss": 4.7367, + "epoch": 0.02, + "grad_norm": 24.46050453186035, + "learning_rate": 1.98669623059867e-05, + "loss": 4.7742, "step": 160 }, { - "epoch": 0.05, - "grad_norm": 34.952232360839844, - "learning_rate": 1.9679262303297585e-05, - "loss": 3.2787, + "epoch": 0.02, + "grad_norm": 27.145444869995117, + "learning_rate": 1.986612559093001e-05, + "loss": 5.0453, "step": 161 }, { - "epoch": 0.05, - "grad_norm": 25.392833709716797, - "learning_rate": 1.9677257692693198e-05, - "loss": 5.474, + "epoch": 0.02, + "grad_norm": 22.919443130493164, + "learning_rate": 1.9865288875873323e-05, + "loss": 2.4198, "step": 162 }, { - "epoch": 0.05, - "grad_norm": 20.959766387939453, - "learning_rate": 1.9675253082088805e-05, - "loss": 4.8595, + "epoch": 0.02, + "grad_norm": 24.94849395751953, + "learning_rate": 1.9864452160816637e-05, + "loss": 5.8624, "step": 163 }, { - "epoch": 0.05, - "grad_norm": 49.37668228149414, - "learning_rate": 1.9673248471484415e-05, - "loss": 4.2558, + "epoch": 0.02, + "grad_norm": 20.590097427368164, + "learning_rate": 1.9863615445759947e-05, + "loss": 3.993, "step": 164 }, { - "epoch": 0.05, - "grad_norm": 23.258516311645508, - "learning_rate": 1.9671243860880025e-05, - "loss": 4.362, + "epoch": 0.02, + "grad_norm": 15.112648963928223, + "learning_rate": 1.986277873070326e-05, + "loss": 3.5619, "step": 165 }, { - "epoch": 0.05, - "grad_norm": 23.190126419067383, - "learning_rate": 1.9669239250275635e-05, - "loss": 3.2228, + "epoch": 0.02, + "grad_norm": 56.81850051879883, + "learning_rate": 1.9861942015646574e-05, + "loss": 2.6144, "step": 166 }, { - "epoch": 0.05, - "grad_norm": 18.251544952392578, - "learning_rate": 1.9667234639671245e-05, - "loss": 4.7009, + "epoch": 0.02, + "grad_norm": 63.49656677246094, + "learning_rate": 1.9861105300589888e-05, + "loss": 3.4254, "step": 167 }, { - "epoch": 0.05, - "grad_norm": 19.142702102661133, - "learning_rate": 1.9665230029066855e-05, - "loss": 3.9582, + "epoch": 0.02, + "grad_norm": 52.94969940185547, + "learning_rate": 1.9860268585533198e-05, + "loss": 4.6606, "step": 168 }, { - "epoch": 0.05, - "grad_norm": 21.59228515625, - "learning_rate": 1.9663225418462465e-05, - "loss": 5.7372, + "epoch": 0.02, + "grad_norm": 48.12769317626953, + "learning_rate": 1.985943187047651e-05, + "loss": 4.2959, "step": 169 }, { - "epoch": 0.05, - "grad_norm": 55.695281982421875, - "learning_rate": 1.9661220807858075e-05, - "loss": 3.9148, + "epoch": 0.02, + "grad_norm": 31.31679916381836, + "learning_rate": 1.9858595155419825e-05, + "loss": 4.3399, "step": 170 }, { - "epoch": 0.05, - "grad_norm": 68.97881317138672, - "learning_rate": 1.9659216197253685e-05, - "loss": 4.3682, + "epoch": 0.02, + "grad_norm": 28.835838317871094, + "learning_rate": 1.9857758440363136e-05, + "loss": 6.7479, "step": 171 }, { - "epoch": 0.05, - "grad_norm": 20.77494239807129, - "learning_rate": 1.9657211586649296e-05, - "loss": 4.0729, + "epoch": 0.02, + "grad_norm": 26.881940841674805, + "learning_rate": 1.985692172530645e-05, + "loss": 5.6704, "step": 172 }, { - "epoch": 0.05, - "grad_norm": 48.90683364868164, - "learning_rate": 1.9655206976044906e-05, - "loss": 3.9279, + "epoch": 0.02, + "grad_norm": 29.435422897338867, + "learning_rate": 1.9856085010249763e-05, + "loss": 3.5854, "step": 173 }, { - "epoch": 0.05, - "grad_norm": 27.227113723754883, - "learning_rate": 1.9653202365440516e-05, - "loss": 4.6083, + "epoch": 0.02, + "grad_norm": 28.923675537109375, + "learning_rate": 1.9855248295193073e-05, + "loss": 5.2268, "step": 174 }, { - "epoch": 0.05, - "grad_norm": 25.539125442504883, - "learning_rate": 1.9651197754836126e-05, - "loss": 3.8157, + "epoch": 0.02, + "grad_norm": 50.01896667480469, + "learning_rate": 1.9854411580136387e-05, + "loss": 2.6085, "step": 175 }, { - "epoch": 0.05, - "grad_norm": 21.638704299926758, - "learning_rate": 1.9649193144231736e-05, - "loss": 3.8649, + "epoch": 0.02, + "grad_norm": 34.73915100097656, + "learning_rate": 1.98535748650797e-05, + "loss": 4.8505, "step": 176 }, { - "epoch": 0.05, - "grad_norm": 29.82769775390625, - "learning_rate": 1.9647188533627343e-05, - "loss": 4.5611, + "epoch": 0.02, + "grad_norm": 96.74727630615234, + "learning_rate": 1.985273815002301e-05, + "loss": 5.0014, "step": 177 }, { - "epoch": 0.05, - "grad_norm": 22.91179656982422, - "learning_rate": 1.9645183923022956e-05, - "loss": 3.083, + "epoch": 0.02, + "grad_norm": 32.39076232910156, + "learning_rate": 1.9851901434966324e-05, + "loss": 6.2986, "step": 178 }, { - "epoch": 0.05, - "grad_norm": 19.676715850830078, - "learning_rate": 1.9643179312418563e-05, - "loss": 3.8661, + "epoch": 0.02, + "grad_norm": 20.07256317138672, + "learning_rate": 1.9851064719909635e-05, + "loss": 4.4205, "step": 179 }, { - "epoch": 0.05, - "grad_norm": 18.871702194213867, - "learning_rate": 1.9641174701814173e-05, - "loss": 3.8232, + "epoch": 0.02, + "grad_norm": 20.149194717407227, + "learning_rate": 1.9850228004852948e-05, + "loss": 4.715, "step": 180 }, { - "epoch": 0.05, - "grad_norm": 33.633079528808594, - "learning_rate": 1.9639170091209786e-05, - "loss": 3.9483, + "epoch": 0.02, + "grad_norm": 34.583003997802734, + "learning_rate": 1.9849391289796262e-05, + "loss": 6.2908, "step": 181 }, { - "epoch": 0.05, - "grad_norm": 17.3801212310791, - "learning_rate": 1.9637165480605393e-05, - "loss": 3.4511, + "epoch": 0.02, + "grad_norm": 15.587189674377441, + "learning_rate": 1.9848554574739572e-05, + "loss": 2.9253, "step": 182 }, { - "epoch": 0.06, - "grad_norm": 30.476238250732422, - "learning_rate": 1.9635160870001003e-05, - "loss": 4.5803, + "epoch": 0.02, + "grad_norm": 30.812002182006836, + "learning_rate": 1.9847717859682886e-05, + "loss": 4.1018, "step": 183 }, { - "epoch": 0.06, - "grad_norm": 20.700347900390625, - "learning_rate": 1.9633156259396613e-05, - "loss": 3.6991, + "epoch": 0.02, + "grad_norm": 22.826852798461914, + "learning_rate": 1.98468811446262e-05, + "loss": 2.9077, "step": 184 }, { - "epoch": 0.06, - "grad_norm": 16.463884353637695, - "learning_rate": 1.9631151648792223e-05, - "loss": 3.2284, + "epoch": 0.02, + "grad_norm": 92.27213287353516, + "learning_rate": 1.984604442956951e-05, + "loss": 4.9038, "step": 185 }, { - "epoch": 0.06, - "grad_norm": 22.347118377685547, - "learning_rate": 1.9629147038187833e-05, - "loss": 4.603, + "epoch": 0.02, + "grad_norm": 34.23793029785156, + "learning_rate": 1.9845207714512823e-05, + "loss": 4.782, "step": 186 }, { - "epoch": 0.06, - "grad_norm": 22.56680679321289, - "learning_rate": 1.9627142427583443e-05, - "loss": 4.332, + "epoch": 0.02, + "grad_norm": 40.07436752319336, + "learning_rate": 1.9844370999456137e-05, + "loss": 6.2297, "step": 187 }, { - "epoch": 0.06, - "grad_norm": 31.80207061767578, - "learning_rate": 1.9625137816979054e-05, - "loss": 3.8008, + "epoch": 0.02, + "grad_norm": 38.16945266723633, + "learning_rate": 1.9843534284399447e-05, + "loss": 6.1175, "step": 188 }, { - "epoch": 0.06, - "grad_norm": 22.534713745117188, - "learning_rate": 1.9623133206374664e-05, - "loss": 3.7199, + "epoch": 0.02, + "grad_norm": 38.583229064941406, + "learning_rate": 1.984269756934276e-05, + "loss": 3.8736, "step": 189 }, { - "epoch": 0.06, - "grad_norm": 22.314205169677734, - "learning_rate": 1.9621128595770274e-05, - "loss": 4.3184, + "epoch": 0.02, + "grad_norm": 36.794189453125, + "learning_rate": 1.9841860854286075e-05, + "loss": 4.1338, "step": 190 }, { - "epoch": 0.06, - "grad_norm": 24.710988998413086, - "learning_rate": 1.9619123985165884e-05, - "loss": 4.7067, + "epoch": 0.02, + "grad_norm": 37.98091125488281, + "learning_rate": 1.9841024139229388e-05, + "loss": 5.0907, "step": 191 }, { - "epoch": 0.06, - "grad_norm": 34.565242767333984, - "learning_rate": 1.9617119374561494e-05, - "loss": 3.0013, + "epoch": 0.02, + "grad_norm": 21.34034538269043, + "learning_rate": 1.98401874241727e-05, + "loss": 3.6487, "step": 192 }, { - "epoch": 0.06, - "grad_norm": 21.252920150756836, - "learning_rate": 1.9615114763957104e-05, - "loss": 5.5017, + "epoch": 0.02, + "grad_norm": 38.78040313720703, + "learning_rate": 1.9839350709116012e-05, + "loss": 3.5643, "step": 193 }, { - "epoch": 0.06, - "grad_norm": 18.080705642700195, - "learning_rate": 1.9613110153352714e-05, - "loss": 3.4748, + "epoch": 0.02, + "grad_norm": 31.415233612060547, + "learning_rate": 1.9838513994059326e-05, + "loss": 2.7609, "step": 194 }, { - "epoch": 0.06, - "grad_norm": 20.119901657104492, - "learning_rate": 1.9611105542748324e-05, - "loss": 4.6922, + "epoch": 0.02, + "grad_norm": 28.023122787475586, + "learning_rate": 1.9837677279002636e-05, + "loss": 4.1092, "step": 195 }, { - "epoch": 0.06, - "grad_norm": 20.794160842895508, - "learning_rate": 1.960910093214393e-05, - "loss": 4.3524, + "epoch": 0.02, + "grad_norm": 22.933298110961914, + "learning_rate": 1.983684056394595e-05, + "loss": 3.9792, "step": 196 }, { - "epoch": 0.06, - "grad_norm": 16.995960235595703, - "learning_rate": 1.9607096321539544e-05, - "loss": 5.311, + "epoch": 0.02, + "grad_norm": 25.238265991210938, + "learning_rate": 1.9836003848889263e-05, + "loss": 3.7678, "step": 197 }, { - "epoch": 0.06, - "grad_norm": 28.07927131652832, - "learning_rate": 1.9605091710935154e-05, - "loss": 4.5386, + "epoch": 0.02, + "grad_norm": 34.65019989013672, + "learning_rate": 1.9835167133832577e-05, + "loss": 3.7924, "step": 198 }, { - "epoch": 0.06, - "grad_norm": 20.054431915283203, - "learning_rate": 1.960308710033076e-05, - "loss": 3.8613, + "epoch": 0.02, + "grad_norm": 40.85706329345703, + "learning_rate": 1.9834330418775887e-05, + "loss": 4.5442, "step": 199 }, { - "epoch": 0.06, - "grad_norm": 19.08523941040039, - "learning_rate": 1.9601082489726375e-05, - "loss": 4.5862, + "epoch": 0.03, + "grad_norm": 19.91678237915039, + "learning_rate": 1.98334937037192e-05, + "loss": 4.335, "step": 200 }, { - "epoch": 0.06, - "grad_norm": 20.86153793334961, - "learning_rate": 1.959907787912198e-05, - "loss": 4.4943, + "epoch": 0.03, + "grad_norm": 27.40424156188965, + "learning_rate": 1.9832656988662515e-05, + "loss": 4.0261, "step": 201 }, { - "epoch": 0.06, - "grad_norm": 17.117141723632812, - "learning_rate": 1.959707326851759e-05, - "loss": 3.1455, + "epoch": 0.03, + "grad_norm": 28.54695701599121, + "learning_rate": 1.9831820273605825e-05, + "loss": 5.5133, "step": 202 }, { - "epoch": 0.06, - "grad_norm": 20.701730728149414, - "learning_rate": 1.95950686579132e-05, - "loss": 4.1613, + "epoch": 0.03, + "grad_norm": 21.289785385131836, + "learning_rate": 1.983098355854914e-05, + "loss": 2.3904, "step": 203 }, { - "epoch": 0.06, - "grad_norm": 17.824867248535156, - "learning_rate": 1.959306404730881e-05, - "loss": 3.764, + "epoch": 0.03, + "grad_norm": 20.005878448486328, + "learning_rate": 1.9830146843492452e-05, + "loss": 4.9863, "step": 204 }, { - "epoch": 0.06, - "grad_norm": 23.23500633239746, - "learning_rate": 1.959105943670442e-05, - "loss": 3.647, + "epoch": 0.03, + "grad_norm": 27.44087028503418, + "learning_rate": 1.9829310128435762e-05, + "loss": 4.5081, "step": 205 }, { - "epoch": 0.06, - "grad_norm": 19.766569137573242, - "learning_rate": 1.958905482610003e-05, - "loss": 3.9145, + "epoch": 0.03, + "grad_norm": 42.43584442138672, + "learning_rate": 1.9828473413379076e-05, + "loss": 5.4452, "step": 206 }, { - "epoch": 0.06, - "grad_norm": 22.38438606262207, - "learning_rate": 1.9587050215495642e-05, - "loss": 4.4528, + "epoch": 0.03, + "grad_norm": 25.16995620727539, + "learning_rate": 1.9827636698322386e-05, + "loss": 4.7488, "step": 207 }, { - "epoch": 0.06, - "grad_norm": 17.628910064697266, - "learning_rate": 1.9585045604891252e-05, - "loss": 3.7731, + "epoch": 0.03, + "grad_norm": 13.992096900939941, + "learning_rate": 1.98267999832657e-05, + "loss": 5.4432, "step": 208 }, { - "epoch": 0.06, - "grad_norm": 18.171613693237305, - "learning_rate": 1.9583040994286862e-05, - "loss": 4.1904, + "epoch": 0.03, + "grad_norm": 32.74681854248047, + "learning_rate": 1.9825963268209014e-05, + "loss": 4.6107, "step": 209 }, { - "epoch": 0.06, - "grad_norm": 27.142881393432617, - "learning_rate": 1.958103638368247e-05, - "loss": 4.3347, + "epoch": 0.03, + "grad_norm": 29.164478302001953, + "learning_rate": 1.9825126553152324e-05, + "loss": 3.5511, "step": 210 }, { - "epoch": 0.06, - "grad_norm": 13.47857666015625, - "learning_rate": 1.9579031773078082e-05, - "loss": 3.2086, + "epoch": 0.03, + "grad_norm": 25.634967803955078, + "learning_rate": 1.9824289838095637e-05, + "loss": 4.1461, "step": 211 }, { - "epoch": 0.06, - "grad_norm": 18.275033950805664, - "learning_rate": 1.9577027162473692e-05, - "loss": 4.6453, + "epoch": 0.03, + "grad_norm": 26.063108444213867, + "learning_rate": 1.982345312303895e-05, + "loss": 2.5191, "step": 212 }, { - "epoch": 0.06, - "grad_norm": 17.69455337524414, - "learning_rate": 1.95750225518693e-05, - "loss": 3.5041, + "epoch": 0.03, + "grad_norm": 30.398160934448242, + "learning_rate": 1.982261640798226e-05, + "loss": 4.1432, "step": 213 }, { - "epoch": 0.06, - "grad_norm": 16.023883819580078, - "learning_rate": 1.9573017941264912e-05, - "loss": 4.2656, + "epoch": 0.03, + "grad_norm": 20.190427780151367, + "learning_rate": 1.9821779692925575e-05, + "loss": 2.7101, "step": 214 }, { - "epoch": 0.06, - "grad_norm": 25.941577911376953, - "learning_rate": 1.957101333066052e-05, - "loss": 3.9683, + "epoch": 0.03, + "grad_norm": 29.440567016601562, + "learning_rate": 1.982094297786889e-05, + "loss": 4.5135, "step": 215 }, { - "epoch": 0.06, - "grad_norm": 28.56459617614746, - "learning_rate": 1.956900872005613e-05, - "loss": 4.1389, + "epoch": 0.03, + "grad_norm": 24.399301528930664, + "learning_rate": 1.98201062628122e-05, + "loss": 3.446, "step": 216 }, { - "epoch": 0.07, - "grad_norm": 18.201026916503906, - "learning_rate": 1.9567004109451743e-05, - "loss": 4.2534, + "epoch": 0.03, + "grad_norm": 19.4206600189209, + "learning_rate": 1.9819269547755513e-05, + "loss": 3.162, "step": 217 }, { - "epoch": 0.07, - "grad_norm": 17.178457260131836, - "learning_rate": 1.956499949884735e-05, - "loss": 3.8277, + "epoch": 0.03, + "grad_norm": 23.20094108581543, + "learning_rate": 1.9818432832698826e-05, + "loss": 5.614, "step": 218 }, { - "epoch": 0.07, - "grad_norm": 38.801876068115234, - "learning_rate": 1.956299488824296e-05, - "loss": 4.1429, + "epoch": 0.03, + "grad_norm": 23.896318435668945, + "learning_rate": 1.981759611764214e-05, + "loss": 2.5011, "step": 219 }, { - "epoch": 0.07, - "grad_norm": 23.956295013427734, - "learning_rate": 1.956099027763857e-05, - "loss": 5.4837, + "epoch": 0.03, + "grad_norm": 27.542938232421875, + "learning_rate": 1.981675940258545e-05, + "loss": 1.6989, "step": 220 }, { - "epoch": 0.07, - "grad_norm": 16.060426712036133, - "learning_rate": 1.955898566703418e-05, - "loss": 3.7044, + "epoch": 0.03, + "grad_norm": 20.2366886138916, + "learning_rate": 1.9815922687528764e-05, + "loss": 2.322, "step": 221 }, { - "epoch": 0.07, - "grad_norm": 18.859159469604492, - "learning_rate": 1.955698105642979e-05, - "loss": 4.168, + "epoch": 0.03, + "grad_norm": 19.03917694091797, + "learning_rate": 1.9815085972472077e-05, + "loss": 7.0657, "step": 222 }, { - "epoch": 0.07, - "grad_norm": 21.508644104003906, - "learning_rate": 1.95549764458254e-05, - "loss": 4.5285, + "epoch": 0.03, + "grad_norm": 80.15726470947266, + "learning_rate": 1.9814249257415388e-05, + "loss": 4.5171, "step": 223 }, { - "epoch": 0.07, - "grad_norm": 25.023488998413086, - "learning_rate": 1.955297183522101e-05, - "loss": 3.2762, + "epoch": 0.03, + "grad_norm": 24.671308517456055, + "learning_rate": 1.98134125423587e-05, + "loss": 5.1079, "step": 224 }, { - "epoch": 0.07, - "grad_norm": 16.727890014648438, - "learning_rate": 1.955096722461662e-05, - "loss": 3.3409, + "epoch": 0.03, + "grad_norm": 17.61369514465332, + "learning_rate": 1.9812575827302015e-05, + "loss": 2.1667, "step": 225 }, { - "epoch": 0.07, - "grad_norm": 20.456737518310547, - "learning_rate": 1.954896261401223e-05, - "loss": 4.1665, + "epoch": 0.03, + "grad_norm": 23.398298263549805, + "learning_rate": 1.981173911224533e-05, + "loss": 4.2372, "step": 226 }, { - "epoch": 0.07, - "grad_norm": 17.329370498657227, - "learning_rate": 1.954695800340784e-05, - "loss": 3.0992, + "epoch": 0.03, + "grad_norm": 29.489824295043945, + "learning_rate": 1.981090239718864e-05, + "loss": 4.3447, "step": 227 }, { - "epoch": 0.07, - "grad_norm": 39.43778610229492, - "learning_rate": 1.954495339280345e-05, - "loss": 4.5199, + "epoch": 0.03, + "grad_norm": 44.913883209228516, + "learning_rate": 1.9810065682131953e-05, + "loss": 3.274, "step": 228 }, { - "epoch": 0.07, - "grad_norm": 20.120681762695312, - "learning_rate": 1.954294878219906e-05, - "loss": 4.6025, + "epoch": 0.03, + "grad_norm": 33.340850830078125, + "learning_rate": 1.9809228967075266e-05, + "loss": 3.2506, "step": 229 }, { - "epoch": 0.07, - "grad_norm": 22.665719985961914, - "learning_rate": 1.954094417159467e-05, - "loss": 4.3815, + "epoch": 0.03, + "grad_norm": 19.065631866455078, + "learning_rate": 1.9808392252018576e-05, + "loss": 4.2629, "step": 230 }, { - "epoch": 0.07, - "grad_norm": 24.487995147705078, - "learning_rate": 1.953893956099028e-05, - "loss": 4.2686, + "epoch": 0.03, + "grad_norm": 24.77507972717285, + "learning_rate": 1.980755553696189e-05, + "loss": 3.236, "step": 231 }, { - "epoch": 0.07, - "grad_norm": 32.88031768798828, - "learning_rate": 1.9536934950385887e-05, - "loss": 4.3775, + "epoch": 0.03, + "grad_norm": 55.65793991088867, + "learning_rate": 1.98067188219052e-05, + "loss": 3.031, "step": 232 }, { - "epoch": 0.07, - "grad_norm": 22.327272415161133, - "learning_rate": 1.95349303397815e-05, - "loss": 3.7784, + "epoch": 0.03, + "grad_norm": 14.242398262023926, + "learning_rate": 1.9805882106848514e-05, + "loss": 4.0164, "step": 233 }, { - "epoch": 0.07, - "grad_norm": 33.93623352050781, - "learning_rate": 1.9532925729177107e-05, - "loss": 3.2231, + "epoch": 0.03, + "grad_norm": 25.252952575683594, + "learning_rate": 1.9805045391791828e-05, + "loss": 3.5733, "step": 234 }, { - "epoch": 0.07, - "grad_norm": 24.04096794128418, - "learning_rate": 1.9530921118572717e-05, - "loss": 4.1805, + "epoch": 0.03, + "grad_norm": 26.969528198242188, + "learning_rate": 1.9804208676735138e-05, + "loss": 4.2902, "step": 235 }, { - "epoch": 0.07, - "grad_norm": 18.985858917236328, - "learning_rate": 1.952891650796833e-05, - "loss": 4.1365, + "epoch": 0.03, + "grad_norm": 18.014312744140625, + "learning_rate": 1.980337196167845e-05, + "loss": 4.9391, "step": 236 }, { - "epoch": 0.07, - "grad_norm": 21.616893768310547, - "learning_rate": 1.9526911897363937e-05, - "loss": 2.4831, + "epoch": 0.03, + "grad_norm": 67.04398345947266, + "learning_rate": 1.9802535246621762e-05, + "loss": 2.8289, "step": 237 }, { - "epoch": 0.07, - "grad_norm": 26.768409729003906, - "learning_rate": 1.9524907286759548e-05, - "loss": 4.6538, + "epoch": 0.03, + "grad_norm": 24.836767196655273, + "learning_rate": 1.9801698531565076e-05, + "loss": 2.4735, "step": 238 }, { - "epoch": 0.07, - "grad_norm": 21.572690963745117, - "learning_rate": 1.9522902676155158e-05, - "loss": 3.7805, + "epoch": 0.03, + "grad_norm": 36.715023040771484, + "learning_rate": 1.980086181650839e-05, + "loss": 4.0768, "step": 239 }, { - "epoch": 0.07, - "grad_norm": 14.675576210021973, - "learning_rate": 1.9520898065550768e-05, - "loss": 3.3591, - "step": 240 - }, - { - "epoch": 0.07, - "eval_loss": 1.399770975112915, - "eval_runtime": 43.6698, - "eval_samples_per_second": 33.868, - "eval_steps_per_second": 33.868, + "epoch": 0.03, + "grad_norm": 17.729856491088867, + "learning_rate": 1.9800025101451703e-05, + "loss": 5.27, "step": 240 }, { - "epoch": 0.07, - "grad_norm": 39.946598052978516, - "learning_rate": 1.9518893454946378e-05, - "loss": 3.3287, + "epoch": 0.03, + "grad_norm": 45.96723556518555, + "learning_rate": 1.9799188386395013e-05, + "loss": 4.0394, "step": 241 }, { - "epoch": 0.07, - "grad_norm": 22.521690368652344, - "learning_rate": 1.9516888844341988e-05, - "loss": 3.3806, + "epoch": 0.03, + "grad_norm": 92.66098022460938, + "learning_rate": 1.9798351671338327e-05, + "loss": 3.5547, "step": 242 }, { - "epoch": 0.07, - "grad_norm": 18.76473045349121, - "learning_rate": 1.9514884233737598e-05, - "loss": 4.1907, + "epoch": 0.03, + "grad_norm": 19.052738189697266, + "learning_rate": 1.979751495628164e-05, + "loss": 3.59, "step": 243 }, { - "epoch": 0.07, - "grad_norm": 17.154766082763672, - "learning_rate": 1.9512879623133208e-05, - "loss": 3.7032, + "epoch": 0.03, + "grad_norm": 27.923494338989258, + "learning_rate": 1.979667824122495e-05, + "loss": 3.4968, "step": 244 }, { - "epoch": 0.07, - "grad_norm": 25.60683250427246, - "learning_rate": 1.9510875012528818e-05, - "loss": 4.4485, + "epoch": 0.03, + "grad_norm": 24.136812210083008, + "learning_rate": 1.9795841526168264e-05, + "loss": 4.4669, "step": 245 }, { - "epoch": 0.07, - "grad_norm": 29.663965225219727, - "learning_rate": 1.9508870401924428e-05, - "loss": 4.5449, + "epoch": 0.03, + "grad_norm": 22.240131378173828, + "learning_rate": 1.9795004811111578e-05, + "loss": 4.423, "step": 246 }, { - "epoch": 0.07, - "grad_norm": 24.08624267578125, - "learning_rate": 1.950686579132004e-05, - "loss": 4.6023, + "epoch": 0.03, + "grad_norm": 20.765474319458008, + "learning_rate": 1.979416809605489e-05, + "loss": 3.8466, "step": 247 }, { - "epoch": 0.07, - "grad_norm": 17.730571746826172, - "learning_rate": 1.950486118071565e-05, - "loss": 4.7885, + "epoch": 0.03, + "grad_norm": 16.91594123840332, + "learning_rate": 1.9793331380998202e-05, + "loss": 3.9957, "step": 248 }, { - "epoch": 0.07, - "grad_norm": 16.919239044189453, - "learning_rate": 1.950285657011126e-05, - "loss": 3.2188, + "epoch": 0.03, + "grad_norm": 46.77788543701172, + "learning_rate": 1.9792494665941515e-05, + "loss": 2.9274, "step": 249 }, { - "epoch": 0.08, - "grad_norm": 16.421016693115234, - "learning_rate": 1.950085195950687e-05, - "loss": 4.6058, + "epoch": 0.03, + "grad_norm": 26.280046463012695, + "learning_rate": 1.979165795088483e-05, + "loss": 5.7432, "step": 250 }, { - "epoch": 0.08, - "grad_norm": 26.03248405456543, - "learning_rate": 1.9498847348902475e-05, - "loss": 5.1777, + "epoch": 0.03, + "grad_norm": 32.9533576965332, + "learning_rate": 1.979082123582814e-05, + "loss": 3.7103, "step": 251 }, { - "epoch": 0.08, - "grad_norm": 24.347097396850586, - "learning_rate": 1.949684273829809e-05, - "loss": 4.5525, + "epoch": 0.03, + "grad_norm": 21.88126564025879, + "learning_rate": 1.9789984520771453e-05, + "loss": 5.8115, "step": 252 }, { - "epoch": 0.08, - "grad_norm": 16.834897994995117, - "learning_rate": 1.9494838127693695e-05, - "loss": 3.7237, + "epoch": 0.03, + "grad_norm": 25.590606689453125, + "learning_rate": 1.9789147805714767e-05, + "loss": 5.702, "step": 253 }, { - "epoch": 0.08, - "grad_norm": 14.267677307128906, - "learning_rate": 1.9492833517089306e-05, - "loss": 4.2919, + "epoch": 0.03, + "grad_norm": 16.906322479248047, + "learning_rate": 1.978831109065808e-05, + "loss": 4.2226, "step": 254 }, { - "epoch": 0.08, - "grad_norm": 14.566679954528809, - "learning_rate": 1.949082890648492e-05, - "loss": 4.2664, + "epoch": 0.03, + "grad_norm": 54.520263671875, + "learning_rate": 1.978747437560139e-05, + "loss": 3.7309, "step": 255 }, { - "epoch": 0.08, - "grad_norm": 20.296588897705078, - "learning_rate": 1.9488824295880526e-05, - "loss": 4.1636, + "epoch": 0.03, + "grad_norm": 15.407824516296387, + "learning_rate": 1.9786637660544704e-05, + "loss": 1.4514, "step": 256 }, { - "epoch": 0.08, - "grad_norm": 19.508743286132812, - "learning_rate": 1.9486819685276136e-05, - "loss": 3.5817, + "epoch": 0.03, + "grad_norm": 27.355886459350586, + "learning_rate": 1.9785800945488018e-05, + "loss": 6.2087, "step": 257 }, { - "epoch": 0.08, - "grad_norm": 22.892982482910156, - "learning_rate": 1.9484815074671746e-05, - "loss": 4.1911, + "epoch": 0.03, + "grad_norm": 19.986955642700195, + "learning_rate": 1.9784964230431328e-05, + "loss": 4.4818, "step": 258 }, { - "epoch": 0.08, - "grad_norm": 20.90638542175293, - "learning_rate": 1.9482810464067356e-05, - "loss": 3.3314, + "epoch": 0.03, + "grad_norm": 19.87293243408203, + "learning_rate": 1.9784127515374642e-05, + "loss": 3.9836, "step": 259 }, { - "epoch": 0.08, - "grad_norm": 16.314008712768555, - "learning_rate": 1.9480805853462966e-05, - "loss": 3.5795, + "epoch": 0.03, + "grad_norm": 19.814401626586914, + "learning_rate": 1.9783290800317952e-05, + "loss": 4.6888, "step": 260 }, { - "epoch": 0.08, - "grad_norm": 19.024072647094727, - "learning_rate": 1.9478801242858576e-05, - "loss": 3.4856, + "epoch": 0.03, + "grad_norm": 63.62940979003906, + "learning_rate": 1.9782454085261266e-05, + "loss": 4.4955, "step": 261 }, { - "epoch": 0.08, - "grad_norm": 15.41294002532959, - "learning_rate": 1.9476796632254186e-05, - "loss": 3.9168, + "epoch": 0.03, + "grad_norm": 21.967994689941406, + "learning_rate": 1.978161737020458e-05, + "loss": 3.8234, "step": 262 }, { - "epoch": 0.08, - "grad_norm": 27.279674530029297, - "learning_rate": 1.9474792021649796e-05, - "loss": 4.4373, + "epoch": 0.03, + "grad_norm": 24.47323989868164, + "learning_rate": 1.978078065514789e-05, + "loss": 3.2202, "step": 263 }, { - "epoch": 0.08, - "grad_norm": 26.365346908569336, - "learning_rate": 1.9472787411045406e-05, - "loss": 4.3507, + "epoch": 0.03, + "grad_norm": 31.2077693939209, + "learning_rate": 1.9779943940091203e-05, + "loss": 4.4682, "step": 264 }, { - "epoch": 0.08, - "grad_norm": 18.319107055664062, - "learning_rate": 1.9470782800441016e-05, - "loss": 4.1936, + "epoch": 0.03, + "grad_norm": 16.750263214111328, + "learning_rate": 1.9779107225034514e-05, + "loss": 3.5919, "step": 265 }, { - "epoch": 0.08, - "grad_norm": 16.385744094848633, - "learning_rate": 1.9468778189836627e-05, - "loss": 4.3412, + "epoch": 0.03, + "grad_norm": 23.57237434387207, + "learning_rate": 1.9778270509977827e-05, + "loss": 1.6616, "step": 266 }, { - "epoch": 0.08, - "grad_norm": 17.853548049926758, - "learning_rate": 1.9466773579232237e-05, - "loss": 4.4193, + "epoch": 0.03, + "grad_norm": 22.44601058959961, + "learning_rate": 1.977743379492114e-05, + "loss": 4.5676, "step": 267 }, { - "epoch": 0.08, - "grad_norm": 23.52976417541504, - "learning_rate": 1.9464768968627847e-05, - "loss": 5.0003, + "epoch": 0.03, + "grad_norm": 24.28874969482422, + "learning_rate": 1.9776597079864454e-05, + "loss": 4.0052, "step": 268 }, { - "epoch": 0.08, - "grad_norm": 20.375717163085938, - "learning_rate": 1.9462764358023457e-05, - "loss": 3.6193, + "epoch": 0.03, + "grad_norm": 15.053487777709961, + "learning_rate": 1.9775760364807765e-05, + "loss": 5.6047, "step": 269 }, { - "epoch": 0.08, - "grad_norm": 19.429637908935547, - "learning_rate": 1.9460759747419063e-05, - "loss": 4.0949, + "epoch": 0.03, + "grad_norm": 16.717178344726562, + "learning_rate": 1.977492364975108e-05, + "loss": 1.4622, "step": 270 }, { - "epoch": 0.08, - "grad_norm": 16.466793060302734, - "learning_rate": 1.9458755136814677e-05, - "loss": 3.3825, + "epoch": 0.03, + "grad_norm": 127.36611938476562, + "learning_rate": 1.9774086934694392e-05, + "loss": 3.3587, "step": 271 }, { - "epoch": 0.08, - "grad_norm": 28.024784088134766, - "learning_rate": 1.9456750526210287e-05, - "loss": 4.1028, + "epoch": 0.03, + "grad_norm": 24.546730041503906, + "learning_rate": 1.9773250219637702e-05, + "loss": 2.8608, "step": 272 }, { - "epoch": 0.08, - "grad_norm": 20.214069366455078, - "learning_rate": 1.9454745915605894e-05, - "loss": 3.611, + "epoch": 0.03, + "grad_norm": 27.449810028076172, + "learning_rate": 1.9772413504581016e-05, + "loss": 3.8879, "step": 273 }, { - "epoch": 0.08, - "grad_norm": 17.36444854736328, - "learning_rate": 1.9452741305001507e-05, - "loss": 3.7358, + "epoch": 0.03, + "grad_norm": 23.76478385925293, + "learning_rate": 1.977157678952433e-05, + "loss": 3.8756, "step": 274 }, { - "epoch": 0.08, - "grad_norm": 17.746870040893555, - "learning_rate": 1.9450736694397114e-05, - "loss": 3.527, + "epoch": 0.03, + "grad_norm": 28.613237380981445, + "learning_rate": 1.9770740074467643e-05, + "loss": 1.9286, "step": 275 }, { - "epoch": 0.08, - "grad_norm": 15.168383598327637, - "learning_rate": 1.9448732083792724e-05, - "loss": 3.3192, + "epoch": 0.03, + "grad_norm": 47.288856506347656, + "learning_rate": 1.9769903359410954e-05, + "loss": 4.0929, "step": 276 }, { - "epoch": 0.08, - "grad_norm": 24.655101776123047, - "learning_rate": 1.9446727473188334e-05, - "loss": 5.3757, + "epoch": 0.03, + "grad_norm": 24.35297966003418, + "learning_rate": 1.9769066644354267e-05, + "loss": 4.455, "step": 277 }, { - "epoch": 0.08, - "grad_norm": 22.04741668701172, - "learning_rate": 1.9444722862583944e-05, - "loss": 3.5021, + "epoch": 0.03, + "grad_norm": 70.38229370117188, + "learning_rate": 1.976822992929758e-05, + "loss": 3.4509, "step": 278 }, { - "epoch": 0.08, - "grad_norm": 15.345871925354004, - "learning_rate": 1.9442718251979554e-05, - "loss": 4.5815, + "epoch": 0.04, + "grad_norm": 15.766568183898926, + "learning_rate": 1.976739321424089e-05, + "loss": 3.3342, "step": 279 }, { - "epoch": 0.08, - "grad_norm": 17.076583862304688, - "learning_rate": 1.9440713641375164e-05, - "loss": 4.7359, + "epoch": 0.04, + "grad_norm": 22.440858840942383, + "learning_rate": 1.9766556499184205e-05, + "loss": 5.897, "step": 280 }, { - "epoch": 0.08, - "grad_norm": 17.223661422729492, - "learning_rate": 1.9438709030770774e-05, - "loss": 3.6285, + "epoch": 0.04, + "grad_norm": 15.124077796936035, + "learning_rate": 1.976571978412752e-05, + "loss": 3.1477, "step": 281 }, { - "epoch": 0.08, - "grad_norm": 19.869312286376953, - "learning_rate": 1.9436704420166385e-05, - "loss": 3.9027, + "epoch": 0.04, + "grad_norm": 17.690269470214844, + "learning_rate": 1.9764883069070832e-05, + "loss": 4.4101, "step": 282 }, { - "epoch": 0.09, - "grad_norm": 20.547405242919922, - "learning_rate": 1.9434699809561995e-05, - "loss": 3.6283, + "epoch": 0.04, + "grad_norm": 45.84641647338867, + "learning_rate": 1.9764046354014142e-05, + "loss": 6.0457, "step": 283 }, { - "epoch": 0.09, - "grad_norm": 18.522144317626953, - "learning_rate": 1.9432695198957605e-05, - "loss": 4.2172, + "epoch": 0.04, + "grad_norm": 20.248172760009766, + "learning_rate": 1.9763209638957456e-05, + "loss": 3.7918, "step": 284 }, { - "epoch": 0.09, - "grad_norm": 30.722158432006836, - "learning_rate": 1.9430690588353215e-05, - "loss": 3.6669, + "epoch": 0.04, + "grad_norm": 18.53105926513672, + "learning_rate": 1.9762372923900766e-05, + "loss": 3.0112, "step": 285 }, { - "epoch": 0.09, - "grad_norm": 16.493730545043945, - "learning_rate": 1.9428685977748825e-05, - "loss": 4.1544, + "epoch": 0.04, + "grad_norm": 44.17702102661133, + "learning_rate": 1.976153620884408e-05, + "loss": 3.3778, "step": 286 }, { - "epoch": 0.09, - "grad_norm": 15.535743713378906, - "learning_rate": 1.942668136714443e-05, - "loss": 4.1201, + "epoch": 0.04, + "grad_norm": 27.555721282958984, + "learning_rate": 1.9760699493787393e-05, + "loss": 3.6562, "step": 287 }, { - "epoch": 0.09, - "grad_norm": 136.23187255859375, - "learning_rate": 1.9424676756540045e-05, - "loss": 4.5668, + "epoch": 0.04, + "grad_norm": 29.536550521850586, + "learning_rate": 1.9759862778730704e-05, + "loss": 5.412, "step": 288 }, { - "epoch": 0.09, - "grad_norm": 15.681511878967285, - "learning_rate": 1.9422672145935652e-05, - "loss": 4.1992, + "epoch": 0.04, + "grad_norm": 24.517093658447266, + "learning_rate": 1.9759026063674017e-05, + "loss": 4.254, "step": 289 }, { - "epoch": 0.09, - "grad_norm": 12.06187915802002, - "learning_rate": 1.9420667535331262e-05, - "loss": 2.6199, + "epoch": 0.04, + "grad_norm": 17.41961097717285, + "learning_rate": 1.9758189348617328e-05, + "loss": 3.973, "step": 290 }, { - "epoch": 0.09, - "grad_norm": 14.176081657409668, - "learning_rate": 1.9418662924726875e-05, - "loss": 2.6663, + "epoch": 0.04, + "grad_norm": 14.359147071838379, + "learning_rate": 1.975735263356064e-05, + "loss": 1.5225, "step": 291 }, { - "epoch": 0.09, - "grad_norm": 16.331350326538086, - "learning_rate": 1.9416658314122482e-05, - "loss": 4.1395, + "epoch": 0.04, + "grad_norm": 18.383588790893555, + "learning_rate": 1.9756515918503955e-05, + "loss": 2.5873, "step": 292 }, { - "epoch": 0.09, - "grad_norm": 25.406064987182617, - "learning_rate": 1.9414653703518095e-05, - "loss": 3.2631, + "epoch": 0.04, + "grad_norm": 20.837644577026367, + "learning_rate": 1.9755679203447265e-05, + "loss": 4.5986, "step": 293 }, { - "epoch": 0.09, - "grad_norm": 25.89323616027832, - "learning_rate": 1.9412649092913702e-05, - "loss": 4.4711, + "epoch": 0.04, + "grad_norm": 17.78486442565918, + "learning_rate": 1.975484248839058e-05, + "loss": 3.6508, "step": 294 }, { - "epoch": 0.09, - "grad_norm": 12.358001708984375, - "learning_rate": 1.9410644482309312e-05, - "loss": 4.0686, + "epoch": 0.04, + "grad_norm": 23.529550552368164, + "learning_rate": 1.9754005773333893e-05, + "loss": 3.8641, "step": 295 }, { - "epoch": 0.09, - "grad_norm": 17.452009201049805, - "learning_rate": 1.9408639871704922e-05, - "loss": 3.6253, + "epoch": 0.04, + "grad_norm": 28.262685775756836, + "learning_rate": 1.9753169058277206e-05, + "loss": 5.1546, "step": 296 }, { - "epoch": 0.09, - "grad_norm": 28.617233276367188, - "learning_rate": 1.9406635261100532e-05, - "loss": 4.9357, + "epoch": 0.04, + "grad_norm": 45.46820831298828, + "learning_rate": 1.9752332343220516e-05, + "loss": 3.7566, "step": 297 }, { - "epoch": 0.09, - "grad_norm": 17.00152587890625, - "learning_rate": 1.9404630650496142e-05, - "loss": 3.7579, + "epoch": 0.04, + "grad_norm": 18.303991317749023, + "learning_rate": 1.975149562816383e-05, + "loss": 2.5184, "step": 298 }, { - "epoch": 0.09, - "grad_norm": 17.73528289794922, - "learning_rate": 1.9402626039891753e-05, - "loss": 3.7798, + "epoch": 0.04, + "grad_norm": 14.7951078414917, + "learning_rate": 1.9750658913107144e-05, + "loss": 3.0745, "step": 299 }, { - "epoch": 0.09, - "grad_norm": 15.016806602478027, - "learning_rate": 1.9400621429287363e-05, - "loss": 4.3192, + "epoch": 0.04, + "grad_norm": 42.80633544921875, + "learning_rate": 1.9749822198050454e-05, + "loss": 3.0089, "step": 300 }, { - "epoch": 0.09, - "grad_norm": 12.723051071166992, - "learning_rate": 1.9398616818682973e-05, - "loss": 3.8939, + "epoch": 0.04, + "grad_norm": 30.628992080688477, + "learning_rate": 1.9748985482993768e-05, + "loss": 4.6328, "step": 301 }, { - "epoch": 0.09, - "grad_norm": 14.875953674316406, - "learning_rate": 1.9396612208078583e-05, - "loss": 2.8365, + "epoch": 0.04, + "grad_norm": 17.47206687927246, + "learning_rate": 1.974814876793708e-05, + "loss": 3.6176, "step": 302 }, { - "epoch": 0.09, - "grad_norm": 16.63105583190918, - "learning_rate": 1.9394607597474193e-05, - "loss": 3.7121, + "epoch": 0.04, + "grad_norm": 12.888838768005371, + "learning_rate": 1.9747312052880395e-05, + "loss": 1.8506, "step": 303 }, { - "epoch": 0.09, - "grad_norm": 22.02267074584961, - "learning_rate": 1.9392602986869803e-05, - "loss": 5.0273, + "epoch": 0.04, + "grad_norm": 14.139362335205078, + "learning_rate": 1.9746475337823705e-05, + "loss": 2.9678, "step": 304 }, { - "epoch": 0.09, - "grad_norm": 44.309234619140625, - "learning_rate": 1.9390598376265413e-05, - "loss": 4.5806, + "epoch": 0.04, + "grad_norm": 22.3652400970459, + "learning_rate": 1.974563862276702e-05, + "loss": 3.513, "step": 305 }, { - "epoch": 0.09, - "grad_norm": 20.403287887573242, - "learning_rate": 1.938859376566102e-05, - "loss": 3.2671, + "epoch": 0.04, + "grad_norm": 16.76512336730957, + "learning_rate": 1.9744801907710332e-05, + "loss": 3.6319, "step": 306 }, { - "epoch": 0.09, - "grad_norm": 15.081746101379395, - "learning_rate": 1.9386589155056633e-05, - "loss": 4.2114, + "epoch": 0.04, + "grad_norm": 14.686978340148926, + "learning_rate": 1.9743965192653643e-05, + "loss": 2.93, "step": 307 }, { - "epoch": 0.09, - "grad_norm": 16.417646408081055, - "learning_rate": 1.938458454445224e-05, - "loss": 3.878, + "epoch": 0.04, + "grad_norm": 17.279701232910156, + "learning_rate": 1.9743128477596956e-05, + "loss": 6.5399, "step": 308 }, { - "epoch": 0.09, - "grad_norm": 20.749244689941406, - "learning_rate": 1.938257993384785e-05, - "loss": 3.3179, + "epoch": 0.04, + "grad_norm": 33.21736526489258, + "learning_rate": 1.974229176254027e-05, + "loss": 4.1478, "step": 309 }, { - "epoch": 0.09, - "grad_norm": 21.518075942993164, - "learning_rate": 1.9380575323243463e-05, - "loss": 3.4908, + "epoch": 0.04, + "grad_norm": 15.007304191589355, + "learning_rate": 1.974145504748358e-05, + "loss": 4.3891, "step": 310 }, { - "epoch": 0.09, - "grad_norm": 17.475610733032227, - "learning_rate": 1.937857071263907e-05, - "loss": 4.1835, + "epoch": 0.04, + "grad_norm": 14.931962966918945, + "learning_rate": 1.9740618332426894e-05, + "loss": 4.9052, "step": 311 }, { - "epoch": 0.09, - "grad_norm": 20.958995819091797, - "learning_rate": 1.937656610203468e-05, - "loss": 4.5692, + "epoch": 0.04, + "grad_norm": 16.65130615234375, + "learning_rate": 1.9739781617370208e-05, + "loss": 2.7062, "step": 312 }, { - "epoch": 0.09, - "grad_norm": 16.16900062561035, - "learning_rate": 1.937456149143029e-05, - "loss": 2.9379, + "epoch": 0.04, + "grad_norm": 23.00065040588379, + "learning_rate": 1.9738944902313518e-05, + "loss": 3.7142, "step": 313 }, { - "epoch": 0.09, - "grad_norm": 29.76364517211914, - "learning_rate": 1.93725568808259e-05, - "loss": 4.0285, + "epoch": 0.04, + "grad_norm": 19.257022857666016, + "learning_rate": 1.973810818725683e-05, + "loss": 3.6447, "step": 314 }, { - "epoch": 0.09, - "grad_norm": 14.153215408325195, - "learning_rate": 1.937055227022151e-05, - "loss": 3.6143, + "epoch": 0.04, + "grad_norm": 18.844392776489258, + "learning_rate": 1.9737271472200145e-05, + "loss": 3.9087, "step": 315 }, { - "epoch": 0.1, - "grad_norm": 25.9129581451416, - "learning_rate": 1.936854765961712e-05, - "loss": 4.9313, + "epoch": 0.04, + "grad_norm": 21.749853134155273, + "learning_rate": 1.9736434757143455e-05, + "loss": 2.765, "step": 316 }, { - "epoch": 0.1, - "grad_norm": 12.36880874633789, - "learning_rate": 1.936654304901273e-05, - "loss": 3.5271, + "epoch": 0.04, + "grad_norm": 16.286396026611328, + "learning_rate": 1.973559804208677e-05, + "loss": 3.5264, "step": 317 }, { - "epoch": 0.1, - "grad_norm": 20.27505874633789, - "learning_rate": 1.936453843840834e-05, - "loss": 3.8401, + "epoch": 0.04, + "grad_norm": 21.13903045654297, + "learning_rate": 1.973476132703008e-05, + "loss": 4.9465, "step": 318 }, { - "epoch": 0.1, - "grad_norm": 20.562847137451172, - "learning_rate": 1.936253382780395e-05, - "loss": 4.0286, + "epoch": 0.04, + "grad_norm": 18.01095199584961, + "learning_rate": 1.9733924611973393e-05, + "loss": 4.4266, "step": 319 }, { - "epoch": 0.1, - "grad_norm": 23.03611183166504, - "learning_rate": 1.936052921719956e-05, - "loss": 3.1755, + "epoch": 0.04, + "grad_norm": 57.30530548095703, + "learning_rate": 1.9733087896916707e-05, + "loss": 4.1182, "step": 320 }, { - "epoch": 0.1, - "grad_norm": 24.045217514038086, - "learning_rate": 1.935852460659517e-05, - "loss": 4.1921, + "epoch": 0.04, + "grad_norm": 34.54814529418945, + "learning_rate": 1.9732251181860017e-05, + "loss": 6.332, "step": 321 }, { - "epoch": 0.1, - "grad_norm": 15.012109756469727, - "learning_rate": 1.935651999599078e-05, - "loss": 3.9538, + "epoch": 0.04, + "grad_norm": 15.039722442626953, + "learning_rate": 1.973141446680333e-05, + "loss": 2.0336, "step": 322 }, { - "epoch": 0.1, - "grad_norm": 16.769580841064453, - "learning_rate": 1.935451538538639e-05, - "loss": 4.1792, + "epoch": 0.04, + "grad_norm": 16.953218460083008, + "learning_rate": 1.9730577751746644e-05, + "loss": 4.2034, "step": 323 }, { - "epoch": 0.1, - "grad_norm": 15.990212440490723, - "learning_rate": 1.9352510774782e-05, - "loss": 3.3792, + "epoch": 0.04, + "grad_norm": 12.90640926361084, + "learning_rate": 1.9729741036689958e-05, + "loss": 2.2464, "step": 324 }, { - "epoch": 0.1, - "grad_norm": 17.586837768554688, - "learning_rate": 1.9350506164177608e-05, - "loss": 3.8851, + "epoch": 0.04, + "grad_norm": 28.966737747192383, + "learning_rate": 1.9728904321633268e-05, + "loss": 2.8233, "step": 325 }, { - "epoch": 0.1, - "grad_norm": 28.66331672668457, - "learning_rate": 1.934850155357322e-05, - "loss": 4.4007, + "epoch": 0.04, + "grad_norm": 21.538097381591797, + "learning_rate": 1.9728067606576582e-05, + "loss": 2.7435, "step": 326 }, { - "epoch": 0.1, - "grad_norm": 19.1878604888916, - "learning_rate": 1.934649694296883e-05, - "loss": 3.7238, + "epoch": 0.04, + "grad_norm": 20.6904354095459, + "learning_rate": 1.9727230891519895e-05, + "loss": 5.3992, "step": 327 }, { - "epoch": 0.1, - "grad_norm": 27.458173751831055, - "learning_rate": 1.9344492332364438e-05, - "loss": 3.6198, + "epoch": 0.04, + "grad_norm": 15.666752815246582, + "learning_rate": 1.9726394176463206e-05, + "loss": 3.7641, "step": 328 }, { - "epoch": 0.1, - "grad_norm": 23.124168395996094, - "learning_rate": 1.934248772176005e-05, - "loss": 3.9671, + "epoch": 0.04, + "grad_norm": 43.591861724853516, + "learning_rate": 1.972555746140652e-05, + "loss": 2.9297, "step": 329 }, { - "epoch": 0.1, - "grad_norm": 22.697099685668945, - "learning_rate": 1.934048311115566e-05, - "loss": 4.6046, + "epoch": 0.04, + "grad_norm": 23.864118576049805, + "learning_rate": 1.9724720746349833e-05, + "loss": 3.9387, "step": 330 }, { - "epoch": 0.1, - "grad_norm": 16.66628074645996, - "learning_rate": 1.933847850055127e-05, - "loss": 3.4079, + "epoch": 0.04, + "grad_norm": 19.16819953918457, + "learning_rate": 1.9723884031293147e-05, + "loss": 3.6225, "step": 331 }, { - "epoch": 0.1, - "grad_norm": 21.602243423461914, - "learning_rate": 1.933647388994688e-05, - "loss": 2.6947, + "epoch": 0.04, + "grad_norm": 25.3775634765625, + "learning_rate": 1.9723047316236457e-05, + "loss": 4.0582, "step": 332 }, { - "epoch": 0.1, - "grad_norm": 19.987550735473633, - "learning_rate": 1.933446927934249e-05, - "loss": 4.0264, + "epoch": 0.04, + "grad_norm": 17.970884323120117, + "learning_rate": 1.972221060117977e-05, + "loss": 5.6782, "step": 333 }, { - "epoch": 0.1, - "grad_norm": 27.548095703125, - "learning_rate": 1.93324646687381e-05, - "loss": 3.6234, + "epoch": 0.04, + "grad_norm": 18.719879150390625, + "learning_rate": 1.9721373886123084e-05, + "loss": 5.1692, "step": 334 }, { - "epoch": 0.1, - "grad_norm": 16.985288619995117, - "learning_rate": 1.933046005813371e-05, - "loss": 3.5273, + "epoch": 0.04, + "grad_norm": 24.480466842651367, + "learning_rate": 1.9720537171066394e-05, + "loss": 4.19, "step": 335 }, { - "epoch": 0.1, - "grad_norm": 249.91415405273438, - "learning_rate": 1.932845544752932e-05, - "loss": 3.0628, + "epoch": 0.04, + "grad_norm": 34.20682907104492, + "learning_rate": 1.9719700456009708e-05, + "loss": 1.6398, "step": 336 }, { - "epoch": 0.1, - "grad_norm": 22.47792625427246, - "learning_rate": 1.932645083692493e-05, - "loss": 3.7402, + "epoch": 0.04, + "grad_norm": 21.29566192626953, + "learning_rate": 1.9718863740953022e-05, + "loss": 2.8134, "step": 337 }, { - "epoch": 0.1, - "grad_norm": 28.15104866027832, - "learning_rate": 1.932444622632054e-05, - "loss": 3.5902, + "epoch": 0.04, + "grad_norm": 32.886497497558594, + "learning_rate": 1.9718027025896332e-05, + "loss": 5.3904, "step": 338 }, { - "epoch": 0.1, - "grad_norm": 15.687856674194336, - "learning_rate": 1.932244161571615e-05, - "loss": 3.4816, + "epoch": 0.04, + "grad_norm": 20.40854835510254, + "learning_rate": 1.9717190310839646e-05, + "loss": 4.7706, "step": 339 }, { - "epoch": 0.1, - "grad_norm": 15.60968017578125, - "learning_rate": 1.932043700511176e-05, - "loss": 3.8098, + "epoch": 0.04, + "grad_norm": 23.373363494873047, + "learning_rate": 1.971635359578296e-05, + "loss": 3.6134, "step": 340 }, { - "epoch": 0.1, - "grad_norm": 21.058176040649414, - "learning_rate": 1.931843239450737e-05, - "loss": 4.0355, + "epoch": 0.04, + "grad_norm": 23.77679443359375, + "learning_rate": 1.971551688072627e-05, + "loss": 4.1803, "step": 341 }, { - "epoch": 0.1, - "grad_norm": 52.26857376098633, - "learning_rate": 1.931642778390298e-05, - "loss": 5.9303, + "epoch": 0.04, + "grad_norm": 24.839336395263672, + "learning_rate": 1.9714680165669583e-05, + "loss": 3.7854, "step": 342 }, { - "epoch": 0.1, - "grad_norm": 14.664571762084961, - "learning_rate": 1.931442317329859e-05, - "loss": 3.5722, + "epoch": 0.04, + "grad_norm": 13.832033157348633, + "learning_rate": 1.9713843450612893e-05, + "loss": 3.402, "step": 343 }, { - "epoch": 0.1, - "grad_norm": 11.48156452178955, - "learning_rate": 1.9312418562694196e-05, - "loss": 3.8408, + "epoch": 0.04, + "grad_norm": 19.192140579223633, + "learning_rate": 1.9713006735556207e-05, + "loss": 2.4615, "step": 344 }, { - "epoch": 0.1, - "grad_norm": 18.76517677307129, - "learning_rate": 1.931041395208981e-05, - "loss": 3.5406, + "epoch": 0.04, + "grad_norm": 11.908378601074219, + "learning_rate": 1.971217002049952e-05, + "loss": 3.0252, "step": 345 }, { - "epoch": 0.1, - "grad_norm": 15.111738204956055, - "learning_rate": 1.930840934148542e-05, - "loss": 3.6611, + "epoch": 0.04, + "grad_norm": 49.56410598754883, + "learning_rate": 1.971133330544283e-05, + "loss": 2.499, "step": 346 }, { - "epoch": 0.1, - "grad_norm": 11.203923225402832, - "learning_rate": 1.9306404730881026e-05, - "loss": 3.0199, + "epoch": 0.04, + "grad_norm": 19.634252548217773, + "learning_rate": 1.9710496590386145e-05, + "loss": 3.4309, "step": 347 }, { - "epoch": 0.1, - "grad_norm": 21.51028060913086, - "learning_rate": 1.930440012027664e-05, - "loss": 3.1991, + "epoch": 0.04, + "grad_norm": 25.267864227294922, + "learning_rate": 1.970965987532946e-05, + "loss": 3.8254, "step": 348 }, { - "epoch": 0.1, - "grad_norm": 21.56675910949707, - "learning_rate": 1.9302395509672247e-05, - "loss": 3.6963, + "epoch": 0.04, + "grad_norm": 27.672697067260742, + "learning_rate": 1.970882316027277e-05, + "loss": 4.0687, "step": 349 }, { - "epoch": 0.11, - "grad_norm": 13.743776321411133, - "learning_rate": 1.9300390899067857e-05, - "loss": 3.6142, + "epoch": 0.04, + "grad_norm": 29.05243492126465, + "learning_rate": 1.9707986445216082e-05, + "loss": 2.8896, "step": 350 }, { - "epoch": 0.11, - "grad_norm": 15.742630004882812, - "learning_rate": 1.9298386288463467e-05, - "loss": 2.8555, + "epoch": 0.04, + "grad_norm": 13.088189125061035, + "learning_rate": 1.9707149730159396e-05, + "loss": 2.2714, "step": 351 }, { - "epoch": 0.11, - "grad_norm": 14.802862167358398, - "learning_rate": 1.9296381677859077e-05, - "loss": 2.9698, + "epoch": 0.04, + "grad_norm": 23.043155670166016, + "learning_rate": 1.970631301510271e-05, + "loss": 4.1696, "step": 352 }, { - "epoch": 0.11, - "grad_norm": 16.21837615966797, - "learning_rate": 1.9294377067254687e-05, - "loss": 5.3529, + "epoch": 0.04, + "grad_norm": 47.78111267089844, + "learning_rate": 1.970547630004602e-05, + "loss": 4.2777, "step": 353 }, { - "epoch": 0.11, - "grad_norm": 14.180800437927246, - "learning_rate": 1.9292372456650297e-05, - "loss": 3.7083, + "epoch": 0.04, + "grad_norm": 28.614927291870117, + "learning_rate": 1.9704639584989333e-05, + "loss": 4.6397, "step": 354 }, { - "epoch": 0.11, - "grad_norm": 16.493022918701172, - "learning_rate": 1.9290367846045907e-05, - "loss": 2.6302, + "epoch": 0.04, + "grad_norm": 25.07124137878418, + "learning_rate": 1.9703802869932647e-05, + "loss": 3.9658, "step": 355 }, { - "epoch": 0.11, - "grad_norm": 19.73296356201172, - "learning_rate": 1.9288363235441517e-05, - "loss": 3.4556, + "epoch": 0.04, + "grad_norm": 18.310688018798828, + "learning_rate": 1.9702966154875957e-05, + "loss": 3.4839, "step": 356 }, { - "epoch": 0.11, - "grad_norm": 20.31797218322754, - "learning_rate": 1.9286358624837127e-05, - "loss": 4.2031, + "epoch": 0.04, + "grad_norm": 21.59184455871582, + "learning_rate": 1.970212943981927e-05, + "loss": 2.9543, "step": 357 }, { - "epoch": 0.11, - "grad_norm": 24.127195358276367, - "learning_rate": 1.9284354014232737e-05, - "loss": 4.2493, + "epoch": 0.04, + "grad_norm": 25.248348236083984, + "learning_rate": 1.9701292724762585e-05, + "loss": 3.7935, "step": 358 }, { - "epoch": 0.11, - "grad_norm": 33.403282165527344, - "learning_rate": 1.9282349403628347e-05, - "loss": 4.6944, + "epoch": 0.05, + "grad_norm": 18.58963394165039, + "learning_rate": 1.9700456009705898e-05, + "loss": 3.1486, "step": 359 }, { - "epoch": 0.11, - "grad_norm": 52.27415466308594, - "learning_rate": 1.9280344793023958e-05, - "loss": 3.5661, - "step": 360 - }, - { - "epoch": 0.11, - "eval_loss": 1.2527186870574951, - "eval_runtime": 43.7248, - "eval_samples_per_second": 33.825, - "eval_steps_per_second": 33.825, + "epoch": 0.05, + "grad_norm": 14.266563415527344, + "learning_rate": 1.969961929464921e-05, + "loss": 3.7676, "step": 360 }, { - "epoch": 0.11, - "grad_norm": 18.096208572387695, - "learning_rate": 1.9278340182419568e-05, - "loss": 4.3089, + "epoch": 0.05, + "grad_norm": 18.322986602783203, + "learning_rate": 1.9698782579592522e-05, + "loss": 3.0848, "step": 361 }, { - "epoch": 0.11, - "grad_norm": 12.513582229614258, - "learning_rate": 1.9276335571815178e-05, - "loss": 3.7448, + "epoch": 0.05, + "grad_norm": 14.636101722717285, + "learning_rate": 1.9697945864535836e-05, + "loss": 4.5866, "step": 362 }, { - "epoch": 0.11, - "grad_norm": 18.849184036254883, - "learning_rate": 1.9274330961210784e-05, - "loss": 4.8774, + "epoch": 0.05, + "grad_norm": 41.74934005737305, + "learning_rate": 1.9697109149479146e-05, + "loss": 2.5856, "step": 363 }, { - "epoch": 0.11, - "grad_norm": 14.817676544189453, - "learning_rate": 1.9272326350606398e-05, - "loss": 4.2751, + "epoch": 0.05, + "grad_norm": 14.788008689880371, + "learning_rate": 1.969627243442246e-05, + "loss": 3.0474, "step": 364 }, { - "epoch": 0.11, - "grad_norm": 24.063844680786133, - "learning_rate": 1.9270321740002008e-05, - "loss": 3.8558, + "epoch": 0.05, + "grad_norm": 18.032474517822266, + "learning_rate": 1.9695435719365773e-05, + "loss": 6.5002, "step": 365 }, { - "epoch": 0.11, - "grad_norm": 18.589746475219727, - "learning_rate": 1.9268317129397615e-05, - "loss": 2.9172, + "epoch": 0.05, + "grad_norm": 22.44284439086914, + "learning_rate": 1.9694599004309084e-05, + "loss": 2.2314, "step": 366 }, { - "epoch": 0.11, - "grad_norm": 51.5893669128418, - "learning_rate": 1.9266312518793228e-05, - "loss": 3.2441, + "epoch": 0.05, + "grad_norm": 13.548026084899902, + "learning_rate": 1.9693762289252397e-05, + "loss": 3.7871, "step": 367 }, { - "epoch": 0.11, - "grad_norm": 18.138118743896484, - "learning_rate": 1.9264307908188835e-05, - "loss": 2.9894, + "epoch": 0.05, + "grad_norm": 24.17680549621582, + "learning_rate": 1.969292557419571e-05, + "loss": 4.6725, "step": 368 }, { - "epoch": 0.11, - "grad_norm": 14.857394218444824, - "learning_rate": 1.9262303297584445e-05, - "loss": 3.1436, + "epoch": 0.05, + "grad_norm": 22.945974349975586, + "learning_rate": 1.969208885913902e-05, + "loss": 3.5503, "step": 369 }, { - "epoch": 0.11, - "grad_norm": 17.468170166015625, - "learning_rate": 1.926029868698006e-05, - "loss": 3.2683, + "epoch": 0.05, + "grad_norm": 14.245816230773926, + "learning_rate": 1.9691252144082335e-05, + "loss": 4.1735, "step": 370 }, { - "epoch": 0.11, - "grad_norm": 17.41706657409668, - "learning_rate": 1.9258294076375665e-05, - "loss": 3.6597, + "epoch": 0.05, + "grad_norm": 16.24998664855957, + "learning_rate": 1.9690415429025645e-05, + "loss": 1.5692, "step": 371 }, { - "epoch": 0.11, - "grad_norm": 15.607809066772461, - "learning_rate": 1.9256289465771275e-05, - "loss": 3.5317, + "epoch": 0.05, + "grad_norm": 18.322080612182617, + "learning_rate": 1.968957871396896e-05, + "loss": 3.6277, "step": 372 }, { - "epoch": 0.11, - "grad_norm": 25.48333168029785, - "learning_rate": 1.9254284855166885e-05, - "loss": 4.017, + "epoch": 0.05, + "grad_norm": 19.254554748535156, + "learning_rate": 1.9688741998912272e-05, + "loss": 3.3539, "step": 373 }, { - "epoch": 0.11, - "grad_norm": 26.36655044555664, - "learning_rate": 1.9252280244562495e-05, - "loss": 3.1871, + "epoch": 0.05, + "grad_norm": 49.72288131713867, + "learning_rate": 1.9687905283855583e-05, + "loss": 3.7778, "step": 374 }, { - "epoch": 0.11, - "grad_norm": 29.703283309936523, - "learning_rate": 1.9250275633958105e-05, - "loss": 3.56, + "epoch": 0.05, + "grad_norm": 18.277828216552734, + "learning_rate": 1.9687068568798896e-05, + "loss": 3.7181, "step": 375 }, { - "epoch": 0.11, - "grad_norm": 12.352496147155762, - "learning_rate": 1.9248271023353715e-05, - "loss": 3.2961, + "epoch": 0.05, + "grad_norm": 10.964229583740234, + "learning_rate": 1.968623185374221e-05, + "loss": 2.8094, "step": 376 }, { - "epoch": 0.11, - "grad_norm": 24.971912384033203, - "learning_rate": 1.9246266412749326e-05, - "loss": 3.8872, + "epoch": 0.05, + "grad_norm": 18.096059799194336, + "learning_rate": 1.968539513868552e-05, + "loss": 3.2346, "step": 377 }, { - "epoch": 0.11, - "grad_norm": 20.017274856567383, - "learning_rate": 1.9244261802144936e-05, - "loss": 4.3486, + "epoch": 0.05, + "grad_norm": 14.133166313171387, + "learning_rate": 1.9684558423628834e-05, + "loss": 3.6737, "step": 378 }, { - "epoch": 0.11, - "grad_norm": 16.611631393432617, - "learning_rate": 1.9242257191540546e-05, - "loss": 4.1083, + "epoch": 0.05, + "grad_norm": 13.820868492126465, + "learning_rate": 1.9683721708572148e-05, + "loss": 2.8704, "step": 379 }, { - "epoch": 0.11, - "grad_norm": 94.53077697753906, - "learning_rate": 1.9240252580936152e-05, - "loss": 3.7256, + "epoch": 0.05, + "grad_norm": 28.055957794189453, + "learning_rate": 1.968288499351546e-05, + "loss": 4.1075, "step": 380 }, { - "epoch": 0.11, - "grad_norm": 14.142595291137695, - "learning_rate": 1.9238247970331766e-05, - "loss": 3.9881, + "epoch": 0.05, + "grad_norm": 18.781705856323242, + "learning_rate": 1.968204827845877e-05, + "loss": 3.0917, "step": 381 }, { - "epoch": 0.11, - "grad_norm": 15.951706886291504, - "learning_rate": 1.9236243359727373e-05, - "loss": 3.6539, + "epoch": 0.05, + "grad_norm": 14.126927375793457, + "learning_rate": 1.9681211563402085e-05, + "loss": 3.5304, "step": 382 }, { - "epoch": 0.12, - "grad_norm": 16.317209243774414, - "learning_rate": 1.9234238749122983e-05, - "loss": 4.028, + "epoch": 0.05, + "grad_norm": 13.951945304870605, + "learning_rate": 1.96803748483454e-05, + "loss": 4.9655, "step": 383 }, { - "epoch": 0.12, - "grad_norm": 15.425968170166016, - "learning_rate": 1.9232234138518596e-05, - "loss": 2.9222, + "epoch": 0.05, + "grad_norm": 21.58669090270996, + "learning_rate": 1.967953813328871e-05, + "loss": 2.5144, "step": 384 }, { - "epoch": 0.12, - "grad_norm": 15.79262924194336, - "learning_rate": 1.9230229527914203e-05, - "loss": 3.1179, + "epoch": 0.05, + "grad_norm": 9.454935073852539, + "learning_rate": 1.9678701418232023e-05, + "loss": 4.0318, "step": 385 }, { - "epoch": 0.12, - "grad_norm": 19.95915985107422, - "learning_rate": 1.9228224917309813e-05, - "loss": 3.9919, + "epoch": 0.05, + "grad_norm": 22.182817459106445, + "learning_rate": 1.9677864703175336e-05, + "loss": 2.4768, "step": 386 }, { - "epoch": 0.12, - "grad_norm": 14.439689636230469, - "learning_rate": 1.9226220306705423e-05, - "loss": 3.257, + "epoch": 0.05, + "grad_norm": 30.83213996887207, + "learning_rate": 1.967702798811865e-05, + "loss": 5.7192, "step": 387 }, { - "epoch": 0.12, - "grad_norm": 16.699209213256836, - "learning_rate": 1.9224215696101033e-05, - "loss": 3.8755, + "epoch": 0.05, + "grad_norm": 36.244850158691406, + "learning_rate": 1.967619127306196e-05, + "loss": 4.4631, "step": 388 }, { - "epoch": 0.12, - "grad_norm": 24.074472427368164, - "learning_rate": 1.9222211085496643e-05, - "loss": 4.0977, + "epoch": 0.05, + "grad_norm": 18.907621383666992, + "learning_rate": 1.9675354558005274e-05, + "loss": 3.8661, "step": 389 }, { - "epoch": 0.12, - "grad_norm": 15.57771110534668, - "learning_rate": 1.9220206474892253e-05, - "loss": 4.4361, + "epoch": 0.05, + "grad_norm": 22.727462768554688, + "learning_rate": 1.9674517842948588e-05, + "loss": 5.2718, "step": 390 }, { - "epoch": 0.12, - "grad_norm": 13.52245807647705, - "learning_rate": 1.9218201864287863e-05, - "loss": 3.7686, + "epoch": 0.05, + "grad_norm": 23.575908660888672, + "learning_rate": 1.9673681127891898e-05, + "loss": 6.1416, "step": 391 }, { - "epoch": 0.12, - "grad_norm": 19.746614456176758, - "learning_rate": 1.9216197253683473e-05, - "loss": 3.819, + "epoch": 0.05, + "grad_norm": 28.313125610351562, + "learning_rate": 1.967284441283521e-05, + "loss": 4.5796, "step": 392 }, { - "epoch": 0.12, - "grad_norm": 14.887005805969238, - "learning_rate": 1.9214192643079084e-05, - "loss": 3.8956, + "epoch": 0.05, + "grad_norm": 16.64825439453125, + "learning_rate": 1.9672007697778525e-05, + "loss": 1.9543, "step": 393 }, { - "epoch": 0.12, - "grad_norm": 18.658023834228516, - "learning_rate": 1.9212188032474694e-05, - "loss": 3.9219, + "epoch": 0.05, + "grad_norm": 28.147789001464844, + "learning_rate": 1.9671170982721835e-05, + "loss": 2.8313, "step": 394 }, { - "epoch": 0.12, - "grad_norm": 14.493846893310547, - "learning_rate": 1.9210183421870304e-05, - "loss": 4.26, + "epoch": 0.05, + "grad_norm": 27.15233039855957, + "learning_rate": 1.967033426766515e-05, + "loss": 3.023, "step": 395 }, { - "epoch": 0.12, - "grad_norm": 16.01069450378418, - "learning_rate": 1.9208178811265914e-05, - "loss": 2.9605, + "epoch": 0.05, + "grad_norm": 16.548398971557617, + "learning_rate": 1.966949755260846e-05, + "loss": 2.378, "step": 396 }, { - "epoch": 0.12, - "grad_norm": 24.826866149902344, - "learning_rate": 1.9206174200661524e-05, - "loss": 4.3274, + "epoch": 0.05, + "grad_norm": 23.343488693237305, + "learning_rate": 1.9668660837551773e-05, + "loss": 5.2585, "step": 397 }, { - "epoch": 0.12, - "grad_norm": 10.156407356262207, - "learning_rate": 1.9204169590057134e-05, - "loss": 3.2431, + "epoch": 0.05, + "grad_norm": 15.054876327514648, + "learning_rate": 1.9667824122495087e-05, + "loss": 4.0137, "step": 398 }, { - "epoch": 0.12, - "grad_norm": 21.510562896728516, - "learning_rate": 1.920216497945274e-05, - "loss": 4.4685, + "epoch": 0.05, + "grad_norm": 27.556150436401367, + "learning_rate": 1.9666987407438397e-05, + "loss": 5.8062, "step": 399 }, { - "epoch": 0.12, - "grad_norm": 25.705717086791992, - "learning_rate": 1.9200160368848354e-05, - "loss": 4.3723, + "epoch": 0.05, + "grad_norm": 20.49494171142578, + "learning_rate": 1.966615069238171e-05, + "loss": 5.7858, "step": 400 }, { - "epoch": 0.12, - "grad_norm": 15.125773429870605, - "learning_rate": 1.9198155758243964e-05, - "loss": 3.9991, + "epoch": 0.05, + "eval_loss": 0.7426895499229431, + "eval_runtime": 268.2339, + "eval_samples_per_second": 13.205, + "eval_steps_per_second": 13.205, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 16.434072494506836, + "learning_rate": 1.9665313977325024e-05, + "loss": 4.5162, "step": 401 }, { - "epoch": 0.12, - "grad_norm": 11.93122386932373, - "learning_rate": 1.919615114763957e-05, - "loss": 3.5661, + "epoch": 0.05, + "grad_norm": 22.502578735351562, + "learning_rate": 1.9664477262268334e-05, + "loss": 4.2813, "step": 402 }, { - "epoch": 0.12, - "grad_norm": 9.818219184875488, - "learning_rate": 1.9194146537035184e-05, - "loss": 3.4324, + "epoch": 0.05, + "grad_norm": 17.459829330444336, + "learning_rate": 1.9663640547211648e-05, + "loss": 2.355, "step": 403 }, { - "epoch": 0.12, - "grad_norm": 23.427623748779297, - "learning_rate": 1.919214192643079e-05, - "loss": 4.2537, + "epoch": 0.05, + "grad_norm": 21.548038482666016, + "learning_rate": 1.966280383215496e-05, + "loss": 6.0367, "step": 404 }, { - "epoch": 0.12, - "grad_norm": 18.164058685302734, - "learning_rate": 1.91901373158264e-05, - "loss": 3.2003, + "epoch": 0.05, + "grad_norm": 16.913127899169922, + "learning_rate": 1.9661967117098272e-05, + "loss": 3.0731, "step": 405 }, { - "epoch": 0.12, - "grad_norm": 20.42031478881836, - "learning_rate": 1.918813270522201e-05, - "loss": 2.4072, + "epoch": 0.05, + "grad_norm": 23.094449996948242, + "learning_rate": 1.9661130402041586e-05, + "loss": 3.1689, "step": 406 }, { - "epoch": 0.12, - "grad_norm": 21.685937881469727, - "learning_rate": 1.918612809461762e-05, - "loss": 3.5641, + "epoch": 0.05, + "grad_norm": 19.218231201171875, + "learning_rate": 1.96602936869849e-05, + "loss": 2.5354, "step": 407 }, { - "epoch": 0.12, - "grad_norm": 19.072772979736328, - "learning_rate": 1.918412348401323e-05, - "loss": 3.8314, + "epoch": 0.05, + "grad_norm": 14.398783683776855, + "learning_rate": 1.9659456971928213e-05, + "loss": 2.5509, "step": 408 }, { - "epoch": 0.12, - "grad_norm": 11.509551048278809, - "learning_rate": 1.918211887340884e-05, - "loss": 2.837, + "epoch": 0.05, + "grad_norm": 28.161535263061523, + "learning_rate": 1.9658620256871523e-05, + "loss": 4.797, "step": 409 }, { - "epoch": 0.12, - "grad_norm": 14.759239196777344, - "learning_rate": 1.918011426280445e-05, - "loss": 3.501, + "epoch": 0.05, + "grad_norm": 17.712581634521484, + "learning_rate": 1.9657783541814837e-05, + "loss": 3.4253, "step": 410 }, { - "epoch": 0.12, - "grad_norm": 25.07742691040039, - "learning_rate": 1.917810965220006e-05, - "loss": 3.0863, + "epoch": 0.05, + "grad_norm": 27.91653823852539, + "learning_rate": 1.965694682675815e-05, + "loss": 4.9134, "step": 411 }, { - "epoch": 0.12, - "grad_norm": 17.365577697753906, - "learning_rate": 1.9176105041595672e-05, - "loss": 3.6191, + "epoch": 0.05, + "grad_norm": 19.58594512939453, + "learning_rate": 1.965611011170146e-05, + "loss": 2.3899, "step": 412 }, { - "epoch": 0.12, - "grad_norm": 15.148959159851074, - "learning_rate": 1.9174100430991282e-05, - "loss": 3.6065, + "epoch": 0.05, + "grad_norm": 37.195430755615234, + "learning_rate": 1.9655273396644774e-05, + "loss": 4.6033, "step": 413 }, { - "epoch": 0.12, - "grad_norm": 22.2659854888916, - "learning_rate": 1.9172095820386892e-05, - "loss": 3.7687, + "epoch": 0.05, + "grad_norm": 21.229907989501953, + "learning_rate": 1.9654436681588088e-05, + "loss": 2.5836, "step": 414 }, { - "epoch": 0.12, - "grad_norm": 10.9700927734375, - "learning_rate": 1.9170091209782502e-05, - "loss": 3.6995, + "epoch": 0.05, + "grad_norm": 13.44748592376709, + "learning_rate": 1.96535999665314e-05, + "loss": 4.605, "step": 415 }, { - "epoch": 0.13, - "grad_norm": 16.873733520507812, - "learning_rate": 1.9168086599178112e-05, - "loss": 2.3494, + "epoch": 0.05, + "grad_norm": 16.857240676879883, + "learning_rate": 1.9652763251474712e-05, + "loss": 2.4381, "step": 416 }, { - "epoch": 0.13, - "grad_norm": 22.604717254638672, - "learning_rate": 1.9166081988573722e-05, - "loss": 3.417, + "epoch": 0.05, + "grad_norm": 17.680007934570312, + "learning_rate": 1.9651926536418026e-05, + "loss": 2.3925, "step": 417 }, { - "epoch": 0.13, - "grad_norm": 16.342870712280273, - "learning_rate": 1.916407737796933e-05, - "loss": 3.7907, + "epoch": 0.05, + "grad_norm": 17.41832733154297, + "learning_rate": 1.965108982136134e-05, + "loss": 5.1594, "step": 418 }, { - "epoch": 0.13, - "grad_norm": 43.08720016479492, - "learning_rate": 1.9162072767364942e-05, - "loss": 2.4359, + "epoch": 0.05, + "grad_norm": 10.49217414855957, + "learning_rate": 1.965025310630465e-05, + "loss": 3.4874, "step": 419 }, { - "epoch": 0.13, - "grad_norm": 18.63433074951172, - "learning_rate": 1.9160068156760552e-05, - "loss": 4.5936, + "epoch": 0.05, + "grad_norm": 18.354557037353516, + "learning_rate": 1.9649416391247963e-05, + "loss": 3.7841, "step": 420 }, { - "epoch": 0.13, - "grad_norm": 20.13007164001465, - "learning_rate": 1.915806354615616e-05, - "loss": 4.5972, + "epoch": 0.05, + "grad_norm": 14.493927955627441, + "learning_rate": 1.9648579676191273e-05, + "loss": 2.6667, "step": 421 }, { - "epoch": 0.13, - "grad_norm": 17.850637435913086, - "learning_rate": 1.9156058935551773e-05, - "loss": 3.1744, + "epoch": 0.05, + "grad_norm": 12.498456954956055, + "learning_rate": 1.9647742961134587e-05, + "loss": 4.7955, "step": 422 }, { - "epoch": 0.13, - "grad_norm": 15.922115325927734, - "learning_rate": 1.915405432494738e-05, - "loss": 3.862, + "epoch": 0.05, + "grad_norm": 29.352039337158203, + "learning_rate": 1.96469062460779e-05, + "loss": 2.5031, "step": 423 }, { - "epoch": 0.13, - "grad_norm": 20.92375946044922, - "learning_rate": 1.915204971434299e-05, - "loss": 2.907, + "epoch": 0.05, + "grad_norm": 30.123577117919922, + "learning_rate": 1.964606953102121e-05, + "loss": 3.5351, "step": 424 }, { - "epoch": 0.13, - "grad_norm": 18.767166137695312, - "learning_rate": 1.91500451037386e-05, - "loss": 4.3281, + "epoch": 0.05, + "grad_norm": 13.854634284973145, + "learning_rate": 1.9645232815964525e-05, + "loss": 4.9223, "step": 425 }, { - "epoch": 0.13, - "grad_norm": 16.2836856842041, - "learning_rate": 1.914804049313421e-05, - "loss": 3.5196, + "epoch": 0.05, + "grad_norm": 13.630191802978516, + "learning_rate": 1.9644396100907838e-05, + "loss": 4.2064, "step": 426 }, { - "epoch": 0.13, - "grad_norm": 13.612652778625488, - "learning_rate": 1.914603588252982e-05, - "loss": 4.0671, + "epoch": 0.05, + "grad_norm": 17.01755714416504, + "learning_rate": 1.964355938585115e-05, + "loss": 2.1371, "step": 427 }, { - "epoch": 0.13, - "grad_norm": 20.497167587280273, - "learning_rate": 1.914403127192543e-05, - "loss": 4.5793, + "epoch": 0.05, + "grad_norm": 16.535648345947266, + "learning_rate": 1.9642722670794462e-05, + "loss": 3.343, "step": 428 }, { - "epoch": 0.13, - "grad_norm": 16.230653762817383, - "learning_rate": 1.914202666132104e-05, - "loss": 4.3463, + "epoch": 0.05, + "grad_norm": 17.35195541381836, + "learning_rate": 1.9641885955737776e-05, + "loss": 3.838, "step": 429 }, { - "epoch": 0.13, - "grad_norm": 17.90238380432129, - "learning_rate": 1.914002205071665e-05, - "loss": 3.6869, + "epoch": 0.05, + "grad_norm": 12.293807983398438, + "learning_rate": 1.9641049240681086e-05, + "loss": 3.1724, "step": 430 }, { - "epoch": 0.13, - "grad_norm": 23.716754913330078, - "learning_rate": 1.913801744011226e-05, - "loss": 3.8698, + "epoch": 0.05, + "grad_norm": 66.72679138183594, + "learning_rate": 1.96402125256244e-05, + "loss": 4.5378, "step": 431 }, { - "epoch": 0.13, - "grad_norm": 17.085433959960938, - "learning_rate": 1.913601282950787e-05, - "loss": 2.4927, + "epoch": 0.05, + "grad_norm": 24.131763458251953, + "learning_rate": 1.9639375810567713e-05, + "loss": 3.6992, "step": 432 }, { - "epoch": 0.13, - "grad_norm": 19.192747116088867, - "learning_rate": 1.913400821890348e-05, - "loss": 3.9132, + "epoch": 0.05, + "grad_norm": 17.67955780029297, + "learning_rate": 1.9638539095511024e-05, + "loss": 2.1648, "step": 433 }, { - "epoch": 0.13, - "grad_norm": 13.243489265441895, - "learning_rate": 1.913200360829909e-05, - "loss": 2.9527, + "epoch": 0.05, + "grad_norm": 14.534732818603516, + "learning_rate": 1.9637702380454337e-05, + "loss": 3.1824, "step": 434 }, { - "epoch": 0.13, - "grad_norm": 15.914332389831543, - "learning_rate": 1.91299989976947e-05, - "loss": 3.9814, + "epoch": 0.05, + "grad_norm": 24.638200759887695, + "learning_rate": 1.963686566539765e-05, + "loss": 3.8397, "step": 435 }, { - "epoch": 0.13, - "grad_norm": 16.82046127319336, - "learning_rate": 1.912799438709031e-05, - "loss": 3.2976, + "epoch": 0.05, + "grad_norm": 11.531396865844727, + "learning_rate": 1.9636028950340965e-05, + "loss": 3.1445, "step": 436 }, { - "epoch": 0.13, - "grad_norm": 30.610519409179688, - "learning_rate": 1.9125989776485917e-05, - "loss": 2.8746, + "epoch": 0.05, + "grad_norm": 31.642684936523438, + "learning_rate": 1.9635192235284275e-05, + "loss": 4.803, "step": 437 }, { - "epoch": 0.13, - "grad_norm": 14.973118782043457, - "learning_rate": 1.912398516588153e-05, - "loss": 3.4231, + "epoch": 0.05, + "grad_norm": 12.691168785095215, + "learning_rate": 1.963435552022759e-05, + "loss": 3.1471, "step": 438 }, { - "epoch": 0.13, - "grad_norm": 17.690595626831055, - "learning_rate": 1.912198055527714e-05, - "loss": 2.8452, + "epoch": 0.06, + "grad_norm": 17.246782302856445, + "learning_rate": 1.9633518805170902e-05, + "loss": 3.2402, "step": 439 }, { - "epoch": 0.13, - "grad_norm": 13.494382858276367, - "learning_rate": 1.9119975944672747e-05, - "loss": 3.4656, + "epoch": 0.06, + "grad_norm": 22.924819946289062, + "learning_rate": 1.9632682090114212e-05, + "loss": 3.015, "step": 440 }, { - "epoch": 0.13, - "grad_norm": 15.88420581817627, - "learning_rate": 1.911797133406836e-05, - "loss": 3.4494, + "epoch": 0.06, + "grad_norm": 13.039619445800781, + "learning_rate": 1.9631845375057526e-05, + "loss": 3.6388, "step": 441 }, { - "epoch": 0.13, - "grad_norm": 15.013689041137695, - "learning_rate": 1.9115966723463967e-05, - "loss": 3.7962, + "epoch": 0.06, + "grad_norm": 15.010822296142578, + "learning_rate": 1.963100866000084e-05, + "loss": 6.2867, "step": 442 }, { - "epoch": 0.13, - "grad_norm": 19.49163246154785, - "learning_rate": 1.9113962112859578e-05, - "loss": 3.5861, + "epoch": 0.06, + "grad_norm": 14.180012702941895, + "learning_rate": 1.9630171944944153e-05, + "loss": 5.0498, "step": 443 }, { - "epoch": 0.13, - "grad_norm": 14.526626586914062, - "learning_rate": 1.911195750225519e-05, - "loss": 4.3179, + "epoch": 0.06, + "grad_norm": 14.439057350158691, + "learning_rate": 1.9629335229887464e-05, + "loss": 5.7437, "step": 444 }, { - "epoch": 0.13, - "grad_norm": 14.432934761047363, - "learning_rate": 1.9109952891650798e-05, - "loss": 3.6713, + "epoch": 0.06, + "grad_norm": 25.79570198059082, + "learning_rate": 1.9628498514830777e-05, + "loss": 3.5513, "step": 445 }, { - "epoch": 0.13, - "grad_norm": 17.720027923583984, - "learning_rate": 1.9107948281046408e-05, - "loss": 2.9935, + "epoch": 0.06, + "grad_norm": 13.489237785339355, + "learning_rate": 1.962766179977409e-05, + "loss": 4.9429, "step": 446 }, { - "epoch": 0.13, - "grad_norm": 30.397794723510742, - "learning_rate": 1.9105943670442018e-05, - "loss": 3.4623, + "epoch": 0.06, + "grad_norm": 18.599224090576172, + "learning_rate": 1.96268250847174e-05, + "loss": 3.3341, "step": 447 }, { - "epoch": 0.13, - "grad_norm": 14.584136009216309, - "learning_rate": 1.9103939059837628e-05, - "loss": 3.4608, + "epoch": 0.06, + "grad_norm": 11.782310485839844, + "learning_rate": 1.9625988369660715e-05, + "loss": 2.9087, "step": 448 }, { - "epoch": 0.13, - "grad_norm": 10.301027297973633, - "learning_rate": 1.9101934449233238e-05, - "loss": 3.1884, + "epoch": 0.06, + "grad_norm": 12.502921104431152, + "learning_rate": 1.9625151654604025e-05, + "loss": 2.8944, "step": 449 }, { - "epoch": 0.14, - "grad_norm": 17.015613555908203, - "learning_rate": 1.9099929838628848e-05, - "loss": 3.9679, + "epoch": 0.06, + "grad_norm": 14.942875862121582, + "learning_rate": 1.962431493954734e-05, + "loss": 3.2534, "step": 450 }, { - "epoch": 0.14, - "grad_norm": 13.941701889038086, - "learning_rate": 1.9097925228024458e-05, - "loss": 3.3073, + "epoch": 0.06, + "grad_norm": 12.024401664733887, + "learning_rate": 1.9623478224490652e-05, + "loss": 5.8004, "step": 451 }, { - "epoch": 0.14, - "grad_norm": 16.194982528686523, - "learning_rate": 1.909592061742007e-05, - "loss": 3.6541, + "epoch": 0.06, + "grad_norm": 14.10036563873291, + "learning_rate": 1.9622641509433963e-05, + "loss": 3.5902, "step": 452 }, { - "epoch": 0.14, - "grad_norm": 19.734464645385742, - "learning_rate": 1.909391600681568e-05, - "loss": 4.4298, + "epoch": 0.06, + "grad_norm": 21.69317054748535, + "learning_rate": 1.9621804794377276e-05, + "loss": 6.0422, "step": 453 }, { - "epoch": 0.14, - "grad_norm": 12.222415924072266, - "learning_rate": 1.9091911396211285e-05, - "loss": 2.5833, + "epoch": 0.06, + "grad_norm": 18.428606033325195, + "learning_rate": 1.9620968079320587e-05, + "loss": 3.6164, "step": 454 }, { - "epoch": 0.14, - "grad_norm": 21.09163475036621, - "learning_rate": 1.90899067856069e-05, - "loss": 3.8562, + "epoch": 0.06, + "grad_norm": 15.561418533325195, + "learning_rate": 1.96201313642639e-05, + "loss": 3.0053, "step": 455 }, { - "epoch": 0.14, - "grad_norm": 14.021400451660156, - "learning_rate": 1.908790217500251e-05, - "loss": 3.9132, + "epoch": 0.06, + "grad_norm": 16.941303253173828, + "learning_rate": 1.9619294649207214e-05, + "loss": 3.5116, "step": 456 }, { - "epoch": 0.14, - "grad_norm": 19.492759704589844, - "learning_rate": 1.9085897564398115e-05, - "loss": 4.7376, + "epoch": 0.06, + "grad_norm": 16.592493057250977, + "learning_rate": 1.9618457934150527e-05, + "loss": 4.3639, "step": 457 }, { - "epoch": 0.14, - "grad_norm": 13.180451393127441, - "learning_rate": 1.908389295379373e-05, - "loss": 2.8013, + "epoch": 0.06, + "grad_norm": 16.290321350097656, + "learning_rate": 1.9617621219093838e-05, + "loss": 1.1296, "step": 458 }, { - "epoch": 0.14, - "grad_norm": 19.046419143676758, - "learning_rate": 1.9081888343189336e-05, - "loss": 3.2353, + "epoch": 0.06, + "grad_norm": 36.56865692138672, + "learning_rate": 1.961678450403715e-05, + "loss": 4.6438, "step": 459 }, { - "epoch": 0.14, - "grad_norm": 24.7363338470459, - "learning_rate": 1.9079883732584946e-05, - "loss": 4.2447, + "epoch": 0.06, + "grad_norm": 20.538652420043945, + "learning_rate": 1.9615947788980465e-05, + "loss": 3.0496, "step": 460 }, { - "epoch": 0.14, - "grad_norm": 12.855481147766113, - "learning_rate": 1.9077879121980556e-05, - "loss": 3.3459, + "epoch": 0.06, + "grad_norm": 14.338871955871582, + "learning_rate": 1.9615111073923775e-05, + "loss": 4.9444, "step": 461 }, { - "epoch": 0.14, - "grad_norm": 13.02658462524414, - "learning_rate": 1.9075874511376166e-05, - "loss": 4.0793, + "epoch": 0.06, + "grad_norm": 20.616947174072266, + "learning_rate": 1.961427435886709e-05, + "loss": 2.8608, "step": 462 }, { - "epoch": 0.14, - "grad_norm": 17.221975326538086, - "learning_rate": 1.907386990077178e-05, - "loss": 3.3399, + "epoch": 0.06, + "grad_norm": 22.124589920043945, + "learning_rate": 1.9613437643810403e-05, + "loss": 2.2952, "step": 463 }, { - "epoch": 0.14, - "grad_norm": 22.799137115478516, - "learning_rate": 1.9071865290167386e-05, - "loss": 3.7892, + "epoch": 0.06, + "grad_norm": 16.3970947265625, + "learning_rate": 1.9612600928753716e-05, + "loss": 4.7596, "step": 464 }, { - "epoch": 0.14, - "grad_norm": 21.327260971069336, - "learning_rate": 1.9069860679562996e-05, - "loss": 2.5584, + "epoch": 0.06, + "grad_norm": 15.964753150939941, + "learning_rate": 1.9611764213697026e-05, + "loss": 3.176, "step": 465 }, { - "epoch": 0.14, - "grad_norm": 26.7298583984375, - "learning_rate": 1.9067856068958606e-05, - "loss": 4.5134, + "epoch": 0.06, + "grad_norm": 16.239675521850586, + "learning_rate": 1.961092749864034e-05, + "loss": 4.4223, "step": 466 }, { - "epoch": 0.14, - "grad_norm": 25.994600296020508, - "learning_rate": 1.9065851458354216e-05, - "loss": 2.8459, + "epoch": 0.06, + "grad_norm": 17.314661026000977, + "learning_rate": 1.9610090783583654e-05, + "loss": 3.0885, "step": 467 }, { - "epoch": 0.14, - "grad_norm": 22.67238998413086, - "learning_rate": 1.9063846847749826e-05, - "loss": 4.0606, + "epoch": 0.06, + "grad_norm": 9.08109188079834, + "learning_rate": 1.9609254068526964e-05, + "loss": 3.6177, "step": 468 }, { - "epoch": 0.14, - "grad_norm": 16.200130462646484, - "learning_rate": 1.9061842237145436e-05, - "loss": 4.0715, + "epoch": 0.06, + "grad_norm": 13.201598167419434, + "learning_rate": 1.9608417353470278e-05, + "loss": 5.4196, "step": 469 }, { - "epoch": 0.14, - "grad_norm": 12.668680191040039, - "learning_rate": 1.9059837626541046e-05, - "loss": 2.9427, + "epoch": 0.06, + "grad_norm": 12.478797912597656, + "learning_rate": 1.960758063841359e-05, + "loss": 2.9956, "step": 470 }, { - "epoch": 0.14, - "grad_norm": 24.67599868774414, - "learning_rate": 1.9057833015936657e-05, - "loss": 3.3925, + "epoch": 0.06, + "grad_norm": 16.85390281677246, + "learning_rate": 1.9606743923356905e-05, + "loss": 3.9526, "step": 471 }, { - "epoch": 0.14, - "grad_norm": 26.510074615478516, - "learning_rate": 1.9055828405332267e-05, - "loss": 2.774, + "epoch": 0.06, + "grad_norm": 14.20464038848877, + "learning_rate": 1.9605907208300215e-05, + "loss": 4.3399, "step": 472 }, { - "epoch": 0.14, - "grad_norm": 19.766721725463867, - "learning_rate": 1.9053823794727873e-05, - "loss": 3.1132, + "epoch": 0.06, + "grad_norm": 20.508764266967773, + "learning_rate": 1.960507049324353e-05, + "loss": 4.9114, "step": 473 }, { - "epoch": 0.14, - "grad_norm": 14.419114112854004, - "learning_rate": 1.9051819184123487e-05, - "loss": 2.8139, + "epoch": 0.06, + "grad_norm": 20.2303409576416, + "learning_rate": 1.960423377818684e-05, + "loss": 3.7521, "step": 474 }, { - "epoch": 0.14, - "grad_norm": 11.945268630981445, - "learning_rate": 1.9049814573519097e-05, - "loss": 3.4746, + "epoch": 0.06, + "grad_norm": 14.391060829162598, + "learning_rate": 1.9603397063130153e-05, + "loss": 5.0591, "step": 475 }, { - "epoch": 0.14, - "grad_norm": 19.036060333251953, - "learning_rate": 1.9047809962914704e-05, - "loss": 4.0888, + "epoch": 0.06, + "grad_norm": 23.435667037963867, + "learning_rate": 1.9602560348073466e-05, + "loss": 3.3647, "step": 476 }, { - "epoch": 0.14, - "grad_norm": 18.684444427490234, - "learning_rate": 1.9045805352310317e-05, - "loss": 3.8154, + "epoch": 0.06, + "grad_norm": 11.666590690612793, + "learning_rate": 1.9601723633016777e-05, + "loss": 5.2999, "step": 477 }, { - "epoch": 0.14, - "grad_norm": 10.9110689163208, - "learning_rate": 1.9043800741705924e-05, - "loss": 3.7957, + "epoch": 0.06, + "grad_norm": 14.70217514038086, + "learning_rate": 1.960088691796009e-05, + "loss": 3.5773, "step": 478 }, { - "epoch": 0.14, - "grad_norm": 22.50055503845215, - "learning_rate": 1.9041796131101534e-05, - "loss": 4.0542, + "epoch": 0.06, + "grad_norm": 10.582998275756836, + "learning_rate": 1.9600050202903404e-05, + "loss": 3.3664, "step": 479 }, { - "epoch": 0.14, - "grad_norm": 14.225284576416016, - "learning_rate": 1.9039791520497144e-05, - "loss": 2.9864, - "step": 480 - }, - { - "epoch": 0.14, - "eval_loss": 0.9622519612312317, - "eval_runtime": 43.4567, - "eval_samples_per_second": 34.034, - "eval_steps_per_second": 34.034, + "epoch": 0.06, + "grad_norm": 15.844144821166992, + "learning_rate": 1.9599213487846714e-05, + "loss": 1.8579, "step": 480 }, { - "epoch": 0.14, - "grad_norm": 14.51484489440918, - "learning_rate": 1.9037786909892754e-05, - "loss": 3.9772, + "epoch": 0.06, + "grad_norm": 11.509811401367188, + "learning_rate": 1.9598376772790028e-05, + "loss": 1.7489, "step": 481 }, { - "epoch": 0.14, - "grad_norm": 51.21427536010742, - "learning_rate": 1.9035782299288364e-05, - "loss": 3.8034, + "epoch": 0.06, + "grad_norm": 14.378329277038574, + "learning_rate": 1.9597540057733338e-05, + "loss": 3.6322, "step": 482 }, { - "epoch": 0.15, - "grad_norm": 26.110877990722656, - "learning_rate": 1.9033777688683974e-05, - "loss": 3.6146, + "epoch": 0.06, + "grad_norm": 43.20583724975586, + "learning_rate": 1.9596703342676652e-05, + "loss": 5.0654, "step": 483 }, { - "epoch": 0.15, - "grad_norm": 16.056385040283203, - "learning_rate": 1.9031773078079584e-05, - "loss": 3.8006, + "epoch": 0.06, + "grad_norm": 36.17166519165039, + "learning_rate": 1.9595866627619965e-05, + "loss": 3.6418, "step": 484 }, { - "epoch": 0.15, - "grad_norm": 15.42993450164795, - "learning_rate": 1.9029768467475194e-05, - "loss": 3.0251, + "epoch": 0.06, + "grad_norm": 14.509894371032715, + "learning_rate": 1.959502991256328e-05, + "loss": 4.5501, "step": 485 }, { - "epoch": 0.15, - "grad_norm": 12.17597484588623, - "learning_rate": 1.9027763856870804e-05, - "loss": 3.4756, + "epoch": 0.06, + "grad_norm": 13.088136672973633, + "learning_rate": 1.959419319750659e-05, + "loss": 5.2244, "step": 486 }, { - "epoch": 0.15, - "grad_norm": 11.121559143066406, - "learning_rate": 1.9025759246266415e-05, - "loss": 2.7291, + "epoch": 0.06, + "grad_norm": 27.84554100036621, + "learning_rate": 1.9593356482449903e-05, + "loss": 3.7659, "step": 487 }, { - "epoch": 0.15, - "grad_norm": 17.4909725189209, - "learning_rate": 1.9023754635662025e-05, - "loss": 3.6708, + "epoch": 0.06, + "grad_norm": 16.524335861206055, + "learning_rate": 1.9592519767393217e-05, + "loss": 5.8718, "step": 488 }, { - "epoch": 0.15, - "grad_norm": 23.629941940307617, - "learning_rate": 1.9021750025057635e-05, - "loss": 3.0275, + "epoch": 0.06, + "grad_norm": 12.565080642700195, + "learning_rate": 1.9591683052336527e-05, + "loss": 3.1432, "step": 489 }, { - "epoch": 0.15, - "grad_norm": 12.820405960083008, - "learning_rate": 1.9019745414453245e-05, - "loss": 3.0267, + "epoch": 0.06, + "grad_norm": 26.694808959960938, + "learning_rate": 1.959084633727984e-05, + "loss": 3.5153, "step": 490 }, { - "epoch": 0.15, - "grad_norm": 22.951032638549805, - "learning_rate": 1.9017740803848855e-05, - "loss": 2.393, + "epoch": 0.06, + "grad_norm": 19.632619857788086, + "learning_rate": 1.9590009622223154e-05, + "loss": 5.4893, "step": 491 }, { - "epoch": 0.15, - "grad_norm": 11.71376895904541, - "learning_rate": 1.901573619324446e-05, - "loss": 2.9705, + "epoch": 0.06, + "grad_norm": 13.182287216186523, + "learning_rate": 1.9589172907166468e-05, + "loss": 2.4552, "step": 492 }, { - "epoch": 0.15, - "grad_norm": 14.28131103515625, - "learning_rate": 1.9013731582640075e-05, - "loss": 2.727, + "epoch": 0.06, + "grad_norm": 25.17426872253418, + "learning_rate": 1.9588336192109778e-05, + "loss": 3.209, "step": 493 }, { - "epoch": 0.15, - "grad_norm": 16.59454345703125, - "learning_rate": 1.9011726972035685e-05, - "loss": 3.5118, + "epoch": 0.06, + "grad_norm": 17.61745834350586, + "learning_rate": 1.9587499477053092e-05, + "loss": 4.6118, "step": 494 }, { - "epoch": 0.15, - "grad_norm": 20.26374053955078, - "learning_rate": 1.9009722361431292e-05, - "loss": 4.6275, + "epoch": 0.06, + "grad_norm": 12.008716583251953, + "learning_rate": 1.9586662761996405e-05, + "loss": 3.6998, "step": 495 }, { - "epoch": 0.15, - "grad_norm": 13.29510498046875, - "learning_rate": 1.9007717750826905e-05, - "loss": 3.8611, + "epoch": 0.06, + "grad_norm": 13.607529640197754, + "learning_rate": 1.9585826046939716e-05, + "loss": 2.8954, "step": 496 }, { - "epoch": 0.15, - "grad_norm": 14.79573917388916, - "learning_rate": 1.9005713140222512e-05, - "loss": 4.632, + "epoch": 0.06, + "grad_norm": 13.992676734924316, + "learning_rate": 1.958498933188303e-05, + "loss": 1.3716, "step": 497 }, { - "epoch": 0.15, - "grad_norm": 19.15884017944336, - "learning_rate": 1.9003708529618122e-05, - "loss": 3.5274, + "epoch": 0.06, + "grad_norm": 15.480781555175781, + "learning_rate": 1.9584152616826343e-05, + "loss": 3.2069, "step": 498 }, { - "epoch": 0.15, - "grad_norm": 11.399006843566895, - "learning_rate": 1.9001703919013736e-05, - "loss": 4.0488, + "epoch": 0.06, + "grad_norm": 16.785871505737305, + "learning_rate": 1.9583315901769657e-05, + "loss": 4.2125, "step": 499 }, { - "epoch": 0.15, - "grad_norm": 12.227916717529297, - "learning_rate": 1.8999699308409342e-05, - "loss": 3.8005, + "epoch": 0.06, + "grad_norm": 43.2510871887207, + "learning_rate": 1.9582479186712967e-05, + "loss": 2.4049, "step": 500 }, { - "epoch": 0.15, - "grad_norm": 15.450165748596191, - "learning_rate": 1.8997694697804952e-05, - "loss": 2.9546, + "epoch": 0.06, + "grad_norm": 20.25304412841797, + "learning_rate": 1.958164247165628e-05, + "loss": 1.7508, "step": 501 }, { - "epoch": 0.15, - "grad_norm": 19.479894638061523, - "learning_rate": 1.8995690087200562e-05, - "loss": 3.056, + "epoch": 0.06, + "grad_norm": 10.431807518005371, + "learning_rate": 1.958080575659959e-05, + "loss": 2.7182, "step": 502 }, { - "epoch": 0.15, - "grad_norm": 18.609848022460938, - "learning_rate": 1.8993685476596172e-05, - "loss": 3.8573, + "epoch": 0.06, + "grad_norm": 14.986677169799805, + "learning_rate": 1.9579969041542904e-05, + "loss": 3.2185, "step": 503 }, { - "epoch": 0.15, - "grad_norm": 19.30628204345703, - "learning_rate": 1.8991680865991783e-05, - "loss": 2.9196, + "epoch": 0.06, + "grad_norm": 17.70635986328125, + "learning_rate": 1.9579132326486218e-05, + "loss": 4.4745, "step": 504 }, { - "epoch": 0.15, - "grad_norm": 15.707794189453125, - "learning_rate": 1.8989676255387393e-05, - "loss": 3.6089, + "epoch": 0.06, + "grad_norm": 13.41077995300293, + "learning_rate": 1.957829561142953e-05, + "loss": 2.5006, "step": 505 }, { - "epoch": 0.15, - "grad_norm": 18.78173065185547, - "learning_rate": 1.8987671644783003e-05, - "loss": 2.8028, + "epoch": 0.06, + "grad_norm": 20.12737464904785, + "learning_rate": 1.9577458896372842e-05, + "loss": 3.9931, "step": 506 }, { - "epoch": 0.15, - "grad_norm": 17.811344146728516, - "learning_rate": 1.8985667034178613e-05, - "loss": 3.645, + "epoch": 0.06, + "grad_norm": 21.820350646972656, + "learning_rate": 1.9576622181316152e-05, + "loss": 2.7736, "step": 507 }, { - "epoch": 0.15, - "grad_norm": 15.66889476776123, - "learning_rate": 1.8983662423574223e-05, - "loss": 3.5172, + "epoch": 0.06, + "grad_norm": 18.02536964416504, + "learning_rate": 1.9575785466259466e-05, + "loss": 2.1139, "step": 508 }, { - "epoch": 0.15, - "grad_norm": 13.496955871582031, - "learning_rate": 1.8981657812969833e-05, - "loss": 3.2022, + "epoch": 0.06, + "grad_norm": 14.28238582611084, + "learning_rate": 1.957494875120278e-05, + "loss": 4.572, "step": 509 }, { - "epoch": 0.15, - "grad_norm": 14.416739463806152, - "learning_rate": 1.8979653202365443e-05, - "loss": 3.7212, + "epoch": 0.06, + "grad_norm": 17.39335060119629, + "learning_rate": 1.957411203614609e-05, + "loss": 4.1144, "step": 510 }, { - "epoch": 0.15, - "grad_norm": 13.47453784942627, - "learning_rate": 1.897764859176105e-05, - "loss": 4.1187, + "epoch": 0.06, + "grad_norm": 16.540075302124023, + "learning_rate": 1.9573275321089403e-05, + "loss": 3.5938, "step": 511 }, { - "epoch": 0.15, - "grad_norm": 18.787689208984375, - "learning_rate": 1.8975643981156663e-05, - "loss": 3.6262, + "epoch": 0.06, + "grad_norm": 12.823972702026367, + "learning_rate": 1.9572438606032717e-05, + "loss": 2.467, "step": 512 }, { - "epoch": 0.15, - "grad_norm": 10.850929260253906, - "learning_rate": 1.8973639370552273e-05, - "loss": 2.937, + "epoch": 0.06, + "grad_norm": 16.05743408203125, + "learning_rate": 1.957160189097603e-05, + "loss": 4.8134, "step": 513 }, { - "epoch": 0.15, - "grad_norm": 14.07578182220459, - "learning_rate": 1.897163475994788e-05, - "loss": 4.5924, + "epoch": 0.06, + "grad_norm": 16.572154998779297, + "learning_rate": 1.957076517591934e-05, + "loss": 5.0406, "step": 514 }, { - "epoch": 0.15, - "grad_norm": 13.420841217041016, - "learning_rate": 1.8969630149343493e-05, - "loss": 2.5988, + "epoch": 0.06, + "grad_norm": 10.041598320007324, + "learning_rate": 1.9569928460862655e-05, + "loss": 3.0591, "step": 515 }, { - "epoch": 0.16, - "grad_norm": 16.54571533203125, - "learning_rate": 1.89676255387391e-05, - "loss": 3.835, + "epoch": 0.06, + "grad_norm": 16.29025650024414, + "learning_rate": 1.956909174580597e-05, + "loss": 2.9466, "step": 516 }, { - "epoch": 0.16, - "grad_norm": 68.53701782226562, - "learning_rate": 1.896562092813471e-05, - "loss": 5.1961, + "epoch": 0.06, + "grad_norm": 16.037534713745117, + "learning_rate": 1.956825503074928e-05, + "loss": 4.3558, "step": 517 }, { - "epoch": 0.16, - "grad_norm": 9.695450782775879, - "learning_rate": 1.8963616317530324e-05, - "loss": 3.4607, + "epoch": 0.07, + "grad_norm": 25.85873031616211, + "learning_rate": 1.9567418315692592e-05, + "loss": 2.2824, "step": 518 }, { - "epoch": 0.16, - "grad_norm": 15.207024574279785, - "learning_rate": 1.896161170692593e-05, - "loss": 3.3355, + "epoch": 0.07, + "grad_norm": 30.12692642211914, + "learning_rate": 1.9566581600635906e-05, + "loss": 2.4197, "step": 519 }, { - "epoch": 0.16, - "grad_norm": 14.207443237304688, - "learning_rate": 1.895960709632154e-05, - "loss": 3.5345, + "epoch": 0.07, + "grad_norm": 15.433709144592285, + "learning_rate": 1.956574488557922e-05, + "loss": 3.2912, "step": 520 }, { - "epoch": 0.16, - "grad_norm": 18.703203201293945, - "learning_rate": 1.895760248571715e-05, - "loss": 4.1013, + "epoch": 0.07, + "grad_norm": 13.080670356750488, + "learning_rate": 1.956490817052253e-05, + "loss": 5.63, "step": 521 }, { - "epoch": 0.16, - "grad_norm": 22.39604949951172, - "learning_rate": 1.895559787511276e-05, - "loss": 4.8708, + "epoch": 0.07, + "grad_norm": 16.858592987060547, + "learning_rate": 1.9564071455465843e-05, + "loss": 3.2096, "step": 522 }, { - "epoch": 0.16, - "grad_norm": 19.35679817199707, - "learning_rate": 1.895359326450837e-05, - "loss": 2.9551, + "epoch": 0.07, + "grad_norm": 7.967859268188477, + "learning_rate": 1.9563234740409157e-05, + "loss": 6.2087, "step": 523 }, { - "epoch": 0.16, - "grad_norm": 13.426315307617188, - "learning_rate": 1.895158865390398e-05, - "loss": 2.9543, + "epoch": 0.07, + "grad_norm": 9.962607383728027, + "learning_rate": 1.9562398025352467e-05, + "loss": 4.5942, "step": 524 }, { - "epoch": 0.16, - "grad_norm": 18.244346618652344, - "learning_rate": 1.894958404329959e-05, - "loss": 3.8422, + "epoch": 0.07, + "grad_norm": 24.848005294799805, + "learning_rate": 1.956156131029578e-05, + "loss": 4.6929, "step": 525 }, { - "epoch": 0.16, - "grad_norm": 16.004077911376953, - "learning_rate": 1.89475794326952e-05, - "loss": 2.7931, + "epoch": 0.07, + "grad_norm": 15.170857429504395, + "learning_rate": 1.9560724595239095e-05, + "loss": 3.1667, "step": 526 }, { - "epoch": 0.16, - "grad_norm": 23.495067596435547, - "learning_rate": 1.894557482209081e-05, - "loss": 4.6575, + "epoch": 0.07, + "grad_norm": 15.806061744689941, + "learning_rate": 1.9559887880182405e-05, + "loss": 4.0896, "step": 527 }, { - "epoch": 0.16, - "grad_norm": 16.585546493530273, - "learning_rate": 1.894357021148642e-05, - "loss": 3.2343, + "epoch": 0.07, + "grad_norm": 14.205904006958008, + "learning_rate": 1.955905116512572e-05, + "loss": 3.6082, "step": 528 }, { - "epoch": 0.16, - "grad_norm": 25.565746307373047, - "learning_rate": 1.894156560088203e-05, - "loss": 3.6168, + "epoch": 0.07, + "grad_norm": 16.476051330566406, + "learning_rate": 1.9558214450069032e-05, + "loss": 4.0229, "step": 529 }, { - "epoch": 0.16, - "grad_norm": 17.877038955688477, - "learning_rate": 1.893956099027764e-05, - "loss": 4.1159, + "epoch": 0.07, + "grad_norm": 18.11623764038086, + "learning_rate": 1.9557377735012342e-05, + "loss": 3.9217, "step": 530 }, { - "epoch": 0.16, - "grad_norm": 25.356380462646484, - "learning_rate": 1.893755637967325e-05, - "loss": 4.1038, + "epoch": 0.07, + "grad_norm": 13.680058479309082, + "learning_rate": 1.9556541019955656e-05, + "loss": 3.6137, "step": 531 }, { - "epoch": 0.16, - "grad_norm": 28.306737899780273, - "learning_rate": 1.893555176906886e-05, - "loss": 3.7835, + "epoch": 0.07, + "grad_norm": 11.90891170501709, + "learning_rate": 1.9555704304898966e-05, + "loss": 4.1743, "step": 532 }, { - "epoch": 0.16, - "grad_norm": 16.962968826293945, - "learning_rate": 1.8933547158464468e-05, - "loss": 2.9008, + "epoch": 0.07, + "grad_norm": 25.42872428894043, + "learning_rate": 1.955486758984228e-05, + "loss": 5.0914, "step": 533 }, { - "epoch": 0.16, - "grad_norm": 20.51753044128418, - "learning_rate": 1.8931542547860082e-05, - "loss": 4.6783, + "epoch": 0.07, + "grad_norm": 12.091551780700684, + "learning_rate": 1.9554030874785594e-05, + "loss": 4.2007, "step": 534 }, { - "epoch": 0.16, - "grad_norm": 17.836177825927734, - "learning_rate": 1.892953793725569e-05, - "loss": 2.8979, + "epoch": 0.07, + "grad_norm": 11.26998233795166, + "learning_rate": 1.9553194159728904e-05, + "loss": 3.4855, "step": 535 }, { - "epoch": 0.16, - "grad_norm": 14.355405807495117, - "learning_rate": 1.89275333266513e-05, - "loss": 3.6143, + "epoch": 0.07, + "grad_norm": 11.434247016906738, + "learning_rate": 1.9552357444672218e-05, + "loss": 4.8023, "step": 536 }, { - "epoch": 0.16, - "grad_norm": 17.681434631347656, - "learning_rate": 1.8925528716046912e-05, - "loss": 2.4729, + "epoch": 0.07, + "grad_norm": 14.716187477111816, + "learning_rate": 1.955152072961553e-05, + "loss": 1.4756, "step": 537 }, { - "epoch": 0.16, - "grad_norm": 16.02050018310547, - "learning_rate": 1.892352410544252e-05, - "loss": 2.3059, + "epoch": 0.07, + "grad_norm": 11.850278854370117, + "learning_rate": 1.955068401455884e-05, + "loss": 5.4931, "step": 538 }, { - "epoch": 0.16, - "grad_norm": 12.686012268066406, - "learning_rate": 1.892151949483813e-05, - "loss": 3.307, + "epoch": 0.07, + "grad_norm": 13.068528175354004, + "learning_rate": 1.9549847299502155e-05, + "loss": 4.0506, "step": 539 }, { - "epoch": 0.16, - "grad_norm": 13.235198974609375, - "learning_rate": 1.891951488423374e-05, - "loss": 4.249, + "epoch": 0.07, + "grad_norm": 13.046282768249512, + "learning_rate": 1.954901058444547e-05, + "loss": 3.9209, "step": 540 }, { - "epoch": 0.16, - "grad_norm": 16.329849243164062, - "learning_rate": 1.891751027362935e-05, - "loss": 3.9762, + "epoch": 0.07, + "grad_norm": 18.51168441772461, + "learning_rate": 1.954817386938878e-05, + "loss": 2.3712, "step": 541 }, { - "epoch": 0.16, - "grad_norm": 18.02645492553711, - "learning_rate": 1.891550566302496e-05, - "loss": 3.3566, + "epoch": 0.07, + "grad_norm": 12.119020462036133, + "learning_rate": 1.9547337154332093e-05, + "loss": 1.7651, "step": 542 }, { - "epoch": 0.16, - "grad_norm": 15.097955703735352, - "learning_rate": 1.891350105242057e-05, - "loss": 3.683, + "epoch": 0.07, + "grad_norm": 10.600531578063965, + "learning_rate": 1.9546500439275406e-05, + "loss": 4.215, "step": 543 }, { - "epoch": 0.16, - "grad_norm": 14.13657283782959, - "learning_rate": 1.891149644181618e-05, - "loss": 2.5315, + "epoch": 0.07, + "grad_norm": 18.362884521484375, + "learning_rate": 1.954566372421872e-05, + "loss": 3.8651, "step": 544 }, { - "epoch": 0.16, - "grad_norm": 13.064208984375, - "learning_rate": 1.890949183121179e-05, - "loss": 3.7249, + "epoch": 0.07, + "grad_norm": 10.03396987915039, + "learning_rate": 1.954482700916203e-05, + "loss": 3.7257, "step": 545 }, { - "epoch": 0.16, - "grad_norm": 12.626958847045898, - "learning_rate": 1.89074872206074e-05, - "loss": 3.4621, + "epoch": 0.07, + "grad_norm": 17.79392433166504, + "learning_rate": 1.9543990294105344e-05, + "loss": 2.2889, "step": 546 }, { - "epoch": 0.16, - "grad_norm": 15.529770851135254, - "learning_rate": 1.8905482610003006e-05, - "loss": 3.6776, + "epoch": 0.07, + "grad_norm": 15.041823387145996, + "learning_rate": 1.9543153579048658e-05, + "loss": 3.0014, "step": 547 }, { - "epoch": 0.16, - "grad_norm": 16.237417221069336, - "learning_rate": 1.890347799939862e-05, - "loss": 5.0018, + "epoch": 0.07, + "grad_norm": 19.62635612487793, + "learning_rate": 1.9542316863991968e-05, + "loss": 5.0205, "step": 548 }, { - "epoch": 0.17, - "grad_norm": 24.94883155822754, - "learning_rate": 1.890147338879423e-05, - "loss": 3.5904, + "epoch": 0.07, + "grad_norm": 23.438556671142578, + "learning_rate": 1.954148014893528e-05, + "loss": 3.028, "step": 549 }, { - "epoch": 0.17, - "grad_norm": 18.61686134338379, - "learning_rate": 1.8899468778189836e-05, - "loss": 3.47, + "epoch": 0.07, + "grad_norm": 19.551422119140625, + "learning_rate": 1.9540643433878595e-05, + "loss": 2.8019, "step": 550 }, { - "epoch": 0.17, - "grad_norm": 23.455402374267578, - "learning_rate": 1.889746416758545e-05, - "loss": 3.8038, + "epoch": 0.07, + "grad_norm": 17.06743049621582, + "learning_rate": 1.953980671882191e-05, + "loss": 3.2838, "step": 551 }, { - "epoch": 0.17, - "grad_norm": 14.99221420288086, - "learning_rate": 1.8895459556981056e-05, - "loss": 3.4294, + "epoch": 0.07, + "grad_norm": 16.77594566345215, + "learning_rate": 1.953897000376522e-05, + "loss": 3.2405, "step": 552 }, { - "epoch": 0.17, - "grad_norm": 15.767837524414062, - "learning_rate": 1.8893454946376667e-05, - "loss": 3.2229, + "epoch": 0.07, + "grad_norm": 15.835091590881348, + "learning_rate": 1.9538133288708533e-05, + "loss": 4.1767, "step": 553 }, { - "epoch": 0.17, - "grad_norm": 12.863175392150879, - "learning_rate": 1.8891450335772277e-05, - "loss": 3.6184, + "epoch": 0.07, + "grad_norm": 35.76700973510742, + "learning_rate": 1.9537296573651846e-05, + "loss": 1.8702, "step": 554 }, { - "epoch": 0.17, - "grad_norm": 14.412956237792969, - "learning_rate": 1.8889445725167887e-05, - "loss": 4.1913, + "epoch": 0.07, + "grad_norm": 21.611915588378906, + "learning_rate": 1.9536459858595157e-05, + "loss": 4.1775, "step": 555 }, { - "epoch": 0.17, - "grad_norm": 24.851470947265625, - "learning_rate": 1.8887441114563497e-05, - "loss": 3.9418, + "epoch": 0.07, + "grad_norm": 14.899382591247559, + "learning_rate": 1.953562314353847e-05, + "loss": 3.198, "step": 556 }, { - "epoch": 0.17, - "grad_norm": 47.26935958862305, - "learning_rate": 1.8885436503959107e-05, - "loss": 5.1826, + "epoch": 0.07, + "grad_norm": 15.35878849029541, + "learning_rate": 1.9534786428481784e-05, + "loss": 2.8263, "step": 557 }, { - "epoch": 0.17, - "grad_norm": 15.844667434692383, - "learning_rate": 1.8883431893354717e-05, - "loss": 3.7824, + "epoch": 0.07, + "grad_norm": 14.225360870361328, + "learning_rate": 1.9533949713425094e-05, + "loss": 3.9339, "step": 558 }, { - "epoch": 0.17, - "grad_norm": 11.842270851135254, - "learning_rate": 1.8881427282750327e-05, - "loss": 2.9349, + "epoch": 0.07, + "grad_norm": 28.126028060913086, + "learning_rate": 1.9533112998368408e-05, + "loss": 1.6965, "step": 559 }, { - "epoch": 0.17, - "grad_norm": 21.0179386138916, - "learning_rate": 1.8879422672145937e-05, - "loss": 3.2554, + "epoch": 0.07, + "grad_norm": 33.49496078491211, + "learning_rate": 1.9532276283311718e-05, + "loss": 3.8451, "step": 560 }, { - "epoch": 0.17, - "grad_norm": 19.371559143066406, - "learning_rate": 1.8877418061541547e-05, - "loss": 3.6965, + "epoch": 0.07, + "grad_norm": 22.24785041809082, + "learning_rate": 1.9531439568255032e-05, + "loss": 3.5929, "step": 561 }, { - "epoch": 0.17, - "grad_norm": 17.8704833984375, - "learning_rate": 1.8875413450937157e-05, - "loss": 3.6738, + "epoch": 0.07, + "grad_norm": 36.69216537475586, + "learning_rate": 1.9530602853198345e-05, + "loss": 3.6625, "step": 562 }, { - "epoch": 0.17, - "grad_norm": 11.61600112915039, - "learning_rate": 1.8873408840332767e-05, - "loss": 2.5997, + "epoch": 0.07, + "grad_norm": 13.007461547851562, + "learning_rate": 1.9529766138141656e-05, + "loss": 3.823, "step": 563 }, { - "epoch": 0.17, - "grad_norm": 17.82415199279785, - "learning_rate": 1.8871404229728377e-05, - "loss": 3.402, + "epoch": 0.07, + "grad_norm": 19.145416259765625, + "learning_rate": 1.952892942308497e-05, + "loss": 3.2177, "step": 564 }, { - "epoch": 0.17, - "grad_norm": 13.841615676879883, - "learning_rate": 1.8869399619123988e-05, - "loss": 2.5574, + "epoch": 0.07, + "grad_norm": 28.419158935546875, + "learning_rate": 1.9528092708028283e-05, + "loss": 1.5909, "step": 565 }, { - "epoch": 0.17, - "grad_norm": 17.705219268798828, - "learning_rate": 1.8867395008519594e-05, - "loss": 3.4216, + "epoch": 0.07, + "grad_norm": 14.794642448425293, + "learning_rate": 1.9527255992971593e-05, + "loss": 1.8523, "step": 566 }, { - "epoch": 0.17, - "grad_norm": 15.329522132873535, - "learning_rate": 1.8865390397915208e-05, - "loss": 2.5929, + "epoch": 0.07, + "grad_norm": 19.6970157623291, + "learning_rate": 1.9526419277914907e-05, + "loss": 5.4043, "step": 567 }, { - "epoch": 0.17, - "grad_norm": 14.040759086608887, - "learning_rate": 1.8863385787310818e-05, - "loss": 3.7188, + "epoch": 0.07, + "grad_norm": 20.794288635253906, + "learning_rate": 1.952558256285822e-05, + "loss": 2.3032, "step": 568 }, { - "epoch": 0.17, - "grad_norm": 20.66543960571289, - "learning_rate": 1.8861381176706424e-05, - "loss": 3.3664, + "epoch": 0.07, + "grad_norm": 18.208457946777344, + "learning_rate": 1.952474584780153e-05, + "loss": 3.9328, "step": 569 }, { - "epoch": 0.17, - "grad_norm": 10.702879905700684, - "learning_rate": 1.8859376566102038e-05, - "loss": 3.0084, + "epoch": 0.07, + "grad_norm": 18.695064544677734, + "learning_rate": 1.9523909132744844e-05, + "loss": 2.7843, "step": 570 }, { - "epoch": 0.17, - "grad_norm": 15.897429466247559, - "learning_rate": 1.8857371955497645e-05, - "loss": 3.7669, + "epoch": 0.07, + "grad_norm": 10.989561080932617, + "learning_rate": 1.9523072417688158e-05, + "loss": 2.9185, "step": 571 }, { - "epoch": 0.17, - "grad_norm": 11.95020866394043, - "learning_rate": 1.8855367344893255e-05, - "loss": 3.9482, + "epoch": 0.07, + "grad_norm": 13.847416877746582, + "learning_rate": 1.9522235702631472e-05, + "loss": 3.4503, "step": 572 }, { - "epoch": 0.17, - "grad_norm": 27.125240325927734, - "learning_rate": 1.8853362734288868e-05, - "loss": 2.7438, + "epoch": 0.07, + "grad_norm": 140.5792999267578, + "learning_rate": 1.9521398987574782e-05, + "loss": 3.8577, "step": 573 }, { - "epoch": 0.17, - "grad_norm": 13.781254768371582, - "learning_rate": 1.8851358123684475e-05, - "loss": 3.3872, + "epoch": 0.07, + "grad_norm": 11.583138465881348, + "learning_rate": 1.9520562272518096e-05, + "loss": 3.792, "step": 574 }, { - "epoch": 0.17, - "grad_norm": 29.12127113342285, - "learning_rate": 1.8849353513080085e-05, - "loss": 4.5567, + "epoch": 0.07, + "grad_norm": 27.13224983215332, + "learning_rate": 1.951972555746141e-05, + "loss": 6.0304, "step": 575 }, { - "epoch": 0.17, - "grad_norm": 15.671697616577148, - "learning_rate": 1.8847348902475695e-05, - "loss": 2.5302, + "epoch": 0.07, + "grad_norm": 21.271699905395508, + "learning_rate": 1.951888884240472e-05, + "loss": 4.7579, "step": 576 }, { - "epoch": 0.17, - "grad_norm": 17.996654510498047, - "learning_rate": 1.8845344291871305e-05, - "loss": 3.1327, + "epoch": 0.07, + "grad_norm": 13.934300422668457, + "learning_rate": 1.9518052127348033e-05, + "loss": 2.274, "step": 577 }, { - "epoch": 0.17, - "grad_norm": 16.846689224243164, - "learning_rate": 1.8843339681266915e-05, - "loss": 2.5453, + "epoch": 0.07, + "grad_norm": 14.28530216217041, + "learning_rate": 1.9517215412291347e-05, + "loss": 3.3553, "step": 578 }, { - "epoch": 0.17, - "grad_norm": 21.583017349243164, - "learning_rate": 1.8841335070662525e-05, - "loss": 3.5481, + "epoch": 0.07, + "grad_norm": 21.369596481323242, + "learning_rate": 1.951637869723466e-05, + "loss": 3.0815, "step": 579 }, { - "epoch": 0.17, - "grad_norm": 14.08506965637207, - "learning_rate": 1.8839330460058135e-05, - "loss": 3.8354, + "epoch": 0.07, + "grad_norm": 70.39269256591797, + "learning_rate": 1.951554198217797e-05, + "loss": 1.4678, "step": 580 }, { - "epoch": 0.17, - "grad_norm": 12.634489059448242, - "learning_rate": 1.8837325849453745e-05, - "loss": 3.1067, + "epoch": 0.07, + "grad_norm": 10.346799850463867, + "learning_rate": 1.9514705267121284e-05, + "loss": 3.7247, "step": 581 }, { - "epoch": 0.17, - "grad_norm": 11.801117897033691, - "learning_rate": 1.8835321238849356e-05, - "loss": 3.4585, + "epoch": 0.07, + "grad_norm": 16.728269577026367, + "learning_rate": 1.9513868552064598e-05, + "loss": 4.006, "step": 582 }, { - "epoch": 0.18, - "grad_norm": 15.204583168029785, - "learning_rate": 1.8833316628244966e-05, - "loss": 3.3097, + "epoch": 0.07, + "grad_norm": 11.536478996276855, + "learning_rate": 1.9513031837007908e-05, + "loss": 3.0046, "step": 583 }, { - "epoch": 0.18, - "grad_norm": 15.794861793518066, - "learning_rate": 1.8831312017640576e-05, - "loss": 3.0038, + "epoch": 0.07, + "grad_norm": 13.823684692382812, + "learning_rate": 1.9512195121951222e-05, + "loss": 3.4075, "step": 584 }, { - "epoch": 0.18, - "grad_norm": 13.39342212677002, - "learning_rate": 1.8829307407036186e-05, - "loss": 3.4626, + "epoch": 0.07, + "grad_norm": 11.703648567199707, + "learning_rate": 1.9511358406894532e-05, + "loss": 3.6496, "step": 585 }, { - "epoch": 0.18, - "grad_norm": 14.78641414642334, - "learning_rate": 1.8827302796431796e-05, - "loss": 3.4037, + "epoch": 0.07, + "grad_norm": 13.754158020019531, + "learning_rate": 1.9510521691837846e-05, + "loss": 2.1938, "step": 586 }, { - "epoch": 0.18, - "grad_norm": 16.814594268798828, - "learning_rate": 1.8825298185827406e-05, - "loss": 3.7882, + "epoch": 0.07, + "grad_norm": 14.499614715576172, + "learning_rate": 1.950968497678116e-05, + "loss": 2.7492, "step": 587 }, { - "epoch": 0.18, - "grad_norm": 27.2209415435791, - "learning_rate": 1.8823293575223013e-05, - "loss": 3.2455, + "epoch": 0.07, + "grad_norm": 15.740714073181152, + "learning_rate": 1.950884826172447e-05, + "loss": 2.9027, "step": 588 }, { - "epoch": 0.18, - "grad_norm": 16.686363220214844, - "learning_rate": 1.8821288964618626e-05, - "loss": 3.932, + "epoch": 0.07, + "grad_norm": 15.932597160339355, + "learning_rate": 1.9508011546667783e-05, + "loss": 3.1953, "step": 589 }, { - "epoch": 0.18, - "grad_norm": 17.841867446899414, - "learning_rate": 1.8819284354014233e-05, - "loss": 2.3729, + "epoch": 0.07, + "grad_norm": 21.080705642700195, + "learning_rate": 1.9507174831611097e-05, + "loss": 2.7916, "step": 590 }, { - "epoch": 0.18, - "grad_norm": 13.670007705688477, - "learning_rate": 1.8817279743409843e-05, - "loss": 3.9139, + "epoch": 0.07, + "grad_norm": 19.372400283813477, + "learning_rate": 1.9506338116554407e-05, + "loss": 4.3448, "step": 591 }, { - "epoch": 0.18, - "grad_norm": 12.825089454650879, - "learning_rate": 1.8815275132805456e-05, - "loss": 3.4298, + "epoch": 0.07, + "grad_norm": 18.336488723754883, + "learning_rate": 1.950550140149772e-05, + "loss": 4.4037, "step": 592 }, { - "epoch": 0.18, - "grad_norm": 21.140291213989258, - "learning_rate": 1.8813270522201063e-05, - "loss": 3.6672, + "epoch": 0.07, + "grad_norm": 12.87525749206543, + "learning_rate": 1.9504664686441035e-05, + "loss": 1.776, "step": 593 }, { - "epoch": 0.18, - "grad_norm": 19.407217025756836, - "learning_rate": 1.8811265911596673e-05, - "loss": 3.853, + "epoch": 0.07, + "grad_norm": 31.6224308013916, + "learning_rate": 1.9503827971384345e-05, + "loss": 2.0704, "step": 594 }, { - "epoch": 0.18, - "grad_norm": 17.949951171875, - "learning_rate": 1.8809261300992283e-05, - "loss": 4.0853, + "epoch": 0.07, + "grad_norm": 21.20492172241211, + "learning_rate": 1.950299125632766e-05, + "loss": 2.3727, "step": 595 }, { - "epoch": 0.18, - "grad_norm": 14.42286205291748, - "learning_rate": 1.8807256690387893e-05, - "loss": 3.8624, + "epoch": 0.07, + "grad_norm": 24.187694549560547, + "learning_rate": 1.9502154541270972e-05, + "loss": 2.6064, "step": 596 }, { - "epoch": 0.18, - "grad_norm": 13.524324417114258, - "learning_rate": 1.8805252079783503e-05, - "loss": 3.4537, + "epoch": 0.07, + "grad_norm": 17.776546478271484, + "learning_rate": 1.9501317826214282e-05, + "loss": 3.9096, "step": 597 }, { - "epoch": 0.18, - "grad_norm": 16.790464401245117, - "learning_rate": 1.8803247469179114e-05, - "loss": 3.3715, + "epoch": 0.08, + "grad_norm": 9.369264602661133, + "learning_rate": 1.9500481111157596e-05, + "loss": 2.1012, "step": 598 }, { - "epoch": 0.18, - "grad_norm": 28.930021286010742, - "learning_rate": 1.8801242858574724e-05, - "loss": 4.5601, + "epoch": 0.08, + "grad_norm": 15.953502655029297, + "learning_rate": 1.949964439610091e-05, + "loss": 6.1173, "step": 599 }, { - "epoch": 0.18, - "grad_norm": 17.764270782470703, - "learning_rate": 1.8799238247970334e-05, - "loss": 3.0025, - "step": 600 - }, - { - "epoch": 0.18, - "eval_loss": 0.8108684420585632, - "eval_runtime": 43.7575, - "eval_samples_per_second": 33.8, - "eval_steps_per_second": 33.8, + "epoch": 0.08, + "grad_norm": 17.869020462036133, + "learning_rate": 1.9498807681044223e-05, + "loss": 2.898, "step": 600 }, { - "epoch": 0.18, - "grad_norm": 12.37165641784668, - "learning_rate": 1.8797233637365944e-05, - "loss": 3.7558, + "epoch": 0.08, + "grad_norm": 13.049903869628906, + "learning_rate": 1.9497970965987534e-05, + "loss": 2.7439, "step": 601 }, { - "epoch": 0.18, - "grad_norm": 31.21900177001953, - "learning_rate": 1.8795229026761554e-05, - "loss": 3.6418, + "epoch": 0.08, + "grad_norm": 10.503606796264648, + "learning_rate": 1.9497134250930847e-05, + "loss": 1.334, "step": 602 }, { - "epoch": 0.18, - "grad_norm": 12.556445121765137, - "learning_rate": 1.8793224416157164e-05, - "loss": 2.7222, + "epoch": 0.08, + "grad_norm": 20.092920303344727, + "learning_rate": 1.949629753587416e-05, + "loss": 3.6018, "step": 603 }, { - "epoch": 0.18, - "grad_norm": 12.976155281066895, - "learning_rate": 1.8791219805552774e-05, - "loss": 3.1556, + "epoch": 0.08, + "grad_norm": 106.8263168334961, + "learning_rate": 1.949546082081747e-05, + "loss": 3.2016, "step": 604 }, { - "epoch": 0.18, - "grad_norm": 14.281742095947266, - "learning_rate": 1.8789215194948384e-05, - "loss": 3.5987, + "epoch": 0.08, + "grad_norm": 26.08072280883789, + "learning_rate": 1.9494624105760785e-05, + "loss": 4.7636, "step": 605 }, { - "epoch": 0.18, - "grad_norm": 17.95466423034668, - "learning_rate": 1.8787210584343994e-05, - "loss": 2.8724, + "epoch": 0.08, + "grad_norm": 18.82193374633789, + "learning_rate": 1.94937873907041e-05, + "loss": 2.8383, "step": 606 }, { - "epoch": 0.18, - "grad_norm": 11.985798835754395, - "learning_rate": 1.87852059737396e-05, - "loss": 3.3135, + "epoch": 0.08, + "grad_norm": 20.548948287963867, + "learning_rate": 1.9492950675647412e-05, + "loss": 2.4919, "step": 607 }, { - "epoch": 0.18, - "grad_norm": 11.839190483093262, - "learning_rate": 1.8783201363135214e-05, - "loss": 2.7356, + "epoch": 0.08, + "grad_norm": 14.64707088470459, + "learning_rate": 1.9492113960590722e-05, + "loss": 3.689, "step": 608 }, { - "epoch": 0.18, - "grad_norm": 19.593158721923828, - "learning_rate": 1.878119675253082e-05, - "loss": 2.48, + "epoch": 0.08, + "grad_norm": 16.494598388671875, + "learning_rate": 1.9491277245534036e-05, + "loss": 1.9832, "step": 609 }, { - "epoch": 0.18, - "grad_norm": 11.183398246765137, - "learning_rate": 1.877919214192643e-05, - "loss": 2.3999, + "epoch": 0.08, + "grad_norm": 19.26803207397461, + "learning_rate": 1.949044053047735e-05, + "loss": 3.0708, "step": 610 }, { - "epoch": 0.18, - "grad_norm": 16.24125862121582, - "learning_rate": 1.8777187531322045e-05, - "loss": 3.8464, + "epoch": 0.08, + "grad_norm": 20.39113426208496, + "learning_rate": 1.948960381542066e-05, + "loss": 2.8375, "step": 611 }, { - "epoch": 0.18, - "grad_norm": 17.56856346130371, - "learning_rate": 1.877518292071765e-05, - "loss": 3.5414, + "epoch": 0.08, + "grad_norm": 13.161185264587402, + "learning_rate": 1.9488767100363974e-05, + "loss": 2.4338, "step": 612 }, { - "epoch": 0.18, - "grad_norm": 15.031785011291504, - "learning_rate": 1.877317831011326e-05, - "loss": 3.4706, + "epoch": 0.08, + "grad_norm": 22.47385025024414, + "learning_rate": 1.9487930385307284e-05, + "loss": 5.3107, "step": 613 }, { - "epoch": 0.18, - "grad_norm": 14.207649230957031, - "learning_rate": 1.877117369950887e-05, - "loss": 3.0532, + "epoch": 0.08, + "grad_norm": 9.89582633972168, + "learning_rate": 1.9487093670250598e-05, + "loss": 2.2135, "step": 614 }, { - "epoch": 0.18, - "grad_norm": 15.561878204345703, - "learning_rate": 1.876916908890448e-05, - "loss": 3.5544, + "epoch": 0.08, + "grad_norm": 14.444812774658203, + "learning_rate": 1.948625695519391e-05, + "loss": 2.9484, "step": 615 }, { - "epoch": 0.19, - "grad_norm": 15.425860404968262, - "learning_rate": 1.876716447830009e-05, - "loss": 3.2101, + "epoch": 0.08, + "grad_norm": 17.883405685424805, + "learning_rate": 1.948542024013722e-05, + "loss": 1.481, "step": 616 }, { - "epoch": 0.19, - "grad_norm": 17.44911766052246, - "learning_rate": 1.8765159867695702e-05, - "loss": 4.0486, + "epoch": 0.08, + "grad_norm": 15.512957572937012, + "learning_rate": 1.9484583525080535e-05, + "loss": 3.8052, "step": 617 }, { - "epoch": 0.19, - "grad_norm": 16.89653205871582, - "learning_rate": 1.8763155257091312e-05, - "loss": 3.1268, + "epoch": 0.08, + "grad_norm": 9.79510498046875, + "learning_rate": 1.9483746810023845e-05, + "loss": 1.2129, "step": 618 }, { - "epoch": 0.19, - "grad_norm": 17.181591033935547, - "learning_rate": 1.8761150646486922e-05, - "loss": 3.0687, + "epoch": 0.08, + "grad_norm": 12.645746231079102, + "learning_rate": 1.948291009496716e-05, + "loss": 1.5934, "step": 619 }, { - "epoch": 0.19, - "grad_norm": 14.057995796203613, - "learning_rate": 1.8759146035882532e-05, - "loss": 2.7292, + "epoch": 0.08, + "grad_norm": 29.721193313598633, + "learning_rate": 1.9482073379910473e-05, + "loss": 5.759, "step": 620 }, { - "epoch": 0.19, - "grad_norm": 62.06224822998047, - "learning_rate": 1.875714142527814e-05, - "loss": 2.8716, + "epoch": 0.08, + "grad_norm": 21.66473388671875, + "learning_rate": 1.9481236664853786e-05, + "loss": 3.0079, "step": 621 }, { - "epoch": 0.19, - "grad_norm": 15.26970100402832, - "learning_rate": 1.8755136814673752e-05, - "loss": 3.4212, + "epoch": 0.08, + "grad_norm": 10.992793083190918, + "learning_rate": 1.9480399949797097e-05, + "loss": 3.4051, "step": 622 }, { - "epoch": 0.19, - "grad_norm": 12.904154777526855, - "learning_rate": 1.8753132204069362e-05, - "loss": 2.5465, + "epoch": 0.08, + "grad_norm": 14.940546989440918, + "learning_rate": 1.947956323474041e-05, + "loss": 4.0389, "step": 623 }, { - "epoch": 0.19, - "grad_norm": 16.669986724853516, - "learning_rate": 1.875112759346497e-05, - "loss": 2.8482, + "epoch": 0.08, + "grad_norm": 14.911699295043945, + "learning_rate": 1.9478726519683724e-05, + "loss": 3.7682, "step": 624 }, { - "epoch": 0.19, - "grad_norm": 10.211058616638184, - "learning_rate": 1.8749122982860582e-05, - "loss": 2.5476, + "epoch": 0.08, + "grad_norm": 13.005440711975098, + "learning_rate": 1.9477889804627034e-05, + "loss": 1.5709, "step": 625 }, { - "epoch": 0.19, - "grad_norm": 21.396862030029297, - "learning_rate": 1.874711837225619e-05, - "loss": 3.7639, + "epoch": 0.08, + "grad_norm": 15.729487419128418, + "learning_rate": 1.9477053089570348e-05, + "loss": 1.1742, "step": 626 }, { - "epoch": 0.19, - "grad_norm": 20.889707565307617, - "learning_rate": 1.87451137616518e-05, - "loss": 3.3019, + "epoch": 0.08, + "grad_norm": 20.291719436645508, + "learning_rate": 1.947621637451366e-05, + "loss": 4.8735, "step": 627 }, { - "epoch": 0.19, - "grad_norm": 25.096691131591797, - "learning_rate": 1.874310915104741e-05, - "loss": 4.246, + "epoch": 0.08, + "grad_norm": 13.93189525604248, + "learning_rate": 1.9475379659456975e-05, + "loss": 2.2031, "step": 628 }, { - "epoch": 0.19, - "grad_norm": 13.878023147583008, - "learning_rate": 1.874110454044302e-05, - "loss": 2.9438, + "epoch": 0.08, + "grad_norm": 11.237191200256348, + "learning_rate": 1.9474542944400285e-05, + "loss": 4.0729, "step": 629 }, { - "epoch": 0.19, - "grad_norm": 18.34180450439453, - "learning_rate": 1.873909992983863e-05, - "loss": 4.3573, + "epoch": 0.08, + "grad_norm": 14.943650245666504, + "learning_rate": 1.94737062293436e-05, + "loss": 1.8902, "step": 630 }, { - "epoch": 0.19, - "grad_norm": 18.738731384277344, - "learning_rate": 1.873709531923424e-05, - "loss": 2.7748, + "epoch": 0.08, + "grad_norm": 14.849425315856934, + "learning_rate": 1.9472869514286913e-05, + "loss": 3.2545, "step": 631 }, { - "epoch": 0.19, - "grad_norm": 15.163655281066895, - "learning_rate": 1.873509070862985e-05, - "loss": 3.4938, + "epoch": 0.08, + "grad_norm": 11.831103324890137, + "learning_rate": 1.9472032799230223e-05, + "loss": 2.6383, "step": 632 }, { - "epoch": 0.19, - "grad_norm": 16.00517463684082, - "learning_rate": 1.873308609802546e-05, - "loss": 4.1243, + "epoch": 0.08, + "grad_norm": 12.109277725219727, + "learning_rate": 1.9471196084173537e-05, + "loss": 4.2325, "step": 633 }, { - "epoch": 0.19, - "grad_norm": 15.933029174804688, - "learning_rate": 1.873108148742107e-05, - "loss": 3.8793, + "epoch": 0.08, + "grad_norm": 14.199204444885254, + "learning_rate": 1.947035936911685e-05, + "loss": 3.5085, "step": 634 }, { - "epoch": 0.19, - "grad_norm": 17.073196411132812, - "learning_rate": 1.872907687681668e-05, - "loss": 3.3813, + "epoch": 0.08, + "grad_norm": 17.686521530151367, + "learning_rate": 1.9469522654060164e-05, + "loss": 4.728, "step": 635 }, { - "epoch": 0.19, - "grad_norm": 16.49427604675293, - "learning_rate": 1.872707226621229e-05, - "loss": 3.9439, + "epoch": 0.08, + "grad_norm": 11.224983215332031, + "learning_rate": 1.9468685939003474e-05, + "loss": 1.8126, "step": 636 }, { - "epoch": 0.19, - "grad_norm": 14.988619804382324, - "learning_rate": 1.87250676556079e-05, - "loss": 3.4706, + "epoch": 0.08, + "grad_norm": 11.27236270904541, + "learning_rate": 1.9467849223946788e-05, + "loss": 2.9768, "step": 637 }, { - "epoch": 0.19, - "grad_norm": 14.796496391296387, - "learning_rate": 1.872306304500351e-05, - "loss": 2.572, + "epoch": 0.08, + "grad_norm": 17.28792953491211, + "learning_rate": 1.9467012508890098e-05, + "loss": 3.3681, "step": 638 }, { - "epoch": 0.19, - "grad_norm": 14.737870216369629, - "learning_rate": 1.872105843439912e-05, - "loss": 4.6644, + "epoch": 0.08, + "grad_norm": 134.4596405029297, + "learning_rate": 1.946617579383341e-05, + "loss": 2.3149, "step": 639 }, { - "epoch": 0.19, - "grad_norm": 14.593988418579102, - "learning_rate": 1.8719053823794727e-05, - "loss": 2.9476, + "epoch": 0.08, + "grad_norm": 15.561910629272461, + "learning_rate": 1.9465339078776725e-05, + "loss": 3.1515, "step": 640 }, { - "epoch": 0.19, - "grad_norm": 24.125364303588867, - "learning_rate": 1.871704921319034e-05, - "loss": 3.0569, + "epoch": 0.08, + "grad_norm": 23.368066787719727, + "learning_rate": 1.9464502363720036e-05, + "loss": 4.4627, "step": 641 }, { - "epoch": 0.19, - "grad_norm": 12.783120155334473, - "learning_rate": 1.871504460258595e-05, - "loss": 2.7435, + "epoch": 0.08, + "grad_norm": 18.397462844848633, + "learning_rate": 1.946366564866335e-05, + "loss": 3.3775, "step": 642 }, { - "epoch": 0.19, - "grad_norm": 20.349702835083008, - "learning_rate": 1.8713039991981557e-05, - "loss": 3.924, + "epoch": 0.08, + "grad_norm": 13.091896057128906, + "learning_rate": 1.946282893360666e-05, + "loss": 5.6346, "step": 643 }, { - "epoch": 0.19, - "grad_norm": 18.848377227783203, - "learning_rate": 1.871103538137717e-05, - "loss": 3.6072, + "epoch": 0.08, + "grad_norm": 12.951624870300293, + "learning_rate": 1.9461992218549973e-05, + "loss": 5.0077, "step": 644 }, { - "epoch": 0.19, - "grad_norm": 16.555147171020508, - "learning_rate": 1.8709030770772777e-05, - "loss": 2.6862, + "epoch": 0.08, + "grad_norm": 9.754615783691406, + "learning_rate": 1.9461155503493287e-05, + "loss": 3.2439, "step": 645 }, { - "epoch": 0.19, - "grad_norm": 93.45834350585938, - "learning_rate": 1.8707026160168387e-05, - "loss": 3.1187, + "epoch": 0.08, + "grad_norm": 11.347737312316895, + "learning_rate": 1.9460318788436597e-05, + "loss": 2.7666, "step": 646 }, { - "epoch": 0.19, - "grad_norm": 12.666962623596191, - "learning_rate": 1.8705021549564e-05, - "loss": 3.3528, + "epoch": 0.08, + "grad_norm": 45.7727165222168, + "learning_rate": 1.945948207337991e-05, + "loss": 4.1038, "step": 647 }, { - "epoch": 0.19, - "grad_norm": 13.548149108886719, - "learning_rate": 1.8703016938959608e-05, - "loss": 2.8839, + "epoch": 0.08, + "grad_norm": 12.478466033935547, + "learning_rate": 1.9458645358323224e-05, + "loss": 4.6921, "step": 648 }, { - "epoch": 0.2, - "grad_norm": 20.26407241821289, - "learning_rate": 1.8701012328355218e-05, - "loss": 3.2999, + "epoch": 0.08, + "grad_norm": 21.725603103637695, + "learning_rate": 1.9457808643266538e-05, + "loss": 3.9947, "step": 649 }, { - "epoch": 0.2, - "grad_norm": 16.36279296875, - "learning_rate": 1.8699007717750828e-05, - "loss": 2.7201, + "epoch": 0.08, + "grad_norm": 19.537290573120117, + "learning_rate": 1.9456971928209848e-05, + "loss": 5.4066, "step": 650 }, { - "epoch": 0.2, - "grad_norm": 15.859626770019531, - "learning_rate": 1.8697003107146438e-05, - "loss": 3.3978, + "epoch": 0.08, + "grad_norm": 16.776832580566406, + "learning_rate": 1.9456135213153162e-05, + "loss": 4.4481, "step": 651 }, { - "epoch": 0.2, - "grad_norm": 20.308744430541992, - "learning_rate": 1.8694998496542048e-05, - "loss": 3.7883, + "epoch": 0.08, + "grad_norm": 12.141162872314453, + "learning_rate": 1.9455298498096476e-05, + "loss": 2.9702, "step": 652 }, { - "epoch": 0.2, - "grad_norm": 14.849808692932129, - "learning_rate": 1.8692993885937658e-05, - "loss": 3.0277, + "epoch": 0.08, + "grad_norm": 14.00488567352295, + "learning_rate": 1.9454461783039786e-05, + "loss": 3.5068, "step": 653 }, { - "epoch": 0.2, - "grad_norm": 10.114745140075684, - "learning_rate": 1.8690989275333268e-05, - "loss": 2.4441, + "epoch": 0.08, + "grad_norm": 21.130966186523438, + "learning_rate": 1.94536250679831e-05, + "loss": 4.976, "step": 654 }, { - "epoch": 0.2, - "grad_norm": 21.375455856323242, - "learning_rate": 1.8688984664728878e-05, - "loss": 2.9501, + "epoch": 0.08, + "grad_norm": 79.34236907958984, + "learning_rate": 1.9452788352926413e-05, + "loss": 2.7563, "step": 655 }, { - "epoch": 0.2, - "grad_norm": 21.958810806274414, - "learning_rate": 1.8686980054124488e-05, - "loss": 2.8862, + "epoch": 0.08, + "grad_norm": 22.180267333984375, + "learning_rate": 1.9451951637869727e-05, + "loss": 3.4531, "step": 656 }, { - "epoch": 0.2, - "grad_norm": 14.946088790893555, - "learning_rate": 1.86849754435201e-05, - "loss": 3.4693, + "epoch": 0.08, + "grad_norm": 26.059926986694336, + "learning_rate": 1.9451114922813037e-05, + "loss": 3.8926, "step": 657 }, { - "epoch": 0.2, - "grad_norm": 13.369003295898438, - "learning_rate": 1.868297083291571e-05, - "loss": 4.4842, + "epoch": 0.08, + "grad_norm": 16.83011245727539, + "learning_rate": 1.945027820775635e-05, + "loss": 3.0436, "step": 658 }, { - "epoch": 0.2, - "grad_norm": 13.760272979736328, - "learning_rate": 1.868096622231132e-05, - "loss": 4.0259, + "epoch": 0.08, + "grad_norm": 13.718125343322754, + "learning_rate": 1.9449441492699664e-05, + "loss": 3.8004, "step": 659 }, { - "epoch": 0.2, - "grad_norm": 18.290048599243164, - "learning_rate": 1.867896161170693e-05, - "loss": 2.4139, + "epoch": 0.08, + "grad_norm": 12.119871139526367, + "learning_rate": 1.9448604777642975e-05, + "loss": 3.8753, "step": 660 }, { - "epoch": 0.2, - "grad_norm": 20.004146575927734, - "learning_rate": 1.867695700110254e-05, - "loss": 3.8439, + "epoch": 0.08, + "grad_norm": 22.682151794433594, + "learning_rate": 1.9447768062586288e-05, + "loss": 4.6795, "step": 661 }, { - "epoch": 0.2, - "grad_norm": 15.349987983703613, - "learning_rate": 1.8674952390498145e-05, - "loss": 2.9132, + "epoch": 0.08, + "grad_norm": 17.96092987060547, + "learning_rate": 1.9446931347529602e-05, + "loss": 1.6681, "step": 662 }, { - "epoch": 0.2, - "grad_norm": 14.10637092590332, - "learning_rate": 1.867294777989376e-05, - "loss": 2.9145, + "epoch": 0.08, + "grad_norm": 13.491999626159668, + "learning_rate": 1.9446094632472915e-05, + "loss": 3.6529, "step": 663 }, { - "epoch": 0.2, - "grad_norm": 15.24035358428955, - "learning_rate": 1.8670943169289366e-05, - "loss": 2.748, + "epoch": 0.08, + "grad_norm": 12.081392288208008, + "learning_rate": 1.9445257917416226e-05, + "loss": 3.4167, "step": 664 }, { - "epoch": 0.2, - "grad_norm": 23.505083084106445, - "learning_rate": 1.8668938558684976e-05, - "loss": 2.8452, + "epoch": 0.08, + "grad_norm": 16.117155075073242, + "learning_rate": 1.944442120235954e-05, + "loss": 4.0835, "step": 665 }, { - "epoch": 0.2, - "grad_norm": 82.25447845458984, - "learning_rate": 1.866693394808059e-05, - "loss": 2.8634, + "epoch": 0.08, + "grad_norm": 12.962418556213379, + "learning_rate": 1.944358448730285e-05, + "loss": 3.6016, "step": 666 }, { - "epoch": 0.2, - "grad_norm": 24.59209632873535, - "learning_rate": 1.8664929337476196e-05, - "loss": 2.9632, + "epoch": 0.08, + "grad_norm": 11.88910961151123, + "learning_rate": 1.9442747772246163e-05, + "loss": 1.6991, "step": 667 }, { - "epoch": 0.2, - "grad_norm": 18.12932014465332, - "learning_rate": 1.8662924726871806e-05, - "loss": 3.9964, + "epoch": 0.08, + "grad_norm": 13.218146324157715, + "learning_rate": 1.9441911057189477e-05, + "loss": 3.126, "step": 668 }, { - "epoch": 0.2, - "grad_norm": 13.9307222366333, - "learning_rate": 1.8660920116267416e-05, - "loss": 3.37, + "epoch": 0.08, + "grad_norm": 11.371163368225098, + "learning_rate": 1.9441074342132787e-05, + "loss": 2.4902, "step": 669 }, { - "epoch": 0.2, - "grad_norm": 15.804916381835938, - "learning_rate": 1.8658915505663026e-05, - "loss": 3.7911, + "epoch": 0.08, + "grad_norm": 23.65850257873535, + "learning_rate": 1.94402376270761e-05, + "loss": 2.7793, "step": 670 }, { - "epoch": 0.2, - "grad_norm": 12.954413414001465, - "learning_rate": 1.8656910895058636e-05, - "loss": 3.4076, + "epoch": 0.08, + "grad_norm": 10.849726676940918, + "learning_rate": 1.943940091201941e-05, + "loss": 1.7024, "step": 671 }, { - "epoch": 0.2, - "grad_norm": 19.96038055419922, - "learning_rate": 1.8654906284454246e-05, - "loss": 2.6706, + "epoch": 0.08, + "grad_norm": 19.88506317138672, + "learning_rate": 1.9438564196962725e-05, + "loss": 2.9413, "step": 672 }, { - "epoch": 0.2, - "grad_norm": 14.459244728088379, - "learning_rate": 1.8652901673849856e-05, - "loss": 2.6447, + "epoch": 0.08, + "grad_norm": 43.79939270019531, + "learning_rate": 1.943772748190604e-05, + "loss": 2.4117, "step": 673 }, { - "epoch": 0.2, - "grad_norm": 14.168243408203125, - "learning_rate": 1.8650897063245466e-05, - "loss": 2.8916, + "epoch": 0.08, + "grad_norm": 17.82375144958496, + "learning_rate": 1.943689076684935e-05, + "loss": 2.5445, "step": 674 }, { - "epoch": 0.2, - "grad_norm": 12.294631004333496, - "learning_rate": 1.8648892452641076e-05, - "loss": 3.911, + "epoch": 0.08, + "grad_norm": 14.555415153503418, + "learning_rate": 1.9436054051792662e-05, + "loss": 3.7538, "step": 675 }, { - "epoch": 0.2, - "grad_norm": 16.20514488220215, - "learning_rate": 1.8646887842036687e-05, - "loss": 4.1275, + "epoch": 0.08, + "grad_norm": 12.511565208435059, + "learning_rate": 1.9435217336735976e-05, + "loss": 2.6467, "step": 676 }, { - "epoch": 0.2, - "grad_norm": 13.80794906616211, - "learning_rate": 1.8644883231432297e-05, - "loss": 2.5423, + "epoch": 0.08, + "grad_norm": 8.727785110473633, + "learning_rate": 1.943438062167929e-05, + "loss": 2.6967, "step": 677 }, { - "epoch": 0.2, - "grad_norm": 13.199609756469727, - "learning_rate": 1.8642878620827907e-05, - "loss": 3.119, + "epoch": 0.09, + "grad_norm": 9.602141380310059, + "learning_rate": 1.94335439066226e-05, + "loss": 2.9577, "step": 678 }, { - "epoch": 0.2, - "grad_norm": 17.830242156982422, - "learning_rate": 1.8640874010223517e-05, - "loss": 3.9047, + "epoch": 0.09, + "grad_norm": 33.51832962036133, + "learning_rate": 1.9432707191565914e-05, + "loss": 5.2958, "step": 679 }, { - "epoch": 0.2, - "grad_norm": 17.04703140258789, - "learning_rate": 1.8638869399619127e-05, - "loss": 2.4903, + "epoch": 0.09, + "grad_norm": 14.140719413757324, + "learning_rate": 1.9431870476509227e-05, + "loss": 4.4345, "step": 680 }, { - "epoch": 0.2, - "grad_norm": 16.005630493164062, - "learning_rate": 1.8636864789014734e-05, - "loss": 3.5141, + "epoch": 0.09, + "grad_norm": 18.002798080444336, + "learning_rate": 1.9431033761452537e-05, + "loss": 4.8873, "step": 681 }, { - "epoch": 0.21, - "grad_norm": 12.92525577545166, - "learning_rate": 1.8634860178410347e-05, - "loss": 3.5435, + "epoch": 0.09, + "grad_norm": 18.150636672973633, + "learning_rate": 1.943019704639585e-05, + "loss": 2.9637, "step": 682 }, { - "epoch": 0.21, - "grad_norm": 13.173433303833008, - "learning_rate": 1.8632855567805954e-05, - "loss": 3.4652, + "epoch": 0.09, + "grad_norm": 35.52592086791992, + "learning_rate": 1.9429360331339165e-05, + "loss": 3.5024, "step": 683 }, { - "epoch": 0.21, - "grad_norm": 15.04622745513916, - "learning_rate": 1.8630850957201564e-05, - "loss": 3.2746, + "epoch": 0.09, + "grad_norm": 16.548412322998047, + "learning_rate": 1.942852361628248e-05, + "loss": 3.9262, "step": 684 }, { - "epoch": 0.21, - "grad_norm": 16.044187545776367, - "learning_rate": 1.8628846346597177e-05, - "loss": 3.5208, + "epoch": 0.09, + "grad_norm": 12.454635620117188, + "learning_rate": 1.942768690122579e-05, + "loss": 3.7851, "step": 685 }, { - "epoch": 0.21, - "grad_norm": 12.709644317626953, - "learning_rate": 1.8626841735992784e-05, - "loss": 2.9898, + "epoch": 0.09, + "grad_norm": 35.238014221191406, + "learning_rate": 1.9426850186169102e-05, + "loss": 4.1137, "step": 686 }, { - "epoch": 0.21, - "grad_norm": 62.63556671142578, - "learning_rate": 1.8624837125388394e-05, - "loss": 4.3599, + "epoch": 0.09, + "grad_norm": 14.9273099899292, + "learning_rate": 1.9426013471112416e-05, + "loss": 3.4451, "step": 687 }, { - "epoch": 0.21, - "grad_norm": 14.936622619628906, - "learning_rate": 1.8622832514784004e-05, - "loss": 3.7374, + "epoch": 0.09, + "grad_norm": 12.957579612731934, + "learning_rate": 1.9425176756055726e-05, + "loss": 2.667, "step": 688 }, { - "epoch": 0.21, - "grad_norm": 18.37685203552246, - "learning_rate": 1.8620827904179614e-05, - "loss": 2.8972, + "epoch": 0.09, + "grad_norm": 22.085285186767578, + "learning_rate": 1.942434004099904e-05, + "loss": 4.9414, "step": 689 }, { - "epoch": 0.21, - "grad_norm": 18.769283294677734, - "learning_rate": 1.8618823293575224e-05, - "loss": 2.7669, + "epoch": 0.09, + "grad_norm": 12.428492546081543, + "learning_rate": 1.9423503325942354e-05, + "loss": 2.9308, "step": 690 }, { - "epoch": 0.21, - "grad_norm": 14.026557922363281, - "learning_rate": 1.8616818682970834e-05, - "loss": 3.4005, + "epoch": 0.09, + "grad_norm": 21.980300903320312, + "learning_rate": 1.9422666610885664e-05, + "loss": 3.3054, "step": 691 }, { - "epoch": 0.21, - "grad_norm": 15.341630935668945, - "learning_rate": 1.8614814072366445e-05, - "loss": 3.8892, + "epoch": 0.09, + "grad_norm": 10.78618335723877, + "learning_rate": 1.9421829895828977e-05, + "loss": 0.7016, "step": 692 }, { - "epoch": 0.21, - "grad_norm": 20.44842529296875, - "learning_rate": 1.8612809461762055e-05, - "loss": 4.4409, + "epoch": 0.09, + "grad_norm": 14.255931854248047, + "learning_rate": 1.942099318077229e-05, + "loss": 4.1619, "step": 693 }, { - "epoch": 0.21, - "grad_norm": 23.54950523376465, - "learning_rate": 1.8610804851157665e-05, - "loss": 3.9646, + "epoch": 0.09, + "grad_norm": 11.36681842803955, + "learning_rate": 1.94201564657156e-05, + "loss": 1.7237, "step": 694 }, { - "epoch": 0.21, - "grad_norm": 13.78188419342041, - "learning_rate": 1.860880024055327e-05, - "loss": 4.5521, + "epoch": 0.09, + "grad_norm": 14.176616668701172, + "learning_rate": 1.9419319750658915e-05, + "loss": 3.3943, "step": 695 }, { - "epoch": 0.21, - "grad_norm": 21.157424926757812, - "learning_rate": 1.8606795629948885e-05, - "loss": 3.6927, + "epoch": 0.09, + "grad_norm": 35.78500747680664, + "learning_rate": 1.9418483035602225e-05, + "loss": 3.7633, "step": 696 }, { - "epoch": 0.21, - "grad_norm": 12.406957626342773, - "learning_rate": 1.8604791019344495e-05, - "loss": 3.2458, + "epoch": 0.09, + "grad_norm": 11.32670783996582, + "learning_rate": 1.941764632054554e-05, + "loss": 1.5977, "step": 697 }, { - "epoch": 0.21, - "grad_norm": 18.12921142578125, - "learning_rate": 1.86027864087401e-05, - "loss": 3.7537, + "epoch": 0.09, + "grad_norm": 9.296537399291992, + "learning_rate": 1.9416809605488853e-05, + "loss": 1.8491, "step": 698 }, { - "epoch": 0.21, - "grad_norm": 21.271060943603516, - "learning_rate": 1.8600781798135715e-05, - "loss": 4.3055, + "epoch": 0.09, + "grad_norm": 12.421931266784668, + "learning_rate": 1.9415972890432163e-05, + "loss": 2.4607, "step": 699 }, { - "epoch": 0.21, - "grad_norm": 13.396303176879883, - "learning_rate": 1.8598777187531322e-05, - "loss": 3.5275, + "epoch": 0.09, + "grad_norm": 11.723788261413574, + "learning_rate": 1.9415136175375476e-05, + "loss": 3.9441, "step": 700 }, { - "epoch": 0.21, - "grad_norm": 11.93570613861084, - "learning_rate": 1.8596772576926935e-05, - "loss": 3.0972, + "epoch": 0.09, + "grad_norm": 120.77423095703125, + "learning_rate": 1.941429946031879e-05, + "loss": 2.6284, "step": 701 }, { - "epoch": 0.21, - "grad_norm": 14.72830581665039, - "learning_rate": 1.8594767966322545e-05, - "loss": 4.0018, + "epoch": 0.09, + "grad_norm": 13.209537506103516, + "learning_rate": 1.94134627452621e-05, + "loss": 2.1441, "step": 702 }, { - "epoch": 0.21, - "grad_norm": 23.694110870361328, - "learning_rate": 1.8592763355718152e-05, - "loss": 3.2218, + "epoch": 0.09, + "grad_norm": 10.035263061523438, + "learning_rate": 1.9412626030205414e-05, + "loss": 2.6173, "step": 703 }, { - "epoch": 0.21, - "grad_norm": 17.545129776000977, - "learning_rate": 1.8590758745113766e-05, - "loss": 3.2768, + "epoch": 0.09, + "grad_norm": 15.197259902954102, + "learning_rate": 1.9411789315148728e-05, + "loss": 4.7402, "step": 704 }, { - "epoch": 0.21, - "grad_norm": 16.36639976501465, - "learning_rate": 1.8588754134509372e-05, - "loss": 3.1514, + "epoch": 0.09, + "grad_norm": 15.673565864562988, + "learning_rate": 1.941095260009204e-05, + "loss": 3.5522, "step": 705 }, { - "epoch": 0.21, - "grad_norm": 11.386521339416504, - "learning_rate": 1.8586749523904982e-05, - "loss": 2.3357, + "epoch": 0.09, + "grad_norm": 13.20335578918457, + "learning_rate": 1.941011588503535e-05, + "loss": 4.3, "step": 706 }, { - "epoch": 0.21, - "grad_norm": 10.972953796386719, - "learning_rate": 1.8584744913300592e-05, - "loss": 2.8289, + "epoch": 0.09, + "grad_norm": 25.170381546020508, + "learning_rate": 1.9409279169978665e-05, + "loss": 5.6345, "step": 707 }, { - "epoch": 0.21, - "grad_norm": 16.927831649780273, - "learning_rate": 1.8582740302696202e-05, - "loss": 3.086, + "epoch": 0.09, + "grad_norm": 20.28534507751465, + "learning_rate": 1.940844245492198e-05, + "loss": 4.7721, "step": 708 }, { - "epoch": 0.21, - "grad_norm": 14.967674255371094, - "learning_rate": 1.8580735692091813e-05, - "loss": 3.6209, + "epoch": 0.09, + "grad_norm": 22.23134422302246, + "learning_rate": 1.940760573986529e-05, + "loss": 3.1, "step": 709 }, { - "epoch": 0.21, - "grad_norm": 16.293773651123047, - "learning_rate": 1.8578731081487423e-05, - "loss": 4.2359, + "epoch": 0.09, + "grad_norm": 11.009767532348633, + "learning_rate": 1.9406769024808603e-05, + "loss": 0.5144, "step": 710 }, { - "epoch": 0.21, - "grad_norm": 14.0774564743042, - "learning_rate": 1.8576726470883033e-05, - "loss": 3.7756, + "epoch": 0.09, + "grad_norm": 15.88630199432373, + "learning_rate": 1.9405932309751916e-05, + "loss": 4.2522, "step": 711 }, { - "epoch": 0.21, - "grad_norm": 26.526830673217773, - "learning_rate": 1.8574721860278643e-05, - "loss": 3.1339, + "epoch": 0.09, + "grad_norm": 13.1906156539917, + "learning_rate": 1.940509559469523e-05, + "loss": 2.494, "step": 712 }, { - "epoch": 0.21, - "grad_norm": 13.169925689697266, - "learning_rate": 1.8572717249674253e-05, - "loss": 2.6322, + "epoch": 0.09, + "grad_norm": 21.883312225341797, + "learning_rate": 1.940425887963854e-05, + "loss": 2.4317, "step": 713 }, { - "epoch": 0.21, - "grad_norm": 13.879220962524414, - "learning_rate": 1.857071263906986e-05, - "loss": 2.5253, + "epoch": 0.09, + "grad_norm": 11.292802810668945, + "learning_rate": 1.9403422164581854e-05, + "loss": 2.2441, "step": 714 }, { - "epoch": 0.21, - "grad_norm": 15.33066177368164, - "learning_rate": 1.8568708028465473e-05, - "loss": 3.3604, + "epoch": 0.09, + "grad_norm": 13.797185897827148, + "learning_rate": 1.9402585449525168e-05, + "loss": 4.7378, "step": 715 }, { - "epoch": 0.22, - "grad_norm": 16.04281997680664, - "learning_rate": 1.8566703417861083e-05, - "loss": 2.824, + "epoch": 0.09, + "grad_norm": 22.5351505279541, + "learning_rate": 1.9401748734468478e-05, + "loss": 4.5674, "step": 716 }, { - "epoch": 0.22, - "grad_norm": 13.022286415100098, - "learning_rate": 1.856469880725669e-05, - "loss": 2.6692, + "epoch": 0.09, + "grad_norm": 8.449362754821777, + "learning_rate": 1.940091201941179e-05, + "loss": 1.1736, "step": 717 }, { - "epoch": 0.22, - "grad_norm": 14.34041976928711, - "learning_rate": 1.8562694196652303e-05, - "loss": 3.0903, + "epoch": 0.09, + "grad_norm": 13.076533317565918, + "learning_rate": 1.9400075304355105e-05, + "loss": 3.7764, "step": 718 }, { - "epoch": 0.22, - "grad_norm": 17.68020248413086, - "learning_rate": 1.856068958604791e-05, - "loss": 3.4143, + "epoch": 0.09, + "grad_norm": 14.160146713256836, + "learning_rate": 1.9399238589298415e-05, + "loss": 2.7199, "step": 719 }, { - "epoch": 0.22, - "grad_norm": 13.933920860290527, - "learning_rate": 1.855868497544352e-05, - "loss": 3.2975, - "step": 720 - }, - { - "epoch": 0.22, - "eval_loss": 0.7311321496963501, - "eval_runtime": 43.9321, - "eval_samples_per_second": 33.666, - "eval_steps_per_second": 33.666, + "epoch": 0.09, + "grad_norm": 11.873703956604004, + "learning_rate": 1.939840187424173e-05, + "loss": 2.8223, "step": 720 }, { - "epoch": 0.22, - "grad_norm": 14.72537612915039, - "learning_rate": 1.8556680364839134e-05, - "loss": 3.9612, + "epoch": 0.09, + "grad_norm": 12.563404083251953, + "learning_rate": 1.9397565159185043e-05, + "loss": 2.5746, "step": 721 }, { - "epoch": 0.22, - "grad_norm": 26.851642608642578, - "learning_rate": 1.855467575423474e-05, - "loss": 3.4687, + "epoch": 0.09, + "grad_norm": 10.250195503234863, + "learning_rate": 1.9396728444128353e-05, + "loss": 2.0382, "step": 722 }, { - "epoch": 0.22, - "grad_norm": 16.372344970703125, - "learning_rate": 1.855267114363035e-05, - "loss": 3.7605, + "epoch": 0.09, + "grad_norm": 7.887467384338379, + "learning_rate": 1.9395891729071667e-05, + "loss": 0.775, "step": 723 }, { - "epoch": 0.22, - "grad_norm": 17.88177490234375, - "learning_rate": 1.855066653302596e-05, - "loss": 3.4436, + "epoch": 0.09, + "grad_norm": 16.83657455444336, + "learning_rate": 1.9395055014014977e-05, + "loss": 5.009, "step": 724 }, { - "epoch": 0.22, - "grad_norm": 16.162473678588867, - "learning_rate": 1.854866192242157e-05, - "loss": 3.4066, + "epoch": 0.09, + "grad_norm": 11.711037635803223, + "learning_rate": 1.939421829895829e-05, + "loss": 3.1593, "step": 725 }, { - "epoch": 0.22, - "grad_norm": 12.335894584655762, - "learning_rate": 1.854665731181718e-05, - "loss": 2.9006, + "epoch": 0.09, + "grad_norm": 17.11760711669922, + "learning_rate": 1.9393381583901604e-05, + "loss": 4.1511, "step": 726 }, { - "epoch": 0.22, - "grad_norm": 19.66415786743164, - "learning_rate": 1.854465270121279e-05, - "loss": 3.5752, + "epoch": 0.09, + "grad_norm": 18.830835342407227, + "learning_rate": 1.9392544868844914e-05, + "loss": 4.5674, "step": 727 }, { - "epoch": 0.22, - "grad_norm": 11.808516502380371, - "learning_rate": 1.85426480906084e-05, - "loss": 1.8741, + "epoch": 0.09, + "grad_norm": 13.822317123413086, + "learning_rate": 1.9391708153788228e-05, + "loss": 2.7104, "step": 728 }, { - "epoch": 0.22, - "grad_norm": 15.641440391540527, - "learning_rate": 1.854064348000401e-05, - "loss": 3.3391, + "epoch": 0.09, + "grad_norm": 11.05170726776123, + "learning_rate": 1.9390871438731542e-05, + "loss": 2.7689, "step": 729 }, { - "epoch": 0.22, - "grad_norm": 13.590608596801758, - "learning_rate": 1.853863886939962e-05, - "loss": 2.9265, + "epoch": 0.09, + "grad_norm": 13.445638656616211, + "learning_rate": 1.9390034723674852e-05, + "loss": 2.9387, "step": 730 }, { - "epoch": 0.22, - "grad_norm": 18.237276077270508, - "learning_rate": 1.853663425879523e-05, - "loss": 3.6717, + "epoch": 0.09, + "grad_norm": 22.369338989257812, + "learning_rate": 1.9389198008618166e-05, + "loss": 3.1684, "step": 731 }, { - "epoch": 0.22, - "grad_norm": 41.96309280395508, - "learning_rate": 1.853462964819084e-05, - "loss": 2.857, + "epoch": 0.09, + "grad_norm": 11.012389183044434, + "learning_rate": 1.938836129356148e-05, + "loss": 2.211, "step": 732 }, { - "epoch": 0.22, - "grad_norm": 15.107841491699219, - "learning_rate": 1.853262503758645e-05, - "loss": 3.9187, + "epoch": 0.09, + "grad_norm": 11.709720611572266, + "learning_rate": 1.9387524578504793e-05, + "loss": 2.822, "step": 733 }, { - "epoch": 0.22, - "grad_norm": 15.574593544006348, - "learning_rate": 1.853062042698206e-05, - "loss": 3.1606, + "epoch": 0.09, + "grad_norm": 13.748392105102539, + "learning_rate": 1.9386687863448103e-05, + "loss": 2.3774, "step": 734 }, { - "epoch": 0.22, - "grad_norm": 21.812761306762695, - "learning_rate": 1.852861581637767e-05, - "loss": 2.8953, + "epoch": 0.09, + "grad_norm": 17.709924697875977, + "learning_rate": 1.9385851148391417e-05, + "loss": 2.5742, "step": 735 }, { - "epoch": 0.22, - "grad_norm": 21.730745315551758, - "learning_rate": 1.8526611205773278e-05, - "loss": 3.3289, + "epoch": 0.09, + "grad_norm": 35.1605110168457, + "learning_rate": 1.938501443333473e-05, + "loss": 1.6553, "step": 736 }, { - "epoch": 0.22, - "grad_norm": 33.00816345214844, - "learning_rate": 1.852460659516889e-05, - "loss": 2.98, + "epoch": 0.09, + "grad_norm": 17.295238494873047, + "learning_rate": 1.938417771827804e-05, + "loss": 2.2971, "step": 737 }, { - "epoch": 0.22, - "grad_norm": 16.756553649902344, - "learning_rate": 1.8522601984564498e-05, - "loss": 3.0438, + "epoch": 0.09, + "grad_norm": 14.092886924743652, + "learning_rate": 1.9383341003221354e-05, + "loss": 2.3941, "step": 738 }, { - "epoch": 0.22, - "grad_norm": 24.259780883789062, - "learning_rate": 1.852059737396011e-05, - "loss": 3.9168, + "epoch": 0.09, + "grad_norm": 20.23340606689453, + "learning_rate": 1.9382504288164668e-05, + "loss": 1.5338, "step": 739 }, { - "epoch": 0.22, - "grad_norm": 10.961197853088379, - "learning_rate": 1.8518592763355722e-05, - "loss": 2.521, + "epoch": 0.09, + "grad_norm": 15.133199691772461, + "learning_rate": 1.9381667573107982e-05, + "loss": 1.0674, "step": 740 }, { - "epoch": 0.22, - "grad_norm": 11.985860824584961, - "learning_rate": 1.851658815275133e-05, - "loss": 3.6082, + "epoch": 0.09, + "grad_norm": 14.169252395629883, + "learning_rate": 1.9380830858051292e-05, + "loss": 2.9105, "step": 741 }, { - "epoch": 0.22, - "grad_norm": 13.038666725158691, - "learning_rate": 1.851458354214694e-05, - "loss": 2.8915, + "epoch": 0.09, + "grad_norm": 153.477783203125, + "learning_rate": 1.9379994142994606e-05, + "loss": 2.8067, "step": 742 }, { - "epoch": 0.22, - "grad_norm": 15.792445182800293, - "learning_rate": 1.851257893154255e-05, - "loss": 3.0136, + "epoch": 0.09, + "grad_norm": 7.675416946411133, + "learning_rate": 1.937915742793792e-05, + "loss": 2.2885, "step": 743 }, { - "epoch": 0.22, - "grad_norm": 11.698639869689941, - "learning_rate": 1.851057432093816e-05, - "loss": 2.9538, + "epoch": 0.09, + "grad_norm": 16.272003173828125, + "learning_rate": 1.937832071288123e-05, + "loss": 2.3131, "step": 744 }, { - "epoch": 0.22, - "grad_norm": 14.640687942504883, - "learning_rate": 1.850856971033377e-05, - "loss": 2.1114, + "epoch": 0.09, + "grad_norm": 104.13090515136719, + "learning_rate": 1.9377483997824543e-05, + "loss": 3.6301, "step": 745 }, { - "epoch": 0.22, - "grad_norm": 21.3159122467041, - "learning_rate": 1.850656509972938e-05, - "loss": 3.1929, + "epoch": 0.09, + "grad_norm": 19.843830108642578, + "learning_rate": 1.9376647282767857e-05, + "loss": 1.6103, "step": 746 }, { - "epoch": 0.22, - "grad_norm": 15.516459465026855, - "learning_rate": 1.850456048912499e-05, - "loss": 3.4558, + "epoch": 0.09, + "grad_norm": 33.996437072753906, + "learning_rate": 1.9375810567711167e-05, + "loss": 5.4643, "step": 747 }, { - "epoch": 0.22, - "grad_norm": 12.31661605834961, - "learning_rate": 1.85025558785206e-05, - "loss": 3.5311, + "epoch": 0.09, + "grad_norm": 9.253028869628906, + "learning_rate": 1.937497385265448e-05, + "loss": 1.7174, "step": 748 }, { - "epoch": 0.23, - "grad_norm": 17.94566535949707, - "learning_rate": 1.850055126791621e-05, - "loss": 3.6713, + "epoch": 0.09, + "grad_norm": 19.09623908996582, + "learning_rate": 1.937413713759779e-05, + "loss": 2.9651, "step": 749 }, { - "epoch": 0.23, - "grad_norm": 13.146416664123535, - "learning_rate": 1.849854665731182e-05, - "loss": 2.0929, + "epoch": 0.09, + "grad_norm": 16.510766983032227, + "learning_rate": 1.9373300422541105e-05, + "loss": 5.6302, "step": 750 }, { - "epoch": 0.23, - "grad_norm": 15.036728858947754, - "learning_rate": 1.849654204670743e-05, - "loss": 3.2656, + "epoch": 0.09, + "grad_norm": 13.26391315460205, + "learning_rate": 1.937246370748442e-05, + "loss": 2.3617, "step": 751 }, { - "epoch": 0.23, - "grad_norm": 12.377967834472656, - "learning_rate": 1.849453743610304e-05, - "loss": 2.811, + "epoch": 0.09, + "grad_norm": 16.245691299438477, + "learning_rate": 1.937162699242773e-05, + "loss": 2.2042, "step": 752 }, { - "epoch": 0.23, - "grad_norm": 48.185482025146484, - "learning_rate": 1.849253282549865e-05, - "loss": 4.3531, + "epoch": 0.09, + "grad_norm": 14.730291366577148, + "learning_rate": 1.9370790277371042e-05, + "loss": 4.1242, "step": 753 }, { - "epoch": 0.23, - "grad_norm": 16.886388778686523, - "learning_rate": 1.849052821489426e-05, - "loss": 3.3402, + "epoch": 0.09, + "grad_norm": 15.235692977905273, + "learning_rate": 1.9369953562314356e-05, + "loss": 2.8035, "step": 754 }, { - "epoch": 0.23, - "grad_norm": 10.66983699798584, - "learning_rate": 1.8488523604289866e-05, - "loss": 2.5462, + "epoch": 0.09, + "grad_norm": 8.804192543029785, + "learning_rate": 1.9369116847257666e-05, + "loss": 3.199, "step": 755 }, { - "epoch": 0.23, - "grad_norm": 13.7207670211792, - "learning_rate": 1.848651899368548e-05, - "loss": 4.2536, + "epoch": 0.09, + "grad_norm": 14.533276557922363, + "learning_rate": 1.936828013220098e-05, + "loss": 4.2, "step": 756 }, { - "epoch": 0.23, - "grad_norm": 11.786395072937012, - "learning_rate": 1.8484514383081086e-05, - "loss": 3.3007, + "epoch": 0.1, + "grad_norm": 12.746406555175781, + "learning_rate": 1.9367443417144293e-05, + "loss": 2.9343, "step": 757 }, { - "epoch": 0.23, - "grad_norm": 19.73274803161621, - "learning_rate": 1.8482509772476697e-05, - "loss": 3.2301, + "epoch": 0.1, + "grad_norm": 9.367931365966797, + "learning_rate": 1.9366606702087604e-05, + "loss": 1.7168, "step": 758 }, { - "epoch": 0.23, - "grad_norm": 15.213275909423828, - "learning_rate": 1.848050516187231e-05, - "loss": 3.0059, + "epoch": 0.1, + "grad_norm": 17.672361373901367, + "learning_rate": 1.9365769987030917e-05, + "loss": 4.2855, "step": 759 }, { - "epoch": 0.23, - "grad_norm": 19.37066078186035, - "learning_rate": 1.8478500551267917e-05, - "loss": 2.8112, + "epoch": 0.1, + "grad_norm": 24.997514724731445, + "learning_rate": 1.936493327197423e-05, + "loss": 4.1359, "step": 760 }, { - "epoch": 0.23, - "grad_norm": 48.94891357421875, - "learning_rate": 1.8476495940663527e-05, - "loss": 4.0552, + "epoch": 0.1, + "grad_norm": 13.02668285369873, + "learning_rate": 1.9364096556917545e-05, + "loss": 2.802, "step": 761 }, { - "epoch": 0.23, - "grad_norm": 11.666791915893555, - "learning_rate": 1.8474491330059137e-05, - "loss": 3.0329, + "epoch": 0.1, + "grad_norm": 16.64696502685547, + "learning_rate": 1.9363259841860855e-05, + "loss": 2.6909, "step": 762 }, { - "epoch": 0.23, - "grad_norm": 14.530978202819824, - "learning_rate": 1.8472486719454747e-05, - "loss": 2.9601, + "epoch": 0.1, + "grad_norm": 10.787168502807617, + "learning_rate": 1.936242312680417e-05, + "loss": 1.8418, "step": 763 }, { - "epoch": 0.23, - "grad_norm": 15.476301193237305, - "learning_rate": 1.8470482108850357e-05, - "loss": 3.8685, + "epoch": 0.1, + "grad_norm": 24.767377853393555, + "learning_rate": 1.9361586411747482e-05, + "loss": 5.6238, "step": 764 }, { - "epoch": 0.23, - "grad_norm": 34.36149597167969, - "learning_rate": 1.8468477498245967e-05, - "loss": 3.6309, + "epoch": 0.1, + "grad_norm": 12.260977745056152, + "learning_rate": 1.9360749696690792e-05, + "loss": 3.3651, "step": 765 }, { - "epoch": 0.23, - "grad_norm": 13.753215789794922, - "learning_rate": 1.8466472887641577e-05, - "loss": 2.5185, + "epoch": 0.1, + "grad_norm": 15.056289672851562, + "learning_rate": 1.9359912981634106e-05, + "loss": 2.7135, "step": 766 }, { - "epoch": 0.23, - "grad_norm": 14.692520141601562, - "learning_rate": 1.8464468277037187e-05, - "loss": 3.4938, + "epoch": 0.1, + "grad_norm": 27.636878967285156, + "learning_rate": 1.935907626657742e-05, + "loss": 3.2168, "step": 767 }, { - "epoch": 0.23, - "grad_norm": 19.021665573120117, - "learning_rate": 1.8462463666432797e-05, - "loss": 2.4541, + "epoch": 0.1, + "grad_norm": 13.80523681640625, + "learning_rate": 1.9358239551520733e-05, + "loss": 4.6771, "step": 768 }, { - "epoch": 0.23, - "grad_norm": 15.532613754272461, - "learning_rate": 1.8460459055828407e-05, - "loss": 3.5129, + "epoch": 0.1, + "grad_norm": 12.087339401245117, + "learning_rate": 1.9357402836464044e-05, + "loss": 3.8014, "step": 769 }, { - "epoch": 0.23, - "grad_norm": 19.325273513793945, - "learning_rate": 1.8458454445224018e-05, - "loss": 3.8114, + "epoch": 0.1, + "grad_norm": 13.019579887390137, + "learning_rate": 1.9356566121407357e-05, + "loss": 2.8844, "step": 770 }, { - "epoch": 0.23, - "grad_norm": 19.3370361328125, - "learning_rate": 1.8456449834619628e-05, - "loss": 3.7453, + "epoch": 0.1, + "grad_norm": 10.89457893371582, + "learning_rate": 1.935572940635067e-05, + "loss": 2.0939, "step": 771 }, { - "epoch": 0.23, - "grad_norm": 17.998004913330078, - "learning_rate": 1.8454445224015238e-05, - "loss": 4.1352, + "epoch": 0.1, + "grad_norm": 13.409041404724121, + "learning_rate": 1.935489269129398e-05, + "loss": 3.9463, "step": 772 }, { - "epoch": 0.23, - "grad_norm": 14.474387168884277, - "learning_rate": 1.8452440613410848e-05, - "loss": 3.0304, + "epoch": 0.1, + "grad_norm": 10.823927879333496, + "learning_rate": 1.9354055976237295e-05, + "loss": 3.0251, "step": 773 }, { - "epoch": 0.23, - "grad_norm": 12.42952823638916, - "learning_rate": 1.8450436002806454e-05, - "loss": 2.8432, + "epoch": 0.1, + "grad_norm": 14.99421501159668, + "learning_rate": 1.935321926118061e-05, + "loss": 1.5831, "step": 774 }, { - "epoch": 0.23, - "grad_norm": 15.06114387512207, - "learning_rate": 1.8448431392202068e-05, - "loss": 2.9204, + "epoch": 0.1, + "grad_norm": 14.503152847290039, + "learning_rate": 1.935238254612392e-05, + "loss": 3.4288, "step": 775 }, { - "epoch": 0.23, - "grad_norm": 19.3477840423584, - "learning_rate": 1.8446426781597678e-05, - "loss": 2.6387, + "epoch": 0.1, + "grad_norm": 12.275667190551758, + "learning_rate": 1.9351545831067232e-05, + "loss": 3.406, "step": 776 }, { - "epoch": 0.23, - "grad_norm": 19.3367862701416, - "learning_rate": 1.8444422170993285e-05, - "loss": 3.7189, + "epoch": 0.1, + "grad_norm": 16.560117721557617, + "learning_rate": 1.9350709116010543e-05, + "loss": 4.3817, "step": 777 }, { - "epoch": 0.23, - "grad_norm": 18.679487228393555, - "learning_rate": 1.8442417560388898e-05, - "loss": 2.7699, + "epoch": 0.1, + "grad_norm": 16.34395980834961, + "learning_rate": 1.9349872400953856e-05, + "loss": 1.9061, "step": 778 }, { - "epoch": 0.23, - "grad_norm": 17.268980026245117, - "learning_rate": 1.8440412949784505e-05, - "loss": 3.1304, + "epoch": 0.1, + "grad_norm": 11.401074409484863, + "learning_rate": 1.934903568589717e-05, + "loss": 2.8721, "step": 779 }, { - "epoch": 0.23, - "grad_norm": 12.782285690307617, - "learning_rate": 1.8438408339180115e-05, - "loss": 2.1415, + "epoch": 0.1, + "grad_norm": 18.35641098022461, + "learning_rate": 1.934819897084048e-05, + "loss": 4.7659, "step": 780 }, { - "epoch": 0.23, - "grad_norm": 13.401495933532715, - "learning_rate": 1.8436403728575725e-05, - "loss": 2.8394, + "epoch": 0.1, + "grad_norm": 14.767510414123535, + "learning_rate": 1.9347362255783794e-05, + "loss": 4.2284, "step": 781 }, { - "epoch": 0.24, - "grad_norm": 17.430240631103516, - "learning_rate": 1.8434399117971335e-05, - "loss": 3.3336, + "epoch": 0.1, + "grad_norm": 42.032958984375, + "learning_rate": 1.9346525540727108e-05, + "loss": 2.738, "step": 782 }, { - "epoch": 0.24, - "grad_norm": 50.752410888671875, - "learning_rate": 1.8432394507366945e-05, - "loss": 3.1897, + "epoch": 0.1, + "grad_norm": 15.29719352722168, + "learning_rate": 1.9345688825670418e-05, + "loss": 3.3126, "step": 783 }, { - "epoch": 0.24, - "grad_norm": 14.428933143615723, - "learning_rate": 1.8430389896762555e-05, - "loss": 2.8947, + "epoch": 0.1, + "grad_norm": 29.157716751098633, + "learning_rate": 1.934485211061373e-05, + "loss": 5.0679, "step": 784 }, { - "epoch": 0.24, - "grad_norm": 60.45037078857422, - "learning_rate": 1.8428385286158165e-05, - "loss": 2.7407, + "epoch": 0.1, + "grad_norm": 11.626741409301758, + "learning_rate": 1.9344015395557045e-05, + "loss": 2.2906, "step": 785 }, { - "epoch": 0.24, - "grad_norm": 15.153972625732422, - "learning_rate": 1.8426380675553776e-05, - "loss": 3.9118, + "epoch": 0.1, + "grad_norm": 14.68950366973877, + "learning_rate": 1.9343178680500355e-05, + "loss": 4.9856, "step": 786 }, { - "epoch": 0.24, - "grad_norm": 13.527739524841309, - "learning_rate": 1.8424376064949386e-05, - "loss": 2.8687, + "epoch": 0.1, + "grad_norm": 9.994388580322266, + "learning_rate": 1.934234196544367e-05, + "loss": 4.3106, "step": 787 }, { - "epoch": 0.24, - "grad_norm": 13.59441089630127, - "learning_rate": 1.8422371454344996e-05, - "loss": 3.4082, + "epoch": 0.1, + "grad_norm": 9.545372009277344, + "learning_rate": 1.9341505250386983e-05, + "loss": 2.5157, "step": 788 }, { - "epoch": 0.24, - "grad_norm": 18.013368606567383, - "learning_rate": 1.8420366843740606e-05, - "loss": 3.1523, + "epoch": 0.1, + "grad_norm": 12.77479362487793, + "learning_rate": 1.9340668535330296e-05, + "loss": 1.9969, "step": 789 }, { - "epoch": 0.24, - "grad_norm": 23.031423568725586, - "learning_rate": 1.8418362233136216e-05, - "loss": 3.8803, + "epoch": 0.1, + "grad_norm": 16.627382278442383, + "learning_rate": 1.9339831820273607e-05, + "loss": 3.4785, "step": 790 }, { - "epoch": 0.24, - "grad_norm": 12.914963722229004, - "learning_rate": 1.8416357622531823e-05, - "loss": 3.0994, + "epoch": 0.1, + "grad_norm": 27.862689971923828, + "learning_rate": 1.933899510521692e-05, + "loss": 4.5648, "step": 791 }, { - "epoch": 0.24, - "grad_norm": 25.980188369750977, - "learning_rate": 1.8414353011927436e-05, - "loss": 1.9718, + "epoch": 0.1, + "grad_norm": 15.007376670837402, + "learning_rate": 1.9338158390160234e-05, + "loss": 1.4494, "step": 792 }, { - "epoch": 0.24, - "grad_norm": 14.725564002990723, - "learning_rate": 1.8412348401323043e-05, - "loss": 2.472, + "epoch": 0.1, + "grad_norm": 12.32243537902832, + "learning_rate": 1.9337321675103544e-05, + "loss": 4.0946, "step": 793 }, { - "epoch": 0.24, - "grad_norm": 13.297785758972168, - "learning_rate": 1.8410343790718653e-05, - "loss": 4.0097, + "epoch": 0.1, + "grad_norm": 13.405266761779785, + "learning_rate": 1.9336484960046858e-05, + "loss": 3.2625, "step": 794 }, { - "epoch": 0.24, - "grad_norm": 15.004578590393066, - "learning_rate": 1.8408339180114266e-05, - "loss": 2.8653, + "epoch": 0.1, + "grad_norm": 23.589693069458008, + "learning_rate": 1.933564824499017e-05, + "loss": 5.7331, "step": 795 }, { - "epoch": 0.24, - "grad_norm": 19.27330780029297, - "learning_rate": 1.8406334569509873e-05, - "loss": 3.1684, + "epoch": 0.1, + "grad_norm": 35.675025939941406, + "learning_rate": 1.9334811529933485e-05, + "loss": 2.3257, "step": 796 }, { - "epoch": 0.24, - "grad_norm": 25.846576690673828, - "learning_rate": 1.8404329958905483e-05, - "loss": 2.6346, + "epoch": 0.1, + "grad_norm": 12.68811321258545, + "learning_rate": 1.9333974814876795e-05, + "loss": 4.5812, "step": 797 }, { - "epoch": 0.24, - "grad_norm": 9.999025344848633, - "learning_rate": 1.8402325348301093e-05, - "loss": 1.9912, + "epoch": 0.1, + "grad_norm": 22.3652286529541, + "learning_rate": 1.933313809982011e-05, + "loss": 3.5698, "step": 798 }, { - "epoch": 0.24, - "grad_norm": 11.801385879516602, - "learning_rate": 1.8400320737696703e-05, - "loss": 3.3957, + "epoch": 0.1, + "grad_norm": 20.88276481628418, + "learning_rate": 1.9332301384763423e-05, + "loss": 5.2997, "step": 799 }, { - "epoch": 0.24, - "grad_norm": 12.033352851867676, - "learning_rate": 1.8398316127092313e-05, - "loss": 2.62, + "epoch": 0.1, + "grad_norm": 7.848814010620117, + "learning_rate": 1.9331464669706733e-05, + "loss": 4.3608, "step": 800 }, { - "epoch": 0.24, - "grad_norm": 12.087538719177246, - "learning_rate": 1.8396311516487923e-05, - "loss": 3.2451, + "epoch": 0.1, + "eval_loss": 0.4046749472618103, + "eval_runtime": 95.7577, + "eval_samples_per_second": 36.989, + "eval_steps_per_second": 36.989, + "step": 800 + }, + { + "epoch": 0.1, + "grad_norm": 10.814807891845703, + "learning_rate": 1.9330627954650047e-05, + "loss": 1.7813, "step": 801 }, { - "epoch": 0.24, - "grad_norm": 9.926271438598633, - "learning_rate": 1.8394306905883533e-05, - "loss": 2.5499, + "epoch": 0.1, + "grad_norm": 10.413105010986328, + "learning_rate": 1.9329791239593357e-05, + "loss": 3.4774, "step": 802 }, { - "epoch": 0.24, - "grad_norm": 15.592676162719727, - "learning_rate": 1.8392302295279144e-05, - "loss": 3.7471, + "epoch": 0.1, + "grad_norm": 21.069353103637695, + "learning_rate": 1.932895452453667e-05, + "loss": 2.5785, "step": 803 }, { - "epoch": 0.24, - "grad_norm": 16.60494613647461, - "learning_rate": 1.8390297684674754e-05, - "loss": 3.218, + "epoch": 0.1, + "grad_norm": 12.616759300231934, + "learning_rate": 1.9328117809479984e-05, + "loss": 2.8462, "step": 804 }, { - "epoch": 0.24, - "grad_norm": 20.184545516967773, - "learning_rate": 1.8388293074070364e-05, - "loss": 2.9737, + "epoch": 0.1, + "grad_norm": 23.985736846923828, + "learning_rate": 1.9327281094423294e-05, + "loss": 4.0714, "step": 805 }, { - "epoch": 0.24, - "grad_norm": 12.5513334274292, - "learning_rate": 1.8386288463465974e-05, - "loss": 2.3381, + "epoch": 0.1, + "grad_norm": 21.885786056518555, + "learning_rate": 1.9326444379366608e-05, + "loss": 3.4187, "step": 806 }, { - "epoch": 0.24, - "grad_norm": 19.2335262298584, - "learning_rate": 1.8384283852861584e-05, - "loss": 3.5287, + "epoch": 0.1, + "grad_norm": 15.95827865600586, + "learning_rate": 1.9325607664309918e-05, + "loss": 3.7852, "step": 807 }, { - "epoch": 0.24, - "grad_norm": 16.872840881347656, - "learning_rate": 1.8382279242257194e-05, - "loss": 3.1892, + "epoch": 0.1, + "grad_norm": 11.442026138305664, + "learning_rate": 1.9324770949253232e-05, + "loss": 4.8199, "step": 808 }, { - "epoch": 0.24, - "grad_norm": 22.987356185913086, - "learning_rate": 1.8380274631652804e-05, - "loss": 2.534, + "epoch": 0.1, + "grad_norm": 12.986435890197754, + "learning_rate": 1.9323934234196546e-05, + "loss": 2.5731, "step": 809 }, { - "epoch": 0.24, - "grad_norm": 18.01645851135254, - "learning_rate": 1.837827002104841e-05, - "loss": 3.5278, + "epoch": 0.1, + "grad_norm": 9.10637092590332, + "learning_rate": 1.932309751913986e-05, + "loss": 2.5007, "step": 810 }, { - "epoch": 0.24, - "grad_norm": 23.722457885742188, - "learning_rate": 1.8376265410444024e-05, - "loss": 3.9305, + "epoch": 0.1, + "grad_norm": 15.682181358337402, + "learning_rate": 1.932226080408317e-05, + "loss": 3.6707, "step": 811 }, { - "epoch": 0.24, - "grad_norm": 11.13327693939209, - "learning_rate": 1.837426079983963e-05, - "loss": 3.9406, + "epoch": 0.1, + "grad_norm": 9.2945556640625, + "learning_rate": 1.9321424089026483e-05, + "loss": 2.9492, "step": 812 }, { - "epoch": 0.24, - "grad_norm": 18.1392765045166, - "learning_rate": 1.837225618923524e-05, - "loss": 2.3532, + "epoch": 0.1, + "grad_norm": 19.119644165039062, + "learning_rate": 1.9320587373969797e-05, + "loss": 5.1159, "step": 813 }, { - "epoch": 0.24, - "grad_norm": 17.197660446166992, - "learning_rate": 1.8370251578630854e-05, - "loss": 3.8571, + "epoch": 0.1, + "grad_norm": 12.646649360656738, + "learning_rate": 1.9319750658913107e-05, + "loss": 3.6129, "step": 814 }, { - "epoch": 0.25, - "grad_norm": 16.233530044555664, - "learning_rate": 1.836824696802646e-05, - "loss": 2.8763, + "epoch": 0.1, + "grad_norm": 14.362308502197266, + "learning_rate": 1.931891394385642e-05, + "loss": 3.9929, "step": 815 }, { - "epoch": 0.25, - "grad_norm": 104.82502746582031, - "learning_rate": 1.836624235742207e-05, - "loss": 3.3302, + "epoch": 0.1, + "grad_norm": 13.062503814697266, + "learning_rate": 1.9318077228799734e-05, + "loss": 3.6153, "step": 816 }, { - "epoch": 0.25, - "grad_norm": 14.149030685424805, - "learning_rate": 1.836423774681768e-05, - "loss": 3.2956, + "epoch": 0.1, + "grad_norm": 17.66871452331543, + "learning_rate": 1.9317240513743048e-05, + "loss": 2.5809, "step": 817 }, { - "epoch": 0.25, - "grad_norm": 13.704061508178711, - "learning_rate": 1.836223313621329e-05, - "loss": 2.7446, + "epoch": 0.1, + "grad_norm": 39.23408508300781, + "learning_rate": 1.9316403798686358e-05, + "loss": 2.9452, "step": 818 }, { - "epoch": 0.25, - "grad_norm": 14.284625053405762, - "learning_rate": 1.83602285256089e-05, - "loss": 2.9887, + "epoch": 0.1, + "grad_norm": 20.960430145263672, + "learning_rate": 1.9315567083629672e-05, + "loss": 4.0357, "step": 819 }, { - "epoch": 0.25, - "grad_norm": 20.733549118041992, - "learning_rate": 1.835822391500451e-05, - "loss": 3.3273, + "epoch": 0.1, + "grad_norm": 11.633240699768066, + "learning_rate": 1.9314730368572986e-05, + "loss": 2.0731, "step": 820 }, { - "epoch": 0.25, - "grad_norm": 27.305700302124023, - "learning_rate": 1.835621930440012e-05, - "loss": 3.061, + "epoch": 0.1, + "grad_norm": 10.769817352294922, + "learning_rate": 1.9313893653516296e-05, + "loss": 1.9501, "step": 821 }, { - "epoch": 0.25, - "grad_norm": 11.420647621154785, - "learning_rate": 1.8354214693795732e-05, - "loss": 3.1825, + "epoch": 0.1, + "grad_norm": 16.530324935913086, + "learning_rate": 1.931305693845961e-05, + "loss": 3.8412, "step": 822 }, { - "epoch": 0.25, - "grad_norm": 13.486950874328613, - "learning_rate": 1.8352210083191342e-05, - "loss": 3.1205, + "epoch": 0.1, + "grad_norm": 10.702472686767578, + "learning_rate": 1.9312220223402923e-05, + "loss": 2.675, "step": 823 }, { - "epoch": 0.25, - "grad_norm": 21.656282424926758, - "learning_rate": 1.8350205472586952e-05, - "loss": 2.7608, + "epoch": 0.1, + "grad_norm": 26.523433685302734, + "learning_rate": 1.9311383508346237e-05, + "loss": 1.1108, "step": 824 }, { - "epoch": 0.25, - "grad_norm": 31.164608001708984, - "learning_rate": 1.8348200861982562e-05, - "loss": 3.3097, + "epoch": 0.1, + "grad_norm": 21.036243438720703, + "learning_rate": 1.9310546793289547e-05, + "loss": 3.4756, "step": 825 }, { - "epoch": 0.25, - "grad_norm": 36.02324676513672, - "learning_rate": 1.8346196251378172e-05, - "loss": 3.2008, + "epoch": 0.1, + "grad_norm": 18.37494468688965, + "learning_rate": 1.930971007823286e-05, + "loss": 4.5667, "step": 826 }, { - "epoch": 0.25, - "grad_norm": 15.944084167480469, - "learning_rate": 1.8344191640773782e-05, - "loss": 2.9221, + "epoch": 0.1, + "grad_norm": 12.04539966583252, + "learning_rate": 1.9308873363176174e-05, + "loss": 1.546, "step": 827 }, { - "epoch": 0.25, - "grad_norm": 14.965399742126465, - "learning_rate": 1.8342187030169392e-05, - "loss": 3.1973, + "epoch": 0.1, + "grad_norm": 12.4345064163208, + "learning_rate": 1.9308036648119485e-05, + "loss": 3.3334, "step": 828 }, { - "epoch": 0.25, - "grad_norm": 20.134925842285156, - "learning_rate": 1.8340182419565e-05, - "loss": 3.521, + "epoch": 0.1, + "grad_norm": 69.78355407714844, + "learning_rate": 1.9307199933062798e-05, + "loss": 5.0178, "step": 829 }, { - "epoch": 0.25, - "grad_norm": 17.916906356811523, - "learning_rate": 1.8338177808960612e-05, - "loss": 3.8329, + "epoch": 0.1, + "grad_norm": 15.295698165893555, + "learning_rate": 1.930636321800611e-05, + "loss": 2.6952, "step": 830 }, { - "epoch": 0.25, - "grad_norm": 22.273197174072266, - "learning_rate": 1.8336173198356223e-05, - "loss": 3.0719, + "epoch": 0.1, + "grad_norm": 11.989553451538086, + "learning_rate": 1.9305526502949422e-05, + "loss": 2.7478, "step": 831 }, { - "epoch": 0.25, - "grad_norm": 18.33119773864746, - "learning_rate": 1.833416858775183e-05, - "loss": 3.1312, + "epoch": 0.1, + "grad_norm": 14.243673324584961, + "learning_rate": 1.9304689787892736e-05, + "loss": 4.1795, "step": 832 }, { - "epoch": 0.25, - "grad_norm": 17.575429916381836, - "learning_rate": 1.8332163977147443e-05, - "loss": 2.7529, + "epoch": 0.1, + "grad_norm": 15.110532760620117, + "learning_rate": 1.9303853072836046e-05, + "loss": 2.8877, "step": 833 }, { - "epoch": 0.25, - "grad_norm": 16.5457820892334, - "learning_rate": 1.833015936654305e-05, - "loss": 2.3905, + "epoch": 0.1, + "grad_norm": 12.30280590057373, + "learning_rate": 1.930301635777936e-05, + "loss": 3.3959, "step": 834 }, { - "epoch": 0.25, - "grad_norm": 19.157955169677734, - "learning_rate": 1.832815475593866e-05, - "loss": 3.6365, + "epoch": 0.1, + "grad_norm": 10.67061996459961, + "learning_rate": 1.930217964272267e-05, + "loss": 3.561, "step": 835 }, { - "epoch": 0.25, - "grad_norm": 16.239450454711914, - "learning_rate": 1.832615014533427e-05, - "loss": 2.864, + "epoch": 0.1, + "grad_norm": 16.788816452026367, + "learning_rate": 1.9301342927665984e-05, + "loss": 3.3843, "step": 836 }, { - "epoch": 0.25, - "grad_norm": 24.656652450561523, - "learning_rate": 1.832414553472988e-05, - "loss": 3.4563, + "epoch": 0.11, + "grad_norm": 14.812087059020996, + "learning_rate": 1.9300506212609297e-05, + "loss": 1.2697, "step": 837 }, { - "epoch": 0.25, - "grad_norm": 21.09084129333496, - "learning_rate": 1.832214092412549e-05, - "loss": 3.3509, + "epoch": 0.11, + "grad_norm": 17.998449325561523, + "learning_rate": 1.929966949755261e-05, + "loss": 3.3954, "step": 838 }, { - "epoch": 0.25, - "grad_norm": 14.556492805480957, - "learning_rate": 1.83201363135211e-05, - "loss": 3.0753, + "epoch": 0.11, + "grad_norm": 10.879467010498047, + "learning_rate": 1.929883278249592e-05, + "loss": 3.1961, "step": 839 }, { - "epoch": 0.25, - "grad_norm": 19.413835525512695, - "learning_rate": 1.831813170291671e-05, - "loss": 2.9689, - "step": 840 - }, - { - "epoch": 0.25, - "eval_loss": 0.5996547937393188, - "eval_runtime": 43.6208, - "eval_samples_per_second": 33.906, - "eval_steps_per_second": 33.906, + "epoch": 0.11, + "grad_norm": 13.773147583007812, + "learning_rate": 1.9297996067439235e-05, + "loss": 5.1689, "step": 840 }, { - "epoch": 0.25, - "grad_norm": 17.024858474731445, - "learning_rate": 1.831612709231232e-05, - "loss": 4.6989, + "epoch": 0.11, + "grad_norm": 19.973291397094727, + "learning_rate": 1.929715935238255e-05, + "loss": 4.2266, "step": 841 }, { - "epoch": 0.25, - "grad_norm": 12.914769172668457, - "learning_rate": 1.831412248170793e-05, - "loss": 3.6312, + "epoch": 0.11, + "grad_norm": 14.159008979797363, + "learning_rate": 1.929632263732586e-05, + "loss": 5.0419, "step": 842 }, { - "epoch": 0.25, - "grad_norm": 89.747802734375, - "learning_rate": 1.831211787110354e-05, - "loss": 2.7291, + "epoch": 0.11, + "grad_norm": 9.060222625732422, + "learning_rate": 1.9295485922269172e-05, + "loss": 1.2281, "step": 843 }, { - "epoch": 0.25, - "grad_norm": 15.173033714294434, - "learning_rate": 1.831011326049915e-05, - "loss": 3.1805, + "epoch": 0.11, + "grad_norm": 18.025102615356445, + "learning_rate": 1.9294649207212486e-05, + "loss": 1.9924, "step": 844 }, { - "epoch": 0.25, - "grad_norm": 40.06729507446289, - "learning_rate": 1.830810864989476e-05, - "loss": 4.5534, + "epoch": 0.11, + "grad_norm": 20.9349308013916, + "learning_rate": 1.92938124921558e-05, + "loss": 2.8242, "step": 845 }, { - "epoch": 0.25, - "grad_norm": 13.402242660522461, - "learning_rate": 1.830610403929037e-05, - "loss": 2.946, + "epoch": 0.11, + "grad_norm": 12.945772171020508, + "learning_rate": 1.929297577709911e-05, + "loss": 5.5559, "step": 846 }, { - "epoch": 0.25, - "grad_norm": 18.814979553222656, - "learning_rate": 1.830409942868598e-05, - "loss": 3.3728, + "epoch": 0.11, + "grad_norm": 37.96195983886719, + "learning_rate": 1.9292139062042424e-05, + "loss": 2.7299, "step": 847 }, { - "epoch": 0.25, - "grad_norm": 12.975325584411621, - "learning_rate": 1.8302094818081587e-05, - "loss": 2.8497, + "epoch": 0.11, + "grad_norm": 10.461493492126465, + "learning_rate": 1.9291302346985737e-05, + "loss": 3.2226, "step": 848 }, { - "epoch": 0.26, - "grad_norm": 9.28498363494873, - "learning_rate": 1.83000902074772e-05, - "loss": 2.4016, + "epoch": 0.11, + "grad_norm": 17.57033920288086, + "learning_rate": 1.9290465631929047e-05, + "loss": 3.6556, "step": 849 }, { - "epoch": 0.26, - "grad_norm": 15.675614356994629, - "learning_rate": 1.829808559687281e-05, - "loss": 2.7848, + "epoch": 0.11, + "grad_norm": 10.791287422180176, + "learning_rate": 1.928962891687236e-05, + "loss": 2.4405, "step": 850 }, { - "epoch": 0.26, - "grad_norm": 13.038680076599121, - "learning_rate": 1.8296080986268417e-05, - "loss": 3.4155, + "epoch": 0.11, + "grad_norm": 14.49622631072998, + "learning_rate": 1.9288792201815675e-05, + "loss": 5.1708, "step": 851 }, { - "epoch": 0.26, - "grad_norm": 11.621955871582031, - "learning_rate": 1.829407637566403e-05, - "loss": 2.8026, + "epoch": 0.11, + "grad_norm": 14.331541061401367, + "learning_rate": 1.928795548675899e-05, + "loss": 3.636, "step": 852 }, { - "epoch": 0.26, - "grad_norm": 11.502103805541992, - "learning_rate": 1.8292071765059638e-05, - "loss": 2.2732, + "epoch": 0.11, + "grad_norm": 11.339052200317383, + "learning_rate": 1.92871187717023e-05, + "loss": 2.8692, "step": 853 }, { - "epoch": 0.26, - "grad_norm": 27.41511344909668, - "learning_rate": 1.8290067154455248e-05, - "loss": 4.0694, + "epoch": 0.11, + "grad_norm": 26.334575653076172, + "learning_rate": 1.9286282056645612e-05, + "loss": 3.4792, "step": 854 }, { - "epoch": 0.26, - "grad_norm": 16.437061309814453, - "learning_rate": 1.8288062543850858e-05, - "loss": 3.0165, + "epoch": 0.11, + "grad_norm": 13.978888511657715, + "learning_rate": 1.9285445341588923e-05, + "loss": 2.6604, "step": 855 }, { - "epoch": 0.26, - "grad_norm": 14.086379051208496, - "learning_rate": 1.8286057933246468e-05, - "loss": 2.6271, + "epoch": 0.11, + "grad_norm": 12.099700927734375, + "learning_rate": 1.9284608626532236e-05, + "loss": 2.2098, "step": 856 }, { - "epoch": 0.26, - "grad_norm": 18.89293670654297, - "learning_rate": 1.8284053322642078e-05, - "loss": 3.2895, + "epoch": 0.11, + "grad_norm": 13.464815139770508, + "learning_rate": 1.928377191147555e-05, + "loss": 1.981, "step": 857 }, { - "epoch": 0.26, - "grad_norm": 14.886481285095215, - "learning_rate": 1.8282048712037688e-05, - "loss": 3.3004, + "epoch": 0.11, + "grad_norm": 40.965389251708984, + "learning_rate": 1.928293519641886e-05, + "loss": 4.6309, "step": 858 }, { - "epoch": 0.26, - "grad_norm": 21.090091705322266, - "learning_rate": 1.8280044101433298e-05, - "loss": 3.0523, + "epoch": 0.11, + "grad_norm": 13.609042167663574, + "learning_rate": 1.9282098481362174e-05, + "loss": 1.7419, "step": 859 }, { - "epoch": 0.26, - "grad_norm": 13.093547821044922, - "learning_rate": 1.8278039490828908e-05, - "loss": 2.3511, + "epoch": 0.11, + "grad_norm": 12.86327075958252, + "learning_rate": 1.9281261766305484e-05, + "loss": 3.5797, "step": 860 }, { - "epoch": 0.26, - "grad_norm": 13.202871322631836, - "learning_rate": 1.8276034880224518e-05, - "loss": 2.7941, + "epoch": 0.11, + "grad_norm": 11.691978454589844, + "learning_rate": 1.9280425051248798e-05, + "loss": 2.4073, "step": 861 }, { - "epoch": 0.26, - "grad_norm": 10.028319358825684, - "learning_rate": 1.827403026962013e-05, - "loss": 2.6855, + "epoch": 0.11, + "grad_norm": 14.143892288208008, + "learning_rate": 1.927958833619211e-05, + "loss": 3.1444, "step": 862 }, { - "epoch": 0.26, - "grad_norm": 14.725035667419434, - "learning_rate": 1.827202565901574e-05, - "loss": 3.3114, + "epoch": 0.11, + "grad_norm": 13.53709888458252, + "learning_rate": 1.927875162113542e-05, + "loss": 2.2037, "step": 863 }, { - "epoch": 0.26, - "grad_norm": 20.055673599243164, - "learning_rate": 1.827002104841135e-05, - "loss": 4.0679, + "epoch": 0.11, + "grad_norm": 18.455381393432617, + "learning_rate": 1.9277914906078735e-05, + "loss": 3.7846, "step": 864 }, { - "epoch": 0.26, - "grad_norm": 15.065644264221191, - "learning_rate": 1.8268016437806955e-05, - "loss": 2.8739, + "epoch": 0.11, + "grad_norm": 18.374515533447266, + "learning_rate": 1.927707819102205e-05, + "loss": 2.4646, "step": 865 }, { - "epoch": 0.26, - "grad_norm": 11.230770111083984, - "learning_rate": 1.826601182720257e-05, - "loss": 3.2987, + "epoch": 0.11, + "grad_norm": 11.123920440673828, + "learning_rate": 1.9276241475965363e-05, + "loss": 3.8492, "step": 866 }, { - "epoch": 0.26, - "grad_norm": 27.67386245727539, - "learning_rate": 1.8264007216598175e-05, - "loss": 3.0173, + "epoch": 0.11, + "grad_norm": 10.278346061706543, + "learning_rate": 1.9275404760908673e-05, + "loss": 1.2724, "step": 867 }, { - "epoch": 0.26, - "grad_norm": 18.033613204956055, - "learning_rate": 1.8262002605993785e-05, - "loss": 3.4757, + "epoch": 0.11, + "grad_norm": 15.350423812866211, + "learning_rate": 1.9274568045851986e-05, + "loss": 2.0491, "step": 868 }, { - "epoch": 0.26, - "grad_norm": 12.066390037536621, - "learning_rate": 1.82599979953894e-05, - "loss": 2.815, + "epoch": 0.11, + "grad_norm": 11.977092742919922, + "learning_rate": 1.92737313307953e-05, + "loss": 3.7265, "step": 869 }, { - "epoch": 0.26, - "grad_norm": 18.521556854248047, - "learning_rate": 1.8257993384785006e-05, - "loss": 4.0135, + "epoch": 0.11, + "grad_norm": 18.873859405517578, + "learning_rate": 1.927289461573861e-05, + "loss": 3.1926, "step": 870 }, { - "epoch": 0.26, - "grad_norm": 19.63759422302246, - "learning_rate": 1.825598877418062e-05, - "loss": 4.0507, + "epoch": 0.11, + "grad_norm": 15.339083671569824, + "learning_rate": 1.9272057900681924e-05, + "loss": 3.3854, "step": 871 }, { - "epoch": 0.26, - "grad_norm": 34.88851547241211, - "learning_rate": 1.8253984163576226e-05, - "loss": 4.7139, + "epoch": 0.11, + "grad_norm": 23.82866668701172, + "learning_rate": 1.9271221185625238e-05, + "loss": 4.5427, "step": 872 }, { - "epoch": 0.26, - "grad_norm": 15.407425880432129, - "learning_rate": 1.8251979552971836e-05, - "loss": 3.7188, + "epoch": 0.11, + "grad_norm": 16.461936950683594, + "learning_rate": 1.927038447056855e-05, + "loss": 3.9018, "step": 873 }, { - "epoch": 0.26, - "grad_norm": 12.915898323059082, - "learning_rate": 1.824997494236745e-05, - "loss": 2.5801, + "epoch": 0.11, + "grad_norm": 19.81767463684082, + "learning_rate": 1.926954775551186e-05, + "loss": 4.0158, "step": 874 }, { - "epoch": 0.26, - "grad_norm": 12.690547943115234, - "learning_rate": 1.8247970331763056e-05, - "loss": 2.3076, + "epoch": 0.11, + "grad_norm": 21.873647689819336, + "learning_rate": 1.9268711040455175e-05, + "loss": 3.0256, "step": 875 }, { - "epoch": 0.26, - "grad_norm": 31.50592041015625, - "learning_rate": 1.8245965721158666e-05, - "loss": 2.8265, + "epoch": 0.11, + "grad_norm": 20.212047576904297, + "learning_rate": 1.926787432539849e-05, + "loss": 3.4756, "step": 876 }, { - "epoch": 0.26, - "grad_norm": 13.253562927246094, - "learning_rate": 1.8243961110554276e-05, - "loss": 2.612, + "epoch": 0.11, + "grad_norm": 13.910633087158203, + "learning_rate": 1.92670376103418e-05, + "loss": 3.4597, "step": 877 }, { - "epoch": 0.26, - "grad_norm": 21.80856704711914, - "learning_rate": 1.8241956499949886e-05, - "loss": 3.0365, + "epoch": 0.11, + "grad_norm": 13.977246284484863, + "learning_rate": 1.9266200895285113e-05, + "loss": 2.26, "step": 878 }, { - "epoch": 0.26, - "grad_norm": 17.941879272460938, - "learning_rate": 1.8239951889345496e-05, - "loss": 2.9886, + "epoch": 0.11, + "grad_norm": 16.114044189453125, + "learning_rate": 1.9265364180228426e-05, + "loss": 5.6168, "step": 879 }, { - "epoch": 0.26, - "grad_norm": 14.684907913208008, - "learning_rate": 1.8237947278741106e-05, - "loss": 2.6941, + "epoch": 0.11, + "grad_norm": 21.8970890045166, + "learning_rate": 1.926452746517174e-05, + "loss": 2.5029, "step": 880 }, { - "epoch": 0.26, - "grad_norm": 19.08102798461914, - "learning_rate": 1.8235942668136717e-05, - "loss": 3.4722, + "epoch": 0.11, + "grad_norm": 21.701828002929688, + "learning_rate": 1.926369075011505e-05, + "loss": 5.1717, "step": 881 }, { - "epoch": 0.27, - "grad_norm": 14.615809440612793, - "learning_rate": 1.8233938057532327e-05, - "loss": 2.9938, + "epoch": 0.11, + "grad_norm": 13.911944389343262, + "learning_rate": 1.9262854035058364e-05, + "loss": 3.6187, "step": 882 }, { - "epoch": 0.27, - "grad_norm": 22.291290283203125, - "learning_rate": 1.8231933446927937e-05, - "loss": 4.3829, + "epoch": 0.11, + "grad_norm": 15.300573348999023, + "learning_rate": 1.9262017320001674e-05, + "loss": 4.3989, "step": 883 }, { - "epoch": 0.27, - "grad_norm": 16.102741241455078, - "learning_rate": 1.8229928836323543e-05, - "loss": 3.5552, + "epoch": 0.11, + "grad_norm": 22.996200561523438, + "learning_rate": 1.9261180604944988e-05, + "loss": 3.8584, "step": 884 }, { - "epoch": 0.27, - "grad_norm": 20.495725631713867, - "learning_rate": 1.8227924225719157e-05, - "loss": 3.3252, + "epoch": 0.11, + "grad_norm": 9.07385540008545, + "learning_rate": 1.92603438898883e-05, + "loss": 4.0722, "step": 885 }, { - "epoch": 0.27, - "grad_norm": 25.180877685546875, - "learning_rate": 1.8225919615114764e-05, - "loss": 2.6339, + "epoch": 0.11, + "grad_norm": 20.0780086517334, + "learning_rate": 1.9259507174831612e-05, + "loss": 3.5784, "step": 886 }, { - "epoch": 0.27, - "grad_norm": 14.054975509643555, - "learning_rate": 1.8223915004510374e-05, - "loss": 3.2697, + "epoch": 0.11, + "grad_norm": 15.519294738769531, + "learning_rate": 1.9258670459774925e-05, + "loss": 4.7674, "step": 887 }, { - "epoch": 0.27, - "grad_norm": 14.858386993408203, - "learning_rate": 1.8221910393905987e-05, - "loss": 2.3764, + "epoch": 0.11, + "grad_norm": 11.765948295593262, + "learning_rate": 1.9257833744718236e-05, + "loss": 1.0629, "step": 888 }, { - "epoch": 0.27, - "grad_norm": 14.020156860351562, - "learning_rate": 1.8219905783301594e-05, - "loss": 2.4207, + "epoch": 0.11, + "grad_norm": 10.282224655151367, + "learning_rate": 1.925699702966155e-05, + "loss": 0.4704, "step": 889 }, { - "epoch": 0.27, - "grad_norm": 20.844745635986328, - "learning_rate": 1.8217901172697204e-05, - "loss": 2.8965, + "epoch": 0.11, + "grad_norm": 11.294329643249512, + "learning_rate": 1.9256160314604863e-05, + "loss": 3.0352, "step": 890 }, { - "epoch": 0.27, - "grad_norm": 12.82536792755127, - "learning_rate": 1.8215896562092814e-05, - "loss": 2.1605, + "epoch": 0.11, + "grad_norm": 21.708587646484375, + "learning_rate": 1.9255323599548173e-05, + "loss": 3.151, "step": 891 }, { - "epoch": 0.27, - "grad_norm": 11.74587345123291, - "learning_rate": 1.8213891951488424e-05, - "loss": 3.171, + "epoch": 0.11, + "grad_norm": 14.7886323928833, + "learning_rate": 1.9254486884491487e-05, + "loss": 3.4023, "step": 892 }, { - "epoch": 0.27, - "grad_norm": 21.482084274291992, - "learning_rate": 1.8211887340884034e-05, - "loss": 3.142, + "epoch": 0.11, + "grad_norm": 12.128108978271484, + "learning_rate": 1.92536501694348e-05, + "loss": 1.6436, "step": 893 }, { - "epoch": 0.27, - "grad_norm": 28.953731536865234, - "learning_rate": 1.8209882730279644e-05, - "loss": 3.2382, + "epoch": 0.11, + "grad_norm": 27.2979736328125, + "learning_rate": 1.925281345437811e-05, + "loss": 3.554, "step": 894 }, { - "epoch": 0.27, - "grad_norm": 17.727638244628906, - "learning_rate": 1.8207878119675254e-05, - "loss": 3.2516, + "epoch": 0.11, + "grad_norm": 12.120756149291992, + "learning_rate": 1.9251976739321425e-05, + "loss": 3.8833, "step": 895 }, { - "epoch": 0.27, - "grad_norm": 24.647932052612305, - "learning_rate": 1.8205873509070864e-05, - "loss": 3.1515, + "epoch": 0.11, + "grad_norm": 15.177332878112793, + "learning_rate": 1.9251140024264738e-05, + "loss": 2.3427, "step": 896 }, { - "epoch": 0.27, - "grad_norm": 15.397368431091309, - "learning_rate": 1.8203868898466475e-05, - "loss": 3.8315, + "epoch": 0.11, + "grad_norm": 14.57490062713623, + "learning_rate": 1.9250303309208052e-05, + "loss": 2.8223, "step": 897 }, { - "epoch": 0.27, - "grad_norm": 10.54749584197998, - "learning_rate": 1.8201864287862085e-05, - "loss": 1.3177, + "epoch": 0.11, + "grad_norm": 14.658095359802246, + "learning_rate": 1.9249466594151362e-05, + "loss": 4.0262, "step": 898 }, { - "epoch": 0.27, - "grad_norm": 22.014423370361328, - "learning_rate": 1.8199859677257695e-05, - "loss": 3.8405, + "epoch": 0.11, + "grad_norm": 25.267988204956055, + "learning_rate": 1.9248629879094676e-05, + "loss": 3.9113, "step": 899 }, { - "epoch": 0.27, - "grad_norm": 18.356632232666016, - "learning_rate": 1.8197855066653305e-05, - "loss": 3.0761, + "epoch": 0.11, + "grad_norm": 11.966208457946777, + "learning_rate": 1.924779316403799e-05, + "loss": 4.6879, "step": 900 }, { - "epoch": 0.27, - "grad_norm": 19.23956298828125, - "learning_rate": 1.8195850456048915e-05, - "loss": 3.8153, + "epoch": 0.11, + "grad_norm": 20.84747314453125, + "learning_rate": 1.92469564489813e-05, + "loss": 3.4313, "step": 901 }, { - "epoch": 0.27, - "grad_norm": 21.569475173950195, - "learning_rate": 1.8193845845444525e-05, - "loss": 2.913, + "epoch": 0.11, + "grad_norm": 23.972240447998047, + "learning_rate": 1.9246119733924613e-05, + "loss": 3.6033, "step": 902 }, { - "epoch": 0.27, - "grad_norm": 10.557537078857422, - "learning_rate": 1.819184123484013e-05, - "loss": 2.8867, + "epoch": 0.11, + "grad_norm": 14.662901878356934, + "learning_rate": 1.9245283018867927e-05, + "loss": 1.7671, "step": 903 }, { - "epoch": 0.27, - "grad_norm": 14.843564987182617, - "learning_rate": 1.8189836624235745e-05, - "loss": 2.9857, + "epoch": 0.11, + "grad_norm": 17.601343154907227, + "learning_rate": 1.924444630381124e-05, + "loss": 3.5829, "step": 904 }, { - "epoch": 0.27, - "grad_norm": 10.520699501037598, - "learning_rate": 1.8187832013631355e-05, - "loss": 3.0075, + "epoch": 0.11, + "grad_norm": 13.459490776062012, + "learning_rate": 1.924360958875455e-05, + "loss": 3.098, "step": 905 }, { - "epoch": 0.27, - "grad_norm": 18.192188262939453, - "learning_rate": 1.8185827403026962e-05, - "loss": 4.4008, + "epoch": 0.11, + "grad_norm": 16.708049774169922, + "learning_rate": 1.9242772873697864e-05, + "loss": 4.3347, "step": 906 }, { - "epoch": 0.27, - "grad_norm": 13.926733016967773, - "learning_rate": 1.8183822792422575e-05, - "loss": 3.1211, + "epoch": 0.11, + "grad_norm": 15.664067268371582, + "learning_rate": 1.9241936158641178e-05, + "loss": 2.6975, "step": 907 }, { - "epoch": 0.27, - "grad_norm": 17.12418556213379, - "learning_rate": 1.8181818181818182e-05, - "loss": 3.4901, + "epoch": 0.11, + "grad_norm": 20.520610809326172, + "learning_rate": 1.924109944358449e-05, + "loss": 3.8704, "step": 908 }, { - "epoch": 0.27, - "grad_norm": 16.681673049926758, - "learning_rate": 1.8179813571213792e-05, - "loss": 3.2524, + "epoch": 0.11, + "grad_norm": 7.673875331878662, + "learning_rate": 1.9240262728527802e-05, + "loss": 1.7854, "step": 909 }, { - "epoch": 0.27, - "grad_norm": 18.500484466552734, - "learning_rate": 1.8177808960609402e-05, - "loss": 3.7737, + "epoch": 0.11, + "grad_norm": 12.935189247131348, + "learning_rate": 1.9239426013471116e-05, + "loss": 3.7663, "step": 910 }, { - "epoch": 0.27, - "grad_norm": 13.746018409729004, - "learning_rate": 1.8175804350005012e-05, - "loss": 2.4568, + "epoch": 0.11, + "grad_norm": 11.801560401916504, + "learning_rate": 1.9238589298414426e-05, + "loss": 3.6699, "step": 911 }, { - "epoch": 0.27, - "grad_norm": 12.869297981262207, - "learning_rate": 1.8173799739400622e-05, - "loss": 2.6976, + "epoch": 0.11, + "grad_norm": 16.79932403564453, + "learning_rate": 1.923775258335774e-05, + "loss": 5.206, "step": 912 }, { - "epoch": 0.27, - "grad_norm": 17.630205154418945, - "learning_rate": 1.8171795128796232e-05, - "loss": 3.2424, + "epoch": 0.11, + "grad_norm": 10.527442932128906, + "learning_rate": 1.923691586830105e-05, + "loss": 2.3717, "step": 913 }, { - "epoch": 0.27, - "grad_norm": 11.685369491577148, - "learning_rate": 1.8169790518191843e-05, - "loss": 2.3216, + "epoch": 0.11, + "grad_norm": 10.9661865234375, + "learning_rate": 1.9236079153244364e-05, + "loss": 2.4747, "step": 914 }, { - "epoch": 0.28, - "grad_norm": 23.479877471923828, - "learning_rate": 1.8167785907587453e-05, - "loss": 3.3934, + "epoch": 0.11, + "grad_norm": 10.854809761047363, + "learning_rate": 1.9235242438187677e-05, + "loss": 4.4707, "step": 915 }, { - "epoch": 0.28, - "grad_norm": 11.527375221252441, - "learning_rate": 1.8165781296983063e-05, - "loss": 2.9907, + "epoch": 0.11, + "grad_norm": 13.150390625, + "learning_rate": 1.9234405723130987e-05, + "loss": 3.8095, "step": 916 }, { - "epoch": 0.28, - "grad_norm": 11.39271354675293, - "learning_rate": 1.8163776686378673e-05, - "loss": 2.8909, + "epoch": 0.12, + "grad_norm": 15.951024055480957, + "learning_rate": 1.92335690080743e-05, + "loss": 3.7779, "step": 917 }, { - "epoch": 0.28, - "grad_norm": 18.55435562133789, - "learning_rate": 1.8161772075774283e-05, - "loss": 2.9944, + "epoch": 0.12, + "grad_norm": 14.399053573608398, + "learning_rate": 1.9232732293017615e-05, + "loss": 2.7399, "step": 918 }, { - "epoch": 0.28, - "grad_norm": 13.561935424804688, - "learning_rate": 1.8159767465169893e-05, - "loss": 2.385, + "epoch": 0.12, + "grad_norm": 15.080406188964844, + "learning_rate": 1.9231895577960925e-05, + "loss": 4.3843, "step": 919 }, { - "epoch": 0.28, - "grad_norm": 16.513639450073242, - "learning_rate": 1.8157762854565503e-05, - "loss": 3.0903, + "epoch": 0.12, + "grad_norm": 9.267796516418457, + "learning_rate": 1.923105886290424e-05, + "loss": 1.9494, "step": 920 }, { - "epoch": 0.28, - "grad_norm": 19.51789093017578, - "learning_rate": 1.8155758243961113e-05, - "loss": 4.0376, + "epoch": 0.12, + "grad_norm": 13.215700149536133, + "learning_rate": 1.9230222147847552e-05, + "loss": 3.3097, "step": 921 }, { - "epoch": 0.28, - "grad_norm": 16.57154655456543, - "learning_rate": 1.815375363335672e-05, - "loss": 3.3258, + "epoch": 0.12, + "grad_norm": 14.999835014343262, + "learning_rate": 1.9229385432790863e-05, + "loss": 0.6485, "step": 922 }, { - "epoch": 0.28, - "grad_norm": 12.257933616638184, - "learning_rate": 1.8151749022752333e-05, - "loss": 2.85, + "epoch": 0.12, + "grad_norm": 20.776092529296875, + "learning_rate": 1.9228548717734176e-05, + "loss": 4.4746, "step": 923 }, { - "epoch": 0.28, - "grad_norm": 19.822603225708008, - "learning_rate": 1.8149744412147943e-05, - "loss": 2.7183, + "epoch": 0.12, + "grad_norm": 13.484353065490723, + "learning_rate": 1.922771200267749e-05, + "loss": 3.3389, "step": 924 }, { - "epoch": 0.28, - "grad_norm": 13.144048690795898, - "learning_rate": 1.814773980154355e-05, - "loss": 2.9005, + "epoch": 0.12, + "grad_norm": 17.923723220825195, + "learning_rate": 1.9226875287620803e-05, + "loss": 2.1387, "step": 925 }, { - "epoch": 0.28, - "grad_norm": 13.302081108093262, - "learning_rate": 1.8145735190939164e-05, - "loss": 3.5627, + "epoch": 0.12, + "grad_norm": 90.81083679199219, + "learning_rate": 1.9226038572564114e-05, + "loss": 1.8454, "step": 926 }, { - "epoch": 0.28, - "grad_norm": 18.423614501953125, - "learning_rate": 1.814373058033477e-05, - "loss": 3.7219, + "epoch": 0.12, + "grad_norm": 22.24589729309082, + "learning_rate": 1.9225201857507427e-05, + "loss": 2.0114, "step": 927 }, { - "epoch": 0.28, - "grad_norm": 16.71343231201172, - "learning_rate": 1.814172596973038e-05, - "loss": 2.3099, + "epoch": 0.12, + "grad_norm": 15.252634048461914, + "learning_rate": 1.922436514245074e-05, + "loss": 4.191, "step": 928 }, { - "epoch": 0.28, - "grad_norm": 12.817221641540527, - "learning_rate": 1.813972135912599e-05, - "loss": 3.1187, + "epoch": 0.12, + "grad_norm": 23.059764862060547, + "learning_rate": 1.922352842739405e-05, + "loss": 2.4768, "step": 929 }, { - "epoch": 0.28, - "grad_norm": 23.68539047241211, - "learning_rate": 1.81377167485216e-05, - "loss": 2.8866, + "epoch": 0.12, + "grad_norm": 18.246435165405273, + "learning_rate": 1.9222691712337365e-05, + "loss": 4.5082, "step": 930 }, { - "epoch": 0.28, - "grad_norm": 19.9428768157959, - "learning_rate": 1.813571213791721e-05, - "loss": 3.3457, + "epoch": 0.12, + "grad_norm": 13.866202354431152, + "learning_rate": 1.922185499728068e-05, + "loss": 4.214, "step": 931 }, { - "epoch": 0.28, - "grad_norm": 15.457541465759277, - "learning_rate": 1.813370752731282e-05, - "loss": 2.5672, + "epoch": 0.12, + "grad_norm": 17.0211238861084, + "learning_rate": 1.9221018282223992e-05, + "loss": 2.7061, "step": 932 }, { - "epoch": 0.28, - "grad_norm": 14.215147972106934, - "learning_rate": 1.813170291670843e-05, - "loss": 3.2077, + "epoch": 0.12, + "grad_norm": 9.928666114807129, + "learning_rate": 1.9220181567167303e-05, + "loss": 2.093, "step": 933 }, { - "epoch": 0.28, - "grad_norm": 23.65352439880371, - "learning_rate": 1.812969830610404e-05, - "loss": 4.0376, + "epoch": 0.12, + "grad_norm": 12.430704116821289, + "learning_rate": 1.9219344852110616e-05, + "loss": 1.8956, "step": 934 }, { - "epoch": 0.28, - "grad_norm": 15.811484336853027, - "learning_rate": 1.812769369549965e-05, - "loss": 3.4806, + "epoch": 0.12, + "grad_norm": 25.73609161376953, + "learning_rate": 1.921850813705393e-05, + "loss": 3.4497, "step": 935 }, { - "epoch": 0.28, - "grad_norm": 22.097625732421875, - "learning_rate": 1.812568908489526e-05, - "loss": 2.9302, + "epoch": 0.12, + "grad_norm": 10.325471878051758, + "learning_rate": 1.921767142199724e-05, + "loss": 3.3559, "step": 936 }, { - "epoch": 0.28, - "grad_norm": 21.592880249023438, - "learning_rate": 1.812368447429087e-05, - "loss": 3.5155, + "epoch": 0.12, + "grad_norm": 14.831570625305176, + "learning_rate": 1.9216834706940554e-05, + "loss": 2.6587, "step": 937 }, { - "epoch": 0.28, - "grad_norm": 13.204874992370605, - "learning_rate": 1.812167986368648e-05, - "loss": 3.172, + "epoch": 0.12, + "grad_norm": 13.535395622253418, + "learning_rate": 1.9215997991883867e-05, + "loss": 2.3793, "step": 938 }, { - "epoch": 0.28, - "grad_norm": 19.575117111206055, - "learning_rate": 1.811967525308209e-05, - "loss": 3.1032, + "epoch": 0.12, + "grad_norm": 27.883773803710938, + "learning_rate": 1.9215161276827178e-05, + "loss": 3.3018, "step": 939 }, { - "epoch": 0.28, - "grad_norm": 19.439678192138672, - "learning_rate": 1.81176706424777e-05, - "loss": 3.1572, + "epoch": 0.12, + "grad_norm": 12.214103698730469, + "learning_rate": 1.921432456177049e-05, + "loss": 4.0168, "step": 940 }, { - "epoch": 0.28, - "grad_norm": 18.003063201904297, - "learning_rate": 1.8115666031873308e-05, - "loss": 3.6915, + "epoch": 0.12, + "grad_norm": 13.346578598022461, + "learning_rate": 1.92134878467138e-05, + "loss": 2.6091, "step": 941 }, { - "epoch": 0.28, - "grad_norm": 14.612069129943848, - "learning_rate": 1.811366142126892e-05, - "loss": 2.6144, + "epoch": 0.12, + "grad_norm": 13.722108840942383, + "learning_rate": 1.9212651131657115e-05, + "loss": 3.6267, "step": 942 }, { - "epoch": 0.28, - "grad_norm": 26.311756134033203, - "learning_rate": 1.811165681066453e-05, - "loss": 4.1978, + "epoch": 0.12, + "grad_norm": 25.921424865722656, + "learning_rate": 1.921181441660043e-05, + "loss": 3.3123, "step": 943 }, { - "epoch": 0.28, - "grad_norm": 12.142837524414062, - "learning_rate": 1.810965220006014e-05, - "loss": 1.9791, + "epoch": 0.12, + "grad_norm": 11.782764434814453, + "learning_rate": 1.921097770154374e-05, + "loss": 2.578, "step": 944 }, { - "epoch": 0.28, - "grad_norm": 16.259824752807617, - "learning_rate": 1.8107647589455752e-05, - "loss": 3.0366, + "epoch": 0.12, + "grad_norm": 23.230119705200195, + "learning_rate": 1.9210140986487053e-05, + "loss": 2.4671, "step": 945 }, { - "epoch": 0.28, - "grad_norm": 15.532299041748047, - "learning_rate": 1.810564297885136e-05, - "loss": 2.9808, + "epoch": 0.12, + "grad_norm": 17.044708251953125, + "learning_rate": 1.9209304271430366e-05, + "loss": 3.4549, "step": 946 }, { - "epoch": 0.28, - "grad_norm": 31.902145385742188, - "learning_rate": 1.810363836824697e-05, - "loss": 4.2394, + "epoch": 0.12, + "grad_norm": 12.964016914367676, + "learning_rate": 1.9208467556373677e-05, + "loss": 3.3802, "step": 947 }, { - "epoch": 0.29, - "grad_norm": 20.337215423583984, - "learning_rate": 1.8101633757642582e-05, - "loss": 3.8077, + "epoch": 0.12, + "grad_norm": 10.344799041748047, + "learning_rate": 1.920763084131699e-05, + "loss": 2.1356, "step": 948 }, { - "epoch": 0.29, - "grad_norm": 15.51062297821045, - "learning_rate": 1.809962914703819e-05, - "loss": 1.9024, + "epoch": 0.12, + "grad_norm": 15.343401908874512, + "learning_rate": 1.9206794126260304e-05, + "loss": 3.9159, "step": 949 }, { - "epoch": 0.29, - "grad_norm": 21.733741760253906, - "learning_rate": 1.80976245364338e-05, - "loss": 3.1563, + "epoch": 0.12, + "grad_norm": 18.93767738342285, + "learning_rate": 1.9205957411203614e-05, + "loss": 3.2314, "step": 950 }, { - "epoch": 0.29, - "grad_norm": 27.472270965576172, - "learning_rate": 1.809561992582941e-05, - "loss": 3.7646, + "epoch": 0.12, + "grad_norm": 16.269479751586914, + "learning_rate": 1.9205120696146928e-05, + "loss": 2.7732, "step": 951 }, { - "epoch": 0.29, - "grad_norm": 33.5841178894043, - "learning_rate": 1.809361531522502e-05, - "loss": 3.6314, + "epoch": 0.12, + "grad_norm": 11.748882293701172, + "learning_rate": 1.920428398109024e-05, + "loss": 2.2064, "step": 952 }, { - "epoch": 0.29, - "grad_norm": 15.488965034484863, - "learning_rate": 1.809161070462063e-05, - "loss": 3.8817, + "epoch": 0.12, + "grad_norm": 19.187292098999023, + "learning_rate": 1.9203447266033555e-05, + "loss": 2.2557, "step": 953 }, { - "epoch": 0.29, - "grad_norm": 11.746240615844727, - "learning_rate": 1.808960609401624e-05, - "loss": 2.1518, + "epoch": 0.12, + "grad_norm": 15.426502227783203, + "learning_rate": 1.9202610550976865e-05, + "loss": 3.5879, "step": 954 }, { - "epoch": 0.29, - "grad_norm": 9.70715618133545, - "learning_rate": 1.808760148341185e-05, - "loss": 2.5758, + "epoch": 0.12, + "grad_norm": 18.423494338989258, + "learning_rate": 1.920177383592018e-05, + "loss": 2.3315, "step": 955 }, { - "epoch": 0.29, - "grad_norm": 15.384037017822266, - "learning_rate": 1.808559687280746e-05, - "loss": 3.2143, + "epoch": 0.12, + "grad_norm": 21.4342041015625, + "learning_rate": 1.9200937120863493e-05, + "loss": 3.2196, "step": 956 }, { - "epoch": 0.29, - "grad_norm": 18.4316349029541, - "learning_rate": 1.808359226220307e-05, - "loss": 4.8752, + "epoch": 0.12, + "grad_norm": 27.630748748779297, + "learning_rate": 1.9200100405806803e-05, + "loss": 5.156, "step": 957 }, { - "epoch": 0.29, - "grad_norm": 13.498906135559082, - "learning_rate": 1.8081587651598676e-05, - "loss": 3.2658, + "epoch": 0.12, + "grad_norm": 11.966787338256836, + "learning_rate": 1.9199263690750117e-05, + "loss": 1.4733, "step": 958 }, { - "epoch": 0.29, - "grad_norm": 26.12436866760254, - "learning_rate": 1.807958304099429e-05, - "loss": 1.9117, + "epoch": 0.12, + "grad_norm": 16.689373016357422, + "learning_rate": 1.919842697569343e-05, + "loss": 2.2288, "step": 959 }, { - "epoch": 0.29, - "grad_norm": 14.987210273742676, - "learning_rate": 1.80775784303899e-05, - "loss": 2.1122, - "step": 960 - }, - { - "epoch": 0.29, - "eval_loss": 0.5726422667503357, - "eval_runtime": 43.6835, - "eval_samples_per_second": 33.857, - "eval_steps_per_second": 33.857, + "epoch": 0.12, + "grad_norm": 36.253570556640625, + "learning_rate": 1.9197590260636744e-05, + "loss": 3.6702, "step": 960 }, { - "epoch": 0.29, - "grad_norm": 11.189974784851074, - "learning_rate": 1.8075573819785506e-05, - "loss": 2.7843, + "epoch": 0.12, + "grad_norm": 22.745553970336914, + "learning_rate": 1.9196753545580054e-05, + "loss": 2.2173, "step": 961 }, { - "epoch": 0.29, - "grad_norm": 13.831470489501953, - "learning_rate": 1.807356920918112e-05, - "loss": 3.8859, + "epoch": 0.12, + "grad_norm": 8.689067840576172, + "learning_rate": 1.9195916830523368e-05, + "loss": 1.2961, "step": 962 }, { - "epoch": 0.29, - "grad_norm": 14.756105422973633, - "learning_rate": 1.8071564598576727e-05, - "loss": 2.6933, + "epoch": 0.12, + "grad_norm": 15.15867805480957, + "learning_rate": 1.919508011546668e-05, + "loss": 3.4019, "step": 963 }, { - "epoch": 0.29, - "grad_norm": 13.506275177001953, - "learning_rate": 1.8069559987972337e-05, - "loss": 2.7927, + "epoch": 0.12, + "grad_norm": 15.15098762512207, + "learning_rate": 1.9194243400409992e-05, + "loss": 3.566, "step": 964 }, { - "epoch": 0.29, - "grad_norm": 27.999460220336914, - "learning_rate": 1.8067555377367947e-05, - "loss": 3.3839, + "epoch": 0.12, + "grad_norm": 17.65106201171875, + "learning_rate": 1.9193406685353305e-05, + "loss": 2.8859, "step": 965 }, { - "epoch": 0.29, - "grad_norm": 13.807269096374512, - "learning_rate": 1.8065550766763557e-05, - "loss": 2.0444, + "epoch": 0.12, + "grad_norm": 15.059209823608398, + "learning_rate": 1.9192569970296616e-05, + "loss": 2.474, "step": 966 }, { - "epoch": 0.29, - "grad_norm": 15.027080535888672, - "learning_rate": 1.8063546156159167e-05, - "loss": 2.5851, + "epoch": 0.12, + "grad_norm": 13.547049522399902, + "learning_rate": 1.919173325523993e-05, + "loss": 2.8793, "step": 967 }, { - "epoch": 0.29, - "grad_norm": 46.652530670166016, - "learning_rate": 1.8061541545554777e-05, - "loss": 2.5573, + "epoch": 0.12, + "grad_norm": 15.953766822814941, + "learning_rate": 1.9190896540183243e-05, + "loss": 2.6781, "step": 968 }, { - "epoch": 0.29, - "grad_norm": 17.231346130371094, - "learning_rate": 1.8059536934950387e-05, - "loss": 2.3659, + "epoch": 0.12, + "grad_norm": 19.517332077026367, + "learning_rate": 1.9190059825126553e-05, + "loss": 4.631, "step": 969 }, { - "epoch": 0.29, - "grad_norm": 17.74903678894043, - "learning_rate": 1.8057532324345997e-05, - "loss": 2.8746, + "epoch": 0.12, + "grad_norm": 13.69670295715332, + "learning_rate": 1.9189223110069867e-05, + "loss": 3.5081, "step": 970 }, { - "epoch": 0.29, - "grad_norm": 9.483039855957031, - "learning_rate": 1.8055527713741607e-05, - "loss": 2.6829, + "epoch": 0.12, + "grad_norm": 26.607080459594727, + "learning_rate": 1.9188386395013177e-05, + "loss": 3.4868, "step": 971 }, { - "epoch": 0.29, - "grad_norm": 10.787821769714355, - "learning_rate": 1.8053523103137217e-05, - "loss": 2.395, + "epoch": 0.12, + "grad_norm": 11.74852466583252, + "learning_rate": 1.918754967995649e-05, + "loss": 1.8016, "step": 972 }, { - "epoch": 0.29, - "grad_norm": 16.904396057128906, - "learning_rate": 1.8051518492532827e-05, - "loss": 2.8498, + "epoch": 0.12, + "grad_norm": 14.13051986694336, + "learning_rate": 1.9186712964899804e-05, + "loss": 3.496, "step": 973 }, { - "epoch": 0.29, - "grad_norm": 18.882564544677734, - "learning_rate": 1.8049513881928437e-05, - "loss": 3.3251, + "epoch": 0.12, + "grad_norm": 13.406763076782227, + "learning_rate": 1.9185876249843118e-05, + "loss": 4.1372, "step": 974 }, { - "epoch": 0.29, - "grad_norm": 18.128250122070312, - "learning_rate": 1.8047509271324048e-05, - "loss": 3.1162, + "epoch": 0.12, + "grad_norm": 12.86937141418457, + "learning_rate": 1.918503953478643e-05, + "loss": 4.6107, "step": 975 }, { - "epoch": 0.29, - "grad_norm": 19.998552322387695, - "learning_rate": 1.8045504660719658e-05, - "loss": 3.183, + "epoch": 0.12, + "grad_norm": 17.223657608032227, + "learning_rate": 1.9184202819729742e-05, + "loss": 3.7678, "step": 976 }, { - "epoch": 0.29, - "grad_norm": 19.723297119140625, - "learning_rate": 1.8043500050115264e-05, - "loss": 3.0132, + "epoch": 0.12, + "grad_norm": 13.638578414916992, + "learning_rate": 1.9183366104673056e-05, + "loss": 3.242, "step": 977 }, { - "epoch": 0.29, - "grad_norm": 18.414817810058594, - "learning_rate": 1.8041495439510878e-05, - "loss": 2.6902, + "epoch": 0.12, + "grad_norm": 14.202733993530273, + "learning_rate": 1.9182529389616366e-05, + "loss": 2.0694, "step": 978 }, { - "epoch": 0.29, - "grad_norm": 15.55300521850586, - "learning_rate": 1.8039490828906488e-05, - "loss": 2.0389, + "epoch": 0.12, + "grad_norm": 20.721847534179688, + "learning_rate": 1.918169267455968e-05, + "loss": 4.5076, "step": 979 }, { - "epoch": 0.29, - "grad_norm": 19.29793357849121, - "learning_rate": 1.8037486218302095e-05, - "loss": 3.276, + "epoch": 0.12, + "grad_norm": 14.14547348022461, + "learning_rate": 1.9180855959502993e-05, + "loss": 4.7961, "step": 980 }, { - "epoch": 0.29, - "grad_norm": 18.001596450805664, - "learning_rate": 1.8035481607697708e-05, - "loss": 2.2783, + "epoch": 0.12, + "grad_norm": 13.157936096191406, + "learning_rate": 1.9180019244446307e-05, + "loss": 1.7192, "step": 981 }, { - "epoch": 0.3, - "grad_norm": 12.103147506713867, - "learning_rate": 1.8033476997093315e-05, - "loss": 3.8083, + "epoch": 0.12, + "grad_norm": 15.036370277404785, + "learning_rate": 1.9179182529389617e-05, + "loss": 2.2217, "step": 982 }, { - "epoch": 0.3, - "grad_norm": 15.265912055969238, - "learning_rate": 1.8031472386488925e-05, - "loss": 2.1125, + "epoch": 0.12, + "grad_norm": 33.67008590698242, + "learning_rate": 1.917834581433293e-05, + "loss": 2.4785, "step": 983 }, { - "epoch": 0.3, - "grad_norm": 20.835962295532227, - "learning_rate": 1.8029467775884535e-05, - "loss": 2.128, + "epoch": 0.12, + "grad_norm": 14.870255470275879, + "learning_rate": 1.9177509099276244e-05, + "loss": 2.7684, "step": 984 }, { - "epoch": 0.3, - "grad_norm": 27.07541847229004, - "learning_rate": 1.8027463165280145e-05, - "loss": 2.9855, + "epoch": 0.12, + "grad_norm": 25.22587013244629, + "learning_rate": 1.9176672384219555e-05, + "loss": 2.7044, "step": 985 }, { - "epoch": 0.3, - "grad_norm": 17.84712791442871, - "learning_rate": 1.8025458554675755e-05, - "loss": 2.7988, + "epoch": 0.12, + "grad_norm": 9.532674789428711, + "learning_rate": 1.917583566916287e-05, + "loss": 2.7126, "step": 986 }, { - "epoch": 0.3, - "grad_norm": 15.661683082580566, - "learning_rate": 1.8023453944071365e-05, - "loss": 3.1825, + "epoch": 0.12, + "grad_norm": 12.704583168029785, + "learning_rate": 1.9174998954106182e-05, + "loss": 2.1161, "step": 987 }, { - "epoch": 0.3, - "grad_norm": 21.294462203979492, - "learning_rate": 1.8021449333466975e-05, - "loss": 3.2597, + "epoch": 0.12, + "grad_norm": 25.719287872314453, + "learning_rate": 1.9174162239049496e-05, + "loss": 4.9047, "step": 988 }, { - "epoch": 0.3, - "grad_norm": 14.863018035888672, - "learning_rate": 1.8019444722862585e-05, - "loss": 2.7602, + "epoch": 0.12, + "grad_norm": 12.979522705078125, + "learning_rate": 1.9173325523992806e-05, + "loss": 2.8941, "step": 989 }, { - "epoch": 0.3, - "grad_norm": 14.493101119995117, - "learning_rate": 1.8017440112258195e-05, - "loss": 3.5392, + "epoch": 0.12, + "grad_norm": 10.867280960083008, + "learning_rate": 1.917248880893612e-05, + "loss": 1.9709, "step": 990 }, { - "epoch": 0.3, - "grad_norm": 16.697307586669922, - "learning_rate": 1.8015435501653806e-05, - "loss": 3.0317, + "epoch": 0.12, + "grad_norm": 11.747751235961914, + "learning_rate": 1.9171652093879433e-05, + "loss": 2.9887, "step": 991 }, { - "epoch": 0.3, - "grad_norm": 21.106035232543945, - "learning_rate": 1.8013430891049416e-05, - "loss": 2.6286, + "epoch": 0.12, + "grad_norm": 18.359580993652344, + "learning_rate": 1.9170815378822743e-05, + "loss": 4.3102, "step": 992 }, { - "epoch": 0.3, - "grad_norm": 14.834815979003906, - "learning_rate": 1.8011426280445026e-05, - "loss": 2.2224, + "epoch": 0.12, + "grad_norm": 19.873567581176758, + "learning_rate": 1.9169978663766057e-05, + "loss": 3.9871, "step": 993 }, { - "epoch": 0.3, - "grad_norm": 16.66942596435547, - "learning_rate": 1.8009421669840636e-05, - "loss": 3.2398, + "epoch": 0.12, + "grad_norm": 18.60597038269043, + "learning_rate": 1.9169141948709367e-05, + "loss": 3.5496, "step": 994 }, { - "epoch": 0.3, - "grad_norm": 20.955211639404297, - "learning_rate": 1.8007417059236246e-05, - "loss": 3.2694, + "epoch": 0.12, + "grad_norm": 17.307985305786133, + "learning_rate": 1.916830523365268e-05, + "loss": 3.6558, "step": 995 }, { - "epoch": 0.3, - "grad_norm": 16.78022575378418, - "learning_rate": 1.8005412448631853e-05, - "loss": 3.3401, + "epoch": 0.12, + "grad_norm": 16.86280059814453, + "learning_rate": 1.9167468518595995e-05, + "loss": 2.0322, "step": 996 }, { - "epoch": 0.3, - "grad_norm": 10.238494873046875, - "learning_rate": 1.8003407838027466e-05, - "loss": 2.686, + "epoch": 0.13, + "grad_norm": 13.198040962219238, + "learning_rate": 1.9166631803539305e-05, + "loss": 2.366, "step": 997 }, { - "epoch": 0.3, - "grad_norm": 25.67633628845215, - "learning_rate": 1.8001403227423076e-05, - "loss": 4.3305, + "epoch": 0.13, + "grad_norm": 14.232464790344238, + "learning_rate": 1.916579508848262e-05, + "loss": 2.8017, "step": 998 }, { - "epoch": 0.3, - "grad_norm": 18.46412467956543, - "learning_rate": 1.7999398616818683e-05, - "loss": 4.0384, + "epoch": 0.13, + "grad_norm": 15.965517044067383, + "learning_rate": 1.916495837342593e-05, + "loss": 4.077, "step": 999 }, { - "epoch": 0.3, - "grad_norm": 16.967388153076172, - "learning_rate": 1.7997394006214296e-05, - "loss": 2.752, + "epoch": 0.13, + "grad_norm": 10.01248550415039, + "learning_rate": 1.9164121658369242e-05, + "loss": 0.4904, "step": 1000 }, { - "epoch": 0.3, - "grad_norm": 16.72841453552246, - "learning_rate": 1.7995389395609903e-05, - "loss": 1.9186, + "epoch": 0.13, + "grad_norm": 36.76567077636719, + "learning_rate": 1.9163284943312556e-05, + "loss": 1.9846, "step": 1001 }, { - "epoch": 0.3, - "grad_norm": 17.65038299560547, - "learning_rate": 1.7993384785005513e-05, - "loss": 2.8156, + "epoch": 0.13, + "grad_norm": 14.137666702270508, + "learning_rate": 1.916244822825587e-05, + "loss": 1.5229, "step": 1002 }, { - "epoch": 0.3, - "grad_norm": 15.546381950378418, - "learning_rate": 1.7991380174401123e-05, - "loss": 2.9373, + "epoch": 0.13, + "grad_norm": 21.321977615356445, + "learning_rate": 1.916161151319918e-05, + "loss": 5.1578, "step": 1003 }, { - "epoch": 0.3, - "grad_norm": 13.01362133026123, - "learning_rate": 1.7989375563796733e-05, - "loss": 2.4807, + "epoch": 0.13, + "grad_norm": 11.998373985290527, + "learning_rate": 1.9160774798142494e-05, + "loss": 2.938, "step": 1004 }, { - "epoch": 0.3, - "grad_norm": 18.31947135925293, - "learning_rate": 1.7987370953192343e-05, - "loss": 2.6384, + "epoch": 0.13, + "grad_norm": 15.488204956054688, + "learning_rate": 1.9159938083085807e-05, + "loss": 5.3228, "step": 1005 }, { - "epoch": 0.3, - "grad_norm": 19.710500717163086, - "learning_rate": 1.7985366342587953e-05, - "loss": 2.9314, + "epoch": 0.13, + "grad_norm": 24.641403198242188, + "learning_rate": 1.9159101368029118e-05, + "loss": 2.3875, "step": 1006 }, { - "epoch": 0.3, - "grad_norm": 12.663653373718262, - "learning_rate": 1.7983361731983563e-05, - "loss": 1.9977, + "epoch": 0.13, + "grad_norm": 12.331722259521484, + "learning_rate": 1.915826465297243e-05, + "loss": 1.4247, "step": 1007 }, { - "epoch": 0.3, - "grad_norm": 23.557924270629883, - "learning_rate": 1.7981357121379174e-05, - "loss": 3.6326, + "epoch": 0.13, + "grad_norm": 10.565719604492188, + "learning_rate": 1.9157427937915745e-05, + "loss": 2.8828, "step": 1008 }, { - "epoch": 0.3, - "grad_norm": 12.505498886108398, - "learning_rate": 1.7979352510774784e-05, - "loss": 2.3971, + "epoch": 0.13, + "grad_norm": 15.967330932617188, + "learning_rate": 1.915659122285906e-05, + "loss": 4.3271, "step": 1009 }, { - "epoch": 0.3, - "grad_norm": 21.709392547607422, - "learning_rate": 1.7977347900170394e-05, - "loss": 2.4877, + "epoch": 0.13, + "grad_norm": 9.954456329345703, + "learning_rate": 1.915575450780237e-05, + "loss": 3.6863, "step": 1010 }, { - "epoch": 0.3, - "grad_norm": 12.365106582641602, - "learning_rate": 1.7975343289566004e-05, - "loss": 3.1574, + "epoch": 0.13, + "grad_norm": 18.42413902282715, + "learning_rate": 1.9154917792745682e-05, + "loss": 3.9867, "step": 1011 }, { - "epoch": 0.3, - "grad_norm": 18.052745819091797, - "learning_rate": 1.7973338678961614e-05, - "loss": 3.6156, + "epoch": 0.13, + "grad_norm": 31.44066047668457, + "learning_rate": 1.9154081077688996e-05, + "loss": 5.7757, "step": 1012 }, { - "epoch": 0.3, - "grad_norm": 16.1375732421875, - "learning_rate": 1.7971334068357224e-05, - "loss": 3.0042, + "epoch": 0.13, + "grad_norm": 22.84227752685547, + "learning_rate": 1.9153244362632306e-05, + "loss": 2.1757, "step": 1013 }, { - "epoch": 0.3, - "grad_norm": 15.52522087097168, - "learning_rate": 1.7969329457752834e-05, - "loss": 3.3669, + "epoch": 0.13, + "grad_norm": 16.95429801940918, + "learning_rate": 1.915240764757562e-05, + "loss": 3.5305, "step": 1014 }, { - "epoch": 0.31, - "grad_norm": 18.78600311279297, - "learning_rate": 1.796732484714844e-05, - "loss": 4.9568, + "epoch": 0.13, + "grad_norm": 14.703499794006348, + "learning_rate": 1.9151570932518934e-05, + "loss": 2.3546, "step": 1015 }, { - "epoch": 0.31, - "grad_norm": 32.19879150390625, - "learning_rate": 1.7965320236544054e-05, - "loss": 3.4862, + "epoch": 0.13, + "grad_norm": 14.213774681091309, + "learning_rate": 1.9150734217462247e-05, + "loss": 3.4051, "step": 1016 }, { - "epoch": 0.31, - "grad_norm": 22.817644119262695, - "learning_rate": 1.7963315625939664e-05, - "loss": 3.6447, + "epoch": 0.13, + "grad_norm": 14.093331336975098, + "learning_rate": 1.9149897502405558e-05, + "loss": 3.1891, "step": 1017 }, { - "epoch": 0.31, - "grad_norm": 13.087103843688965, - "learning_rate": 1.796131101533527e-05, - "loss": 3.1701, + "epoch": 0.13, + "grad_norm": 14.156906127929688, + "learning_rate": 1.914906078734887e-05, + "loss": 2.7102, "step": 1018 }, { - "epoch": 0.31, - "grad_norm": 15.170973777770996, - "learning_rate": 1.7959306404730884e-05, - "loss": 2.907, + "epoch": 0.13, + "grad_norm": 17.22342872619629, + "learning_rate": 1.914822407229218e-05, + "loss": 3.1369, "step": 1019 }, { - "epoch": 0.31, - "grad_norm": 18.237186431884766, - "learning_rate": 1.795730179412649e-05, - "loss": 3.34, + "epoch": 0.13, + "grad_norm": 16.823516845703125, + "learning_rate": 1.9147387357235495e-05, + "loss": 3.5317, "step": 1020 }, { - "epoch": 0.31, - "grad_norm": 21.131467819213867, - "learning_rate": 1.79552971835221e-05, - "loss": 3.0868, + "epoch": 0.13, + "grad_norm": 10.506781578063965, + "learning_rate": 1.914655064217881e-05, + "loss": 2.212, "step": 1021 }, { - "epoch": 0.31, - "grad_norm": 21.808650970458984, - "learning_rate": 1.7953292572917715e-05, - "loss": 2.4194, + "epoch": 0.13, + "grad_norm": 10.249368667602539, + "learning_rate": 1.914571392712212e-05, + "loss": 2.169, "step": 1022 }, { - "epoch": 0.31, - "grad_norm": 8.451123237609863, - "learning_rate": 1.795128796231332e-05, - "loss": 2.0371, + "epoch": 0.13, + "grad_norm": 11.512591361999512, + "learning_rate": 1.9144877212065433e-05, + "loss": 2.7708, "step": 1023 }, { - "epoch": 0.31, - "grad_norm": 13.38468074798584, - "learning_rate": 1.794928335170893e-05, - "loss": 2.1737, + "epoch": 0.13, + "grad_norm": 17.4201602935791, + "learning_rate": 1.9144040497008743e-05, + "loss": 4.4981, "step": 1024 }, { - "epoch": 0.31, - "grad_norm": 11.502458572387695, - "learning_rate": 1.794727874110454e-05, - "loss": 2.5593, + "epoch": 0.13, + "grad_norm": 8.540460586547852, + "learning_rate": 1.9143203781952057e-05, + "loss": 1.1926, "step": 1025 }, { - "epoch": 0.31, - "grad_norm": 8.056184768676758, - "learning_rate": 1.794527413050015e-05, - "loss": 2.0176, + "epoch": 0.13, + "grad_norm": 10.993705749511719, + "learning_rate": 1.914236706689537e-05, + "loss": 1.6827, "step": 1026 }, { - "epoch": 0.31, - "grad_norm": 12.703288078308105, - "learning_rate": 1.7943269519895762e-05, - "loss": 3.1376, + "epoch": 0.13, + "grad_norm": 14.419856071472168, + "learning_rate": 1.914153035183868e-05, + "loss": 3.2344, "step": 1027 }, { - "epoch": 0.31, - "grad_norm": 18.933555603027344, - "learning_rate": 1.7941264909291372e-05, - "loss": 2.6452, + "epoch": 0.13, + "grad_norm": 15.001279830932617, + "learning_rate": 1.9140693636781994e-05, + "loss": 3.0493, "step": 1028 }, { - "epoch": 0.31, - "grad_norm": 19.963056564331055, - "learning_rate": 1.7939260298686982e-05, - "loss": 3.3245, + "epoch": 0.13, + "grad_norm": 30.563142776489258, + "learning_rate": 1.9139856921725308e-05, + "loss": 3.8982, "step": 1029 }, { - "epoch": 0.31, - "grad_norm": 11.356467247009277, - "learning_rate": 1.7937255688082592e-05, - "loss": 2.8729, + "epoch": 0.13, + "grad_norm": 12.270652770996094, + "learning_rate": 1.913902020666862e-05, + "loss": 3.1659, "step": 1030 }, { - "epoch": 0.31, - "grad_norm": 22.96855926513672, - "learning_rate": 1.7935251077478202e-05, - "loss": 3.4278, + "epoch": 0.13, + "grad_norm": 11.87707805633545, + "learning_rate": 1.913818349161193e-05, + "loss": 1.5629, "step": 1031 }, { - "epoch": 0.31, - "grad_norm": 9.410229682922363, - "learning_rate": 1.793324646687381e-05, - "loss": 2.4024, + "epoch": 0.13, + "grad_norm": 14.46357250213623, + "learning_rate": 1.9137346776555245e-05, + "loss": 2.2168, "step": 1032 }, { - "epoch": 0.31, - "grad_norm": 15.325308799743652, - "learning_rate": 1.7931241856269422e-05, - "loss": 3.0026, + "epoch": 0.13, + "grad_norm": 12.322527885437012, + "learning_rate": 1.913651006149856e-05, + "loss": 2.4828, "step": 1033 }, { - "epoch": 0.31, - "grad_norm": 17.43353271484375, - "learning_rate": 1.7929237245665032e-05, - "loss": 3.4274, + "epoch": 0.13, + "grad_norm": 11.011430740356445, + "learning_rate": 1.913567334644187e-05, + "loss": 3.1574, "step": 1034 }, { - "epoch": 0.31, - "grad_norm": 18.91429328918457, - "learning_rate": 1.792723263506064e-05, - "loss": 2.8543, + "epoch": 0.13, + "grad_norm": 9.569558143615723, + "learning_rate": 1.9134836631385183e-05, + "loss": 3.0719, "step": 1035 }, { - "epoch": 0.31, - "grad_norm": 13.564133644104004, - "learning_rate": 1.7925228024456253e-05, - "loss": 1.9484, + "epoch": 0.13, + "grad_norm": 18.550840377807617, + "learning_rate": 1.9133999916328497e-05, + "loss": 2.4526, "step": 1036 }, { - "epoch": 0.31, - "grad_norm": 21.08160972595215, - "learning_rate": 1.792322341385186e-05, - "loss": 3.0899, + "epoch": 0.13, + "grad_norm": 33.769222259521484, + "learning_rate": 1.913316320127181e-05, + "loss": 2.9887, "step": 1037 }, { - "epoch": 0.31, - "grad_norm": 11.906721115112305, - "learning_rate": 1.792121880324747e-05, - "loss": 2.3356, + "epoch": 0.13, + "grad_norm": 24.496849060058594, + "learning_rate": 1.913232648621512e-05, + "loss": 2.9969, "step": 1038 }, { - "epoch": 0.31, - "grad_norm": 12.753665924072266, - "learning_rate": 1.791921419264308e-05, - "loss": 1.7421, + "epoch": 0.13, + "grad_norm": 9.507521629333496, + "learning_rate": 1.9131489771158434e-05, + "loss": 2.0447, "step": 1039 }, { - "epoch": 0.31, - "grad_norm": 13.137770652770996, - "learning_rate": 1.791720958203869e-05, - "loss": 3.0557, + "epoch": 0.13, + "grad_norm": 14.459228515625, + "learning_rate": 1.9130653056101748e-05, + "loss": 2.9356, "step": 1040 }, { - "epoch": 0.31, - "grad_norm": 18.3424129486084, - "learning_rate": 1.7915204971434303e-05, - "loss": 2.9359, + "epoch": 0.13, + "grad_norm": 7.550471782684326, + "learning_rate": 1.9129816341045058e-05, + "loss": 1.9061, "step": 1041 }, { - "epoch": 0.31, - "grad_norm": 14.686424255371094, - "learning_rate": 1.791320036082991e-05, - "loss": 2.947, + "epoch": 0.13, + "grad_norm": 17.238767623901367, + "learning_rate": 1.912897962598837e-05, + "loss": 4.294, "step": 1042 }, { - "epoch": 0.31, - "grad_norm": 18.53430938720703, - "learning_rate": 1.791119575022552e-05, - "loss": 3.6603, + "epoch": 0.13, + "grad_norm": 19.979581832885742, + "learning_rate": 1.9128142910931685e-05, + "loss": 3.699, "step": 1043 }, { - "epoch": 0.31, - "grad_norm": 12.61064338684082, - "learning_rate": 1.790919113962113e-05, - "loss": 3.071, + "epoch": 0.13, + "grad_norm": 22.493305206298828, + "learning_rate": 1.9127306195875e-05, + "loss": 3.2351, "step": 1044 }, { - "epoch": 0.31, - "grad_norm": 16.695178985595703, - "learning_rate": 1.790718652901674e-05, - "loss": 3.6908, + "epoch": 0.13, + "grad_norm": 11.701733589172363, + "learning_rate": 1.912646948081831e-05, + "loss": 2.3459, "step": 1045 }, { - "epoch": 0.31, - "grad_norm": 21.377464294433594, - "learning_rate": 1.790518191841235e-05, - "loss": 2.0132, + "epoch": 0.13, + "grad_norm": 19.428638458251953, + "learning_rate": 1.9125632765761623e-05, + "loss": 1.2258, "step": 1046 }, { - "epoch": 0.31, - "grad_norm": 12.307204246520996, - "learning_rate": 1.790317730780796e-05, - "loss": 3.3536, + "epoch": 0.13, + "grad_norm": 11.169611930847168, + "learning_rate": 1.9124796050704933e-05, + "loss": 0.8787, "step": 1047 }, { - "epoch": 0.32, - "grad_norm": 18.136398315429688, - "learning_rate": 1.790117269720357e-05, - "loss": 3.3327, + "epoch": 0.13, + "grad_norm": 10.714649200439453, + "learning_rate": 1.9123959335648247e-05, + "loss": 2.9222, "step": 1048 }, { - "epoch": 0.32, - "grad_norm": 14.812959671020508, - "learning_rate": 1.789916808659918e-05, - "loss": 2.203, + "epoch": 0.13, + "grad_norm": 17.899307250976562, + "learning_rate": 1.912312262059156e-05, + "loss": 2.7829, "step": 1049 }, { - "epoch": 0.32, - "grad_norm": 16.73038101196289, - "learning_rate": 1.789716347599479e-05, - "loss": 2.5606, + "epoch": 0.13, + "grad_norm": 21.250642776489258, + "learning_rate": 1.912228590553487e-05, + "loss": 3.9042, "step": 1050 }, { - "epoch": 0.32, - "grad_norm": 28.89952278137207, - "learning_rate": 1.7895158865390397e-05, - "loss": 3.7834, + "epoch": 0.13, + "grad_norm": 12.698055267333984, + "learning_rate": 1.9121449190478184e-05, + "loss": 3.1538, "step": 1051 }, { - "epoch": 0.32, - "grad_norm": 16.880115509033203, - "learning_rate": 1.789315425478601e-05, - "loss": 2.5196, + "epoch": 0.13, + "grad_norm": 63.32461166381836, + "learning_rate": 1.9120612475421495e-05, + "loss": 2.2006, "step": 1052 }, { - "epoch": 0.32, - "grad_norm": 13.364277839660645, - "learning_rate": 1.789114964418162e-05, - "loss": 2.9332, + "epoch": 0.13, + "grad_norm": 14.6691312789917, + "learning_rate": 1.9119775760364808e-05, + "loss": 3.7852, "step": 1053 }, { - "epoch": 0.32, - "grad_norm": 34.66802978515625, - "learning_rate": 1.7889145033577227e-05, - "loss": 3.8191, + "epoch": 0.13, + "grad_norm": 12.2632474899292, + "learning_rate": 1.9118939045308122e-05, + "loss": 1.982, "step": 1054 }, { - "epoch": 0.32, - "grad_norm": 31.339599609375, - "learning_rate": 1.788714042297284e-05, - "loss": 2.9259, + "epoch": 0.13, + "grad_norm": 20.339496612548828, + "learning_rate": 1.9118102330251432e-05, + "loss": 3.9451, "step": 1055 }, { - "epoch": 0.32, - "grad_norm": 20.67884063720703, - "learning_rate": 1.7885135812368447e-05, - "loss": 2.7581, + "epoch": 0.13, + "grad_norm": 11.595175743103027, + "learning_rate": 1.9117265615194746e-05, + "loss": 1.8827, "step": 1056 }, { - "epoch": 0.32, - "grad_norm": 15.182465553283691, - "learning_rate": 1.7883131201764058e-05, - "loss": 2.8311, + "epoch": 0.13, + "grad_norm": 17.194581985473633, + "learning_rate": 1.911642890013806e-05, + "loss": 0.9844, "step": 1057 }, { - "epoch": 0.32, - "grad_norm": 18.39655303955078, - "learning_rate": 1.7881126591159668e-05, - "loss": 2.7088, + "epoch": 0.13, + "grad_norm": 12.588428497314453, + "learning_rate": 1.9115592185081373e-05, + "loss": 3.0674, "step": 1058 }, { - "epoch": 0.32, - "grad_norm": 15.220178604125977, - "learning_rate": 1.7879121980555278e-05, - "loss": 3.1082, + "epoch": 0.13, + "grad_norm": 14.255656242370605, + "learning_rate": 1.9114755470024683e-05, + "loss": 3.8112, "step": 1059 }, { - "epoch": 0.32, - "grad_norm": 13.494277954101562, - "learning_rate": 1.7877117369950888e-05, - "loss": 2.8682, + "epoch": 0.13, + "grad_norm": 18.233028411865234, + "learning_rate": 1.9113918754967997e-05, + "loss": 3.1386, "step": 1060 }, { - "epoch": 0.32, - "grad_norm": 19.83290672302246, - "learning_rate": 1.7875112759346498e-05, - "loss": 3.3298, + "epoch": 0.13, + "grad_norm": 10.391460418701172, + "learning_rate": 1.911308203991131e-05, + "loss": 1.8707, "step": 1061 }, { - "epoch": 0.32, - "grad_norm": 22.487585067749023, - "learning_rate": 1.7873108148742108e-05, - "loss": 3.2538, + "epoch": 0.13, + "grad_norm": 17.744470596313477, + "learning_rate": 1.911224532485462e-05, + "loss": 2.09, "step": 1062 }, { - "epoch": 0.32, - "grad_norm": 24.89381217956543, - "learning_rate": 1.7871103538137718e-05, - "loss": 2.5621, + "epoch": 0.13, + "grad_norm": 13.269696235656738, + "learning_rate": 1.9111408609797935e-05, + "loss": 3.8912, "step": 1063 }, { - "epoch": 0.32, - "grad_norm": 17.16600227355957, - "learning_rate": 1.7869098927533328e-05, - "loss": 2.8225, + "epoch": 0.13, + "grad_norm": 52.91433334350586, + "learning_rate": 1.9110571894741248e-05, + "loss": 2.4628, "step": 1064 }, { - "epoch": 0.32, - "grad_norm": 10.632941246032715, - "learning_rate": 1.7867094316928938e-05, - "loss": 3.0174, + "epoch": 0.13, + "grad_norm": 10.043838500976562, + "learning_rate": 1.9109735179684562e-05, + "loss": 1.085, "step": 1065 }, { - "epoch": 0.32, - "grad_norm": 20.951936721801758, - "learning_rate": 1.7865089706324548e-05, - "loss": 3.5367, + "epoch": 0.13, + "grad_norm": 11.903457641601562, + "learning_rate": 1.9108898464627872e-05, + "loss": 1.5179, "step": 1066 }, { - "epoch": 0.32, - "grad_norm": 15.389318466186523, - "learning_rate": 1.786308509572016e-05, - "loss": 2.2259, + "epoch": 0.13, + "grad_norm": 11.420485496520996, + "learning_rate": 1.9108061749571186e-05, + "loss": 2.3194, "step": 1067 }, { - "epoch": 0.32, - "grad_norm": 44.66872787475586, - "learning_rate": 1.786108048511577e-05, - "loss": 4.754, + "epoch": 0.13, + "grad_norm": 14.506755828857422, + "learning_rate": 1.91072250345145e-05, + "loss": 3.0316, "step": 1068 }, { - "epoch": 0.32, - "grad_norm": 12.777280807495117, - "learning_rate": 1.785907587451138e-05, - "loss": 3.2503, + "epoch": 0.13, + "grad_norm": 8.62365436553955, + "learning_rate": 1.910638831945781e-05, + "loss": 3.8688, "step": 1069 }, { - "epoch": 0.32, - "grad_norm": 130.45664978027344, - "learning_rate": 1.7857071263906985e-05, - "loss": 2.8328, + "epoch": 0.13, + "grad_norm": 15.223709106445312, + "learning_rate": 1.9105551604401123e-05, + "loss": 3.7763, "step": 1070 }, { - "epoch": 0.32, - "grad_norm": 25.45984649658203, - "learning_rate": 1.78550666533026e-05, - "loss": 2.7516, + "epoch": 0.13, + "grad_norm": 18.46766471862793, + "learning_rate": 1.9104714889344437e-05, + "loss": 2.5704, "step": 1071 }, { - "epoch": 0.32, - "grad_norm": 15.323507308959961, - "learning_rate": 1.785306204269821e-05, - "loss": 2.0025, + "epoch": 0.13, + "grad_norm": 10.691957473754883, + "learning_rate": 1.9103878174287747e-05, + "loss": 2.6815, "step": 1072 }, { - "epoch": 0.32, - "grad_norm": 12.247516632080078, - "learning_rate": 1.7851057432093815e-05, - "loss": 2.7504, + "epoch": 0.13, + "grad_norm": 9.40165901184082, + "learning_rate": 1.910304145923106e-05, + "loss": 2.6142, "step": 1073 }, { - "epoch": 0.32, - "grad_norm": 33.093528747558594, - "learning_rate": 1.784905282148943e-05, - "loss": 3.8759, + "epoch": 0.13, + "grad_norm": 19.226882934570312, + "learning_rate": 1.9102204744174375e-05, + "loss": 4.4681, "step": 1074 }, { - "epoch": 0.32, - "grad_norm": 19.78754234313965, - "learning_rate": 1.7847048210885036e-05, - "loss": 3.0278, + "epoch": 0.13, + "grad_norm": 16.283008575439453, + "learning_rate": 1.9101368029117685e-05, + "loss": 3.1899, "step": 1075 }, { - "epoch": 0.32, - "grad_norm": 15.436639785766602, - "learning_rate": 1.7845043600280646e-05, - "loss": 3.5511, + "epoch": 0.14, + "grad_norm": 23.305496215820312, + "learning_rate": 1.9100531314061e-05, + "loss": 4.6672, "step": 1076 }, { - "epoch": 0.32, - "grad_norm": 12.693981170654297, - "learning_rate": 1.784303898967626e-05, - "loss": 3.4346, + "epoch": 0.14, + "grad_norm": 36.387516021728516, + "learning_rate": 1.909969459900431e-05, + "loss": 2.1056, "step": 1077 }, { - "epoch": 0.32, - "grad_norm": 17.650697708129883, - "learning_rate": 1.7841034379071866e-05, - "loss": 2.3388, + "epoch": 0.14, + "grad_norm": 40.60980224609375, + "learning_rate": 1.9098857883947622e-05, + "loss": 3.5833, "step": 1078 }, { - "epoch": 0.32, - "grad_norm": 18.479936599731445, - "learning_rate": 1.7839029768467476e-05, - "loss": 3.4169, + "epoch": 0.14, + "grad_norm": 13.623090744018555, + "learning_rate": 1.9098021168890936e-05, + "loss": 3.3983, "step": 1079 }, { - "epoch": 0.32, - "grad_norm": 14.039420127868652, - "learning_rate": 1.7837025157863086e-05, - "loss": 3.0089, - "step": 1080 - }, - { - "epoch": 0.32, - "eval_loss": 0.5645097494125366, - "eval_runtime": 43.9047, - "eval_samples_per_second": 33.687, - "eval_steps_per_second": 33.687, + "epoch": 0.14, + "grad_norm": 16.30078125, + "learning_rate": 1.9097184453834246e-05, + "loss": 2.0246, "step": 1080 }, { - "epoch": 0.33, - "grad_norm": 18.384023666381836, - "learning_rate": 1.7835020547258696e-05, - "loss": 5.0499, + "epoch": 0.14, + "grad_norm": 26.719799041748047, + "learning_rate": 1.909634773877756e-05, + "loss": 2.9841, "step": 1081 }, { - "epoch": 0.33, - "grad_norm": 14.556102752685547, - "learning_rate": 1.7833015936654306e-05, - "loss": 2.7198, + "epoch": 0.14, + "grad_norm": 15.77614974975586, + "learning_rate": 1.9095511023720874e-05, + "loss": 2.7169, "step": 1082 }, { - "epoch": 0.33, - "grad_norm": 16.748477935791016, - "learning_rate": 1.7831011326049916e-05, - "loss": 2.7809, + "epoch": 0.14, + "grad_norm": 20.72202491760254, + "learning_rate": 1.9094674308664184e-05, + "loss": 3.6718, "step": 1083 }, { - "epoch": 0.33, - "grad_norm": 16.843488693237305, - "learning_rate": 1.7829006715445526e-05, - "loss": 3.1988, + "epoch": 0.14, + "grad_norm": 19.182992935180664, + "learning_rate": 1.9093837593607497e-05, + "loss": 4.8896, "step": 1084 }, { - "epoch": 0.33, - "grad_norm": 60.20880889892578, - "learning_rate": 1.7827002104841137e-05, - "loss": 4.0739, + "epoch": 0.14, + "grad_norm": 9.58013916015625, + "learning_rate": 1.909300087855081e-05, + "loss": 2.4248, "step": 1085 }, { - "epoch": 0.33, - "grad_norm": 22.219043731689453, - "learning_rate": 1.7824997494236747e-05, - "loss": 3.1217, + "epoch": 0.14, + "grad_norm": 16.581295013427734, + "learning_rate": 1.9092164163494125e-05, + "loss": 4.0646, "step": 1086 }, { - "epoch": 0.33, - "grad_norm": 15.742715835571289, - "learning_rate": 1.7822992883632357e-05, - "loss": 2.1444, + "epoch": 0.14, + "grad_norm": 18.470375061035156, + "learning_rate": 1.9091327448437435e-05, + "loss": 4.0107, "step": 1087 }, { - "epoch": 0.33, - "grad_norm": 13.159707069396973, - "learning_rate": 1.7820988273027967e-05, - "loss": 3.7656, + "epoch": 0.14, + "grad_norm": 13.326085090637207, + "learning_rate": 1.909049073338075e-05, + "loss": 3.0869, "step": 1088 }, { - "epoch": 0.33, - "grad_norm": 18.79963493347168, - "learning_rate": 1.7818983662423573e-05, - "loss": 3.2334, + "epoch": 0.14, + "grad_norm": 16.086368560791016, + "learning_rate": 1.9089654018324062e-05, + "loss": 3.2626, "step": 1089 }, { - "epoch": 0.33, - "grad_norm": 12.51858139038086, - "learning_rate": 1.7816979051819187e-05, - "loss": 1.6161, + "epoch": 0.14, + "grad_norm": 26.71560287475586, + "learning_rate": 1.9088817303267373e-05, + "loss": 3.6289, "step": 1090 }, { - "epoch": 0.33, - "grad_norm": 18.549959182739258, - "learning_rate": 1.7814974441214797e-05, - "loss": 2.2857, + "epoch": 0.14, + "grad_norm": 11.671117782592773, + "learning_rate": 1.9087980588210686e-05, + "loss": 2.8107, "step": 1091 }, { - "epoch": 0.33, - "grad_norm": 17.99864959716797, - "learning_rate": 1.7812969830610404e-05, - "loss": 2.2779, + "epoch": 0.14, + "grad_norm": 17.49782371520996, + "learning_rate": 1.9087143873154e-05, + "loss": 4.3494, "step": 1092 }, { - "epoch": 0.33, - "grad_norm": 16.206727981567383, - "learning_rate": 1.7810965220006017e-05, - "loss": 2.1039, + "epoch": 0.14, + "grad_norm": 13.312760353088379, + "learning_rate": 1.9086307158097314e-05, + "loss": 1.8361, "step": 1093 }, { - "epoch": 0.33, - "grad_norm": 14.30031681060791, - "learning_rate": 1.7808960609401624e-05, - "loss": 3.3124, + "epoch": 0.14, + "grad_norm": 13.810235977172852, + "learning_rate": 1.9085470443040624e-05, + "loss": 3.8031, "step": 1094 }, { - "epoch": 0.33, - "grad_norm": 48.405094146728516, - "learning_rate": 1.7806955998797234e-05, - "loss": 3.8335, + "epoch": 0.14, + "grad_norm": 41.52159881591797, + "learning_rate": 1.9084633727983937e-05, + "loss": 4.49, "step": 1095 }, { - "epoch": 0.33, - "grad_norm": 17.39105224609375, - "learning_rate": 1.7804951388192847e-05, - "loss": 2.8733, + "epoch": 0.14, + "grad_norm": 13.56728458404541, + "learning_rate": 1.908379701292725e-05, + "loss": 2.4451, "step": 1096 }, { - "epoch": 0.33, - "grad_norm": 23.60736656188965, - "learning_rate": 1.7802946777588454e-05, - "loss": 2.9195, + "epoch": 0.14, + "grad_norm": 16.606473922729492, + "learning_rate": 1.908296029787056e-05, + "loss": 2.3632, "step": 1097 }, { - "epoch": 0.33, - "grad_norm": 18.69573402404785, - "learning_rate": 1.7800942166984064e-05, - "loss": 3.4923, + "epoch": 0.14, + "grad_norm": 15.629426956176758, + "learning_rate": 1.9082123582813875e-05, + "loss": 3.2804, "step": 1098 }, { - "epoch": 0.33, - "grad_norm": 31.806943893432617, - "learning_rate": 1.7798937556379674e-05, - "loss": 2.9115, + "epoch": 0.14, + "grad_norm": 14.610381126403809, + "learning_rate": 1.908128686775719e-05, + "loss": 3.0428, "step": 1099 }, { - "epoch": 0.33, - "grad_norm": 13.861745834350586, - "learning_rate": 1.7796932945775284e-05, - "loss": 1.3565, + "epoch": 0.14, + "grad_norm": 31.57737159729004, + "learning_rate": 1.90804501527005e-05, + "loss": 4.185, "step": 1100 }, { - "epoch": 0.33, - "grad_norm": 23.198631286621094, - "learning_rate": 1.7794928335170894e-05, - "loss": 3.0973, + "epoch": 0.14, + "grad_norm": 11.78358268737793, + "learning_rate": 1.9079613437643813e-05, + "loss": 3.4115, "step": 1101 }, { - "epoch": 0.33, - "grad_norm": 16.94276237487793, - "learning_rate": 1.7792923724566505e-05, - "loss": 3.5986, + "epoch": 0.14, + "grad_norm": 12.632600784301758, + "learning_rate": 1.9078776722587126e-05, + "loss": 3.2309, "step": 1102 }, { - "epoch": 0.33, - "grad_norm": 9.507773399353027, - "learning_rate": 1.7790919113962115e-05, - "loss": 1.7677, + "epoch": 0.14, + "grad_norm": 13.208819389343262, + "learning_rate": 1.9077940007530436e-05, + "loss": 2.3418, "step": 1103 }, { - "epoch": 0.33, - "grad_norm": 61.569091796875, - "learning_rate": 1.7788914503357725e-05, - "loss": 2.5007, + "epoch": 0.14, + "grad_norm": 6.310235977172852, + "learning_rate": 1.907710329247375e-05, + "loss": 0.2954, "step": 1104 }, { - "epoch": 0.33, - "grad_norm": 16.81398582458496, - "learning_rate": 1.7786909892753335e-05, - "loss": 3.5078, + "epoch": 0.14, + "grad_norm": 16.388343811035156, + "learning_rate": 1.907626657741706e-05, + "loss": 2.5308, "step": 1105 }, { - "epoch": 0.33, - "grad_norm": 15.168441772460938, - "learning_rate": 1.778490528214894e-05, - "loss": 2.3151, + "epoch": 0.14, + "grad_norm": 10.25705623626709, + "learning_rate": 1.9075429862360374e-05, + "loss": 2.5711, "step": 1106 }, { - "epoch": 0.33, - "grad_norm": 16.73635482788086, - "learning_rate": 1.7782900671544555e-05, - "loss": 3.0094, + "epoch": 0.14, + "grad_norm": 9.988468170166016, + "learning_rate": 1.9074593147303688e-05, + "loss": 1.9492, "step": 1107 }, { - "epoch": 0.33, - "grad_norm": 16.253849029541016, - "learning_rate": 1.7780896060940165e-05, - "loss": 2.7682, + "epoch": 0.14, + "grad_norm": 20.494890213012695, + "learning_rate": 1.9073756432246998e-05, + "loss": 2.5956, "step": 1108 }, { - "epoch": 0.33, - "grad_norm": 14.130172729492188, - "learning_rate": 1.7778891450335775e-05, - "loss": 3.0624, + "epoch": 0.14, + "grad_norm": 15.59097671508789, + "learning_rate": 1.907291971719031e-05, + "loss": 5.2766, "step": 1109 }, { - "epoch": 0.33, - "grad_norm": 17.684720993041992, - "learning_rate": 1.7776886839731385e-05, - "loss": 3.3596, + "epoch": 0.14, + "grad_norm": 15.705918312072754, + "learning_rate": 1.9072083002133625e-05, + "loss": 3.8288, "step": 1110 }, { - "epoch": 0.33, - "grad_norm": 29.59230613708496, - "learning_rate": 1.7774882229126992e-05, - "loss": 3.1807, + "epoch": 0.14, + "grad_norm": 11.32144832611084, + "learning_rate": 1.9071246287076935e-05, + "loss": 1.4441, "step": 1111 }, { - "epoch": 0.33, - "grad_norm": 11.93481159210205, - "learning_rate": 1.7772877618522605e-05, - "loss": 2.8107, + "epoch": 0.14, + "grad_norm": 13.291257858276367, + "learning_rate": 1.907040957202025e-05, + "loss": 2.6686, "step": 1112 }, { - "epoch": 0.33, - "grad_norm": 16.962127685546875, - "learning_rate": 1.7770873007918212e-05, - "loss": 2.9108, + "epoch": 0.14, + "grad_norm": 19.998872756958008, + "learning_rate": 1.9069572856963563e-05, + "loss": 2.3493, "step": 1113 }, { - "epoch": 0.33, - "grad_norm": 13.559270858764648, - "learning_rate": 1.7768868397313822e-05, - "loss": 3.5231, + "epoch": 0.14, + "grad_norm": 18.568021774291992, + "learning_rate": 1.9068736141906876e-05, + "loss": 2.7368, "step": 1114 }, { - "epoch": 0.34, - "grad_norm": 12.885640144348145, - "learning_rate": 1.7766863786709436e-05, - "loss": 2.7105, + "epoch": 0.14, + "grad_norm": 20.428361892700195, + "learning_rate": 1.9067899426850187e-05, + "loss": 3.5724, "step": 1115 }, { - "epoch": 0.34, - "grad_norm": 29.005674362182617, - "learning_rate": 1.7764859176105042e-05, - "loss": 3.413, + "epoch": 0.14, + "grad_norm": 28.480815887451172, + "learning_rate": 1.90670627117935e-05, + "loss": 4.4699, "step": 1116 }, { - "epoch": 0.34, - "grad_norm": 13.02987289428711, - "learning_rate": 1.7762854565500652e-05, - "loss": 1.8334, + "epoch": 0.14, + "grad_norm": 11.410590171813965, + "learning_rate": 1.9066225996736814e-05, + "loss": 4.0421, "step": 1117 }, { - "epoch": 0.34, - "grad_norm": 14.34976863861084, - "learning_rate": 1.7760849954896263e-05, - "loss": 1.5514, + "epoch": 0.14, + "grad_norm": 17.83932113647461, + "learning_rate": 1.9065389281680124e-05, + "loss": 5.0705, "step": 1118 }, { - "epoch": 0.34, - "grad_norm": 22.86460304260254, - "learning_rate": 1.7758845344291873e-05, - "loss": 3.4641, + "epoch": 0.14, + "grad_norm": 16.42825698852539, + "learning_rate": 1.9064552566623438e-05, + "loss": 2.6597, "step": 1119 }, { - "epoch": 0.34, - "grad_norm": 28.6113224029541, - "learning_rate": 1.7756840733687483e-05, - "loss": 3.761, + "epoch": 0.14, + "grad_norm": 18.98484230041504, + "learning_rate": 1.906371585156675e-05, + "loss": 3.6132, "step": 1120 }, { - "epoch": 0.34, - "grad_norm": 16.072839736938477, - "learning_rate": 1.7754836123083093e-05, - "loss": 1.5352, + "epoch": 0.14, + "grad_norm": 17.578466415405273, + "learning_rate": 1.9062879136510065e-05, + "loss": 3.7327, "step": 1121 }, { - "epoch": 0.34, - "grad_norm": 18.935298919677734, - "learning_rate": 1.7752831512478703e-05, - "loss": 2.8795, + "epoch": 0.14, + "grad_norm": 12.911381721496582, + "learning_rate": 1.9062042421453375e-05, + "loss": 2.7183, "step": 1122 }, { - "epoch": 0.34, - "grad_norm": 22.4312686920166, - "learning_rate": 1.7750826901874313e-05, - "loss": 4.1038, + "epoch": 0.14, + "grad_norm": 14.096404075622559, + "learning_rate": 1.906120570639669e-05, + "loss": 1.5354, "step": 1123 }, { - "epoch": 0.34, - "grad_norm": 13.36023998260498, - "learning_rate": 1.7748822291269923e-05, - "loss": 2.9175, + "epoch": 0.14, + "grad_norm": 16.299760818481445, + "learning_rate": 1.9060368991340003e-05, + "loss": 2.2153, "step": 1124 }, { - "epoch": 0.34, - "grad_norm": 10.283577919006348, - "learning_rate": 1.774681768066553e-05, - "loss": 1.6504, + "epoch": 0.14, + "grad_norm": 30.454317092895508, + "learning_rate": 1.9059532276283313e-05, + "loss": 2.3884, "step": 1125 }, { - "epoch": 0.34, - "grad_norm": 16.4921817779541, - "learning_rate": 1.7744813070061143e-05, - "loss": 3.4366, + "epoch": 0.14, + "grad_norm": 16.80135726928711, + "learning_rate": 1.9058695561226627e-05, + "loss": 2.7318, "step": 1126 }, { - "epoch": 0.34, - "grad_norm": 24.69314193725586, - "learning_rate": 1.7742808459456753e-05, - "loss": 3.5184, + "epoch": 0.14, + "grad_norm": 17.622236251831055, + "learning_rate": 1.905785884616994e-05, + "loss": 1.9797, "step": 1127 }, { - "epoch": 0.34, - "grad_norm": 15.775534629821777, - "learning_rate": 1.774080384885236e-05, - "loss": 3.0247, + "epoch": 0.14, + "grad_norm": 18.480009078979492, + "learning_rate": 1.905702213111325e-05, + "loss": 4.2706, "step": 1128 }, { - "epoch": 0.34, - "grad_norm": 35.886539459228516, - "learning_rate": 1.7738799238247973e-05, - "loss": 3.0789, + "epoch": 0.14, + "grad_norm": 24.628833770751953, + "learning_rate": 1.9056185416056564e-05, + "loss": 2.9627, "step": 1129 }, { - "epoch": 0.34, - "grad_norm": 13.481932640075684, - "learning_rate": 1.773679462764358e-05, - "loss": 2.6866, + "epoch": 0.14, + "grad_norm": 13.772994041442871, + "learning_rate": 1.9055348700999874e-05, + "loss": 2.6898, "step": 1130 }, { - "epoch": 0.34, - "grad_norm": 13.571722030639648, - "learning_rate": 1.773479001703919e-05, - "loss": 3.6435, + "epoch": 0.14, + "grad_norm": 14.796730041503906, + "learning_rate": 1.9054511985943188e-05, + "loss": 4.5945, "step": 1131 }, { - "epoch": 0.34, - "grad_norm": 12.800562858581543, - "learning_rate": 1.77327854064348e-05, - "loss": 2.6728, + "epoch": 0.14, + "grad_norm": 16.073083877563477, + "learning_rate": 1.9053675270886502e-05, + "loss": 3.6796, "step": 1132 }, { - "epoch": 0.34, - "grad_norm": 28.864028930664062, - "learning_rate": 1.773078079583041e-05, - "loss": 1.8873, + "epoch": 0.14, + "grad_norm": 12.354791641235352, + "learning_rate": 1.9052838555829812e-05, + "loss": 2.6927, "step": 1133 }, { - "epoch": 0.34, - "grad_norm": 23.177940368652344, - "learning_rate": 1.772877618522602e-05, - "loss": 3.7982, + "epoch": 0.14, + "grad_norm": 32.54914093017578, + "learning_rate": 1.9052001840773126e-05, + "loss": 3.332, "step": 1134 }, { - "epoch": 0.34, - "grad_norm": 18.07777214050293, - "learning_rate": 1.772677157462163e-05, - "loss": 3.1977, + "epoch": 0.14, + "grad_norm": 11.939435005187988, + "learning_rate": 1.905116512571644e-05, + "loss": 4.0946, "step": 1135 }, { - "epoch": 0.34, - "grad_norm": 25.78776741027832, - "learning_rate": 1.772476696401724e-05, - "loss": 2.4591, + "epoch": 0.14, + "grad_norm": 19.629676818847656, + "learning_rate": 1.905032841065975e-05, + "loss": 2.8144, "step": 1136 }, { - "epoch": 0.34, - "grad_norm": 14.545743942260742, - "learning_rate": 1.772276235341285e-05, - "loss": 2.5043, + "epoch": 0.14, + "grad_norm": 15.602178573608398, + "learning_rate": 1.9049491695603063e-05, + "loss": 1.3764, "step": 1137 }, { - "epoch": 0.34, - "grad_norm": 12.708026885986328, - "learning_rate": 1.772075774280846e-05, - "loss": 3.234, + "epoch": 0.14, + "grad_norm": 13.58865737915039, + "learning_rate": 1.9048654980546377e-05, + "loss": 2.1669, "step": 1138 }, { - "epoch": 0.34, - "grad_norm": 15.227660179138184, - "learning_rate": 1.771875313220407e-05, - "loss": 3.9899, + "epoch": 0.14, + "grad_norm": 10.807489395141602, + "learning_rate": 1.9047818265489687e-05, + "loss": 3.0976, "step": 1139 }, { - "epoch": 0.34, - "grad_norm": 23.074007034301758, - "learning_rate": 1.771674852159968e-05, - "loss": 1.94, + "epoch": 0.14, + "grad_norm": 19.740665435791016, + "learning_rate": 1.9046981550433e-05, + "loss": 2.1226, "step": 1140 }, { - "epoch": 0.34, - "grad_norm": 20.678417205810547, - "learning_rate": 1.771474391099529e-05, - "loss": 3.0762, + "epoch": 0.14, + "grad_norm": 27.950868606567383, + "learning_rate": 1.9046144835376314e-05, + "loss": 4.2818, "step": 1141 }, { - "epoch": 0.34, - "grad_norm": 10.947648048400879, - "learning_rate": 1.77127393003909e-05, - "loss": 3.0063, + "epoch": 0.14, + "grad_norm": 12.425821304321289, + "learning_rate": 1.9045308120319628e-05, + "loss": 3.3513, "step": 1142 }, { - "epoch": 0.34, - "grad_norm": 12.576761245727539, - "learning_rate": 1.771073468978651e-05, - "loss": 2.5958, + "epoch": 0.14, + "grad_norm": 12.599128723144531, + "learning_rate": 1.904447140526294e-05, + "loss": 2.0619, "step": 1143 }, { - "epoch": 0.34, - "grad_norm": 25.190330505371094, - "learning_rate": 1.7708730079182118e-05, - "loss": 2.9465, + "epoch": 0.14, + "grad_norm": 9.102954864501953, + "learning_rate": 1.9043634690206252e-05, + "loss": 2.0338, "step": 1144 }, { - "epoch": 0.34, - "grad_norm": 14.625896453857422, - "learning_rate": 1.770672546857773e-05, - "loss": 2.337, + "epoch": 0.14, + "grad_norm": 16.797597885131836, + "learning_rate": 1.9042797975149566e-05, + "loss": 3.4905, "step": 1145 }, { - "epoch": 0.34, - "grad_norm": 13.233895301818848, - "learning_rate": 1.770472085797334e-05, - "loss": 2.7883, + "epoch": 0.14, + "grad_norm": 12.28773307800293, + "learning_rate": 1.9041961260092876e-05, + "loss": 2.2691, "step": 1146 }, { - "epoch": 0.34, - "grad_norm": 11.731785774230957, - "learning_rate": 1.7702716247368948e-05, - "loss": 2.4522, + "epoch": 0.14, + "grad_norm": 14.749690055847168, + "learning_rate": 1.904112454503619e-05, + "loss": 4.179, "step": 1147 }, { - "epoch": 0.35, - "grad_norm": 13.378613471984863, - "learning_rate": 1.770071163676456e-05, - "loss": 2.6688, + "epoch": 0.14, + "grad_norm": 14.806706428527832, + "learning_rate": 1.9040287829979503e-05, + "loss": 3.3297, "step": 1148 }, { - "epoch": 0.35, - "grad_norm": 34.357383728027344, - "learning_rate": 1.769870702616017e-05, - "loss": 3.0263, + "epoch": 0.14, + "grad_norm": 26.46034049987793, + "learning_rate": 1.9039451114922817e-05, + "loss": 3.7049, "step": 1149 }, { - "epoch": 0.35, - "grad_norm": 13.445833206176758, - "learning_rate": 1.769670241555578e-05, - "loss": 1.7473, + "epoch": 0.14, + "grad_norm": 14.55521297454834, + "learning_rate": 1.9038614399866127e-05, + "loss": 3.3959, "step": 1150 }, { - "epoch": 0.35, - "grad_norm": 13.28929328918457, - "learning_rate": 1.7694697804951392e-05, - "loss": 2.3902, + "epoch": 0.14, + "grad_norm": 15.360191345214844, + "learning_rate": 1.903777768480944e-05, + "loss": 3.5524, "step": 1151 }, { - "epoch": 0.35, - "grad_norm": 19.005510330200195, - "learning_rate": 1.7692693194347e-05, - "loss": 2.506, + "epoch": 0.14, + "grad_norm": 17.11458969116211, + "learning_rate": 1.9036940969752754e-05, + "loss": 3.3466, "step": 1152 }, { - "epoch": 0.35, - "grad_norm": 18.774620056152344, - "learning_rate": 1.769068858374261e-05, - "loss": 2.8765, + "epoch": 0.14, + "grad_norm": 10.738466262817383, + "learning_rate": 1.9036104254696065e-05, + "loss": 2.8425, "step": 1153 }, { - "epoch": 0.35, - "grad_norm": 21.18247413635254, - "learning_rate": 1.768868397313822e-05, - "loss": 2.4275, + "epoch": 0.14, + "grad_norm": 9.902814865112305, + "learning_rate": 1.903526753963938e-05, + "loss": 2.3491, "step": 1154 }, { - "epoch": 0.35, - "grad_norm": 22.34467124938965, - "learning_rate": 1.768667936253383e-05, - "loss": 2.6174, + "epoch": 0.14, + "grad_norm": 14.706478118896484, + "learning_rate": 1.9034430824582692e-05, + "loss": 1.6602, "step": 1155 }, { - "epoch": 0.35, - "grad_norm": 28.30898094177246, - "learning_rate": 1.768467475192944e-05, - "loss": 3.1029, + "epoch": 0.15, + "grad_norm": 6.456505298614502, + "learning_rate": 1.9033594109526002e-05, + "loss": 1.7329, "step": 1156 }, { - "epoch": 0.35, - "grad_norm": 12.033406257629395, - "learning_rate": 1.768267014132505e-05, - "loss": 3.4031, + "epoch": 0.15, + "grad_norm": 11.623553276062012, + "learning_rate": 1.9032757394469316e-05, + "loss": 1.9712, "step": 1157 }, { - "epoch": 0.35, - "grad_norm": 16.54393768310547, - "learning_rate": 1.768066553072066e-05, - "loss": 2.6641, + "epoch": 0.15, + "grad_norm": 13.264073371887207, + "learning_rate": 1.9031920679412626e-05, + "loss": 5.3528, "step": 1158 }, { - "epoch": 0.35, - "grad_norm": 29.7037410736084, - "learning_rate": 1.767866092011627e-05, - "loss": 2.8023, + "epoch": 0.15, + "grad_norm": 20.779993057250977, + "learning_rate": 1.903108396435594e-05, + "loss": 3.1254, "step": 1159 }, { - "epoch": 0.35, - "grad_norm": 17.149572372436523, - "learning_rate": 1.767665630951188e-05, - "loss": 3.3883, + "epoch": 0.15, + "grad_norm": 11.741122245788574, + "learning_rate": 1.9030247249299253e-05, + "loss": 1.6191, "step": 1160 }, { - "epoch": 0.35, - "grad_norm": 32.76160430908203, - "learning_rate": 1.767465169890749e-05, - "loss": 3.1329, + "epoch": 0.15, + "grad_norm": 12.015575408935547, + "learning_rate": 1.9029410534242564e-05, + "loss": 3.5642, "step": 1161 }, { - "epoch": 0.35, - "grad_norm": 13.081724166870117, - "learning_rate": 1.76726470883031e-05, - "loss": 3.1289, + "epoch": 0.15, + "grad_norm": 10.663491249084473, + "learning_rate": 1.9028573819185877e-05, + "loss": 0.9802, "step": 1162 }, { - "epoch": 0.35, - "grad_norm": 29.665246963500977, - "learning_rate": 1.767064247769871e-05, - "loss": 3.6106, + "epoch": 0.15, + "grad_norm": 8.336151123046875, + "learning_rate": 1.902773710412919e-05, + "loss": 2.5828, "step": 1163 }, { - "epoch": 0.35, - "grad_norm": 19.989797592163086, - "learning_rate": 1.766863786709432e-05, - "loss": 3.4249, + "epoch": 0.15, + "grad_norm": 18.085256576538086, + "learning_rate": 1.90269003890725e-05, + "loss": 3.497, "step": 1164 }, { - "epoch": 0.35, - "grad_norm": 11.273635864257812, - "learning_rate": 1.766663325648993e-05, - "loss": 3.0172, + "epoch": 0.15, + "grad_norm": 16.504741668701172, + "learning_rate": 1.9026063674015815e-05, + "loss": 3.6238, "step": 1165 }, { - "epoch": 0.35, - "grad_norm": 25.350624084472656, - "learning_rate": 1.7664628645885536e-05, - "loss": 2.8628, + "epoch": 0.15, + "grad_norm": 13.649950981140137, + "learning_rate": 1.902522695895913e-05, + "loss": 3.8243, "step": 1166 }, { - "epoch": 0.35, - "grad_norm": 14.034561157226562, - "learning_rate": 1.766262403528115e-05, - "loss": 2.7608, + "epoch": 0.15, + "grad_norm": 16.09895133972168, + "learning_rate": 1.902439024390244e-05, + "loss": 2.7386, "step": 1167 }, { - "epoch": 0.35, - "grad_norm": 20.618732452392578, - "learning_rate": 1.7660619424676757e-05, - "loss": 2.6732, + "epoch": 0.15, + "grad_norm": 12.028850555419922, + "learning_rate": 1.9023553528845752e-05, + "loss": 2.5787, "step": 1168 }, { - "epoch": 0.35, - "grad_norm": 18.557788848876953, - "learning_rate": 1.7658614814072367e-05, - "loss": 4.2311, + "epoch": 0.15, + "grad_norm": 12.9425630569458, + "learning_rate": 1.9022716813789066e-05, + "loss": 2.921, "step": 1169 }, { - "epoch": 0.35, - "grad_norm": 22.748462677001953, - "learning_rate": 1.765661020346798e-05, - "loss": 3.0337, + "epoch": 0.15, + "grad_norm": 12.585747718811035, + "learning_rate": 1.902188009873238e-05, + "loss": 5.5789, "step": 1170 }, { - "epoch": 0.35, - "grad_norm": 15.054831504821777, - "learning_rate": 1.7654605592863587e-05, - "loss": 2.5534, + "epoch": 0.15, + "grad_norm": 9.341103553771973, + "learning_rate": 1.902104338367569e-05, + "loss": 1.5048, "step": 1171 }, { - "epoch": 0.35, - "grad_norm": 12.077056884765625, - "learning_rate": 1.7652600982259197e-05, - "loss": 1.9834, + "epoch": 0.15, + "grad_norm": 9.025690078735352, + "learning_rate": 1.9020206668619004e-05, + "loss": 0.6358, "step": 1172 }, { - "epoch": 0.35, - "grad_norm": 17.17857551574707, - "learning_rate": 1.7650596371654807e-05, - "loss": 3.1654, + "epoch": 0.15, + "grad_norm": 17.043132781982422, + "learning_rate": 1.9019369953562317e-05, + "loss": 2.6911, "step": 1173 }, { - "epoch": 0.35, - "grad_norm": 13.104513168334961, - "learning_rate": 1.7648591761050417e-05, - "loss": 2.999, + "epoch": 0.15, + "grad_norm": 7.509103775024414, + "learning_rate": 1.9018533238505628e-05, + "loss": 1.7465, "step": 1174 }, { - "epoch": 0.35, - "grad_norm": 14.851997375488281, - "learning_rate": 1.7646587150446027e-05, - "loss": 3.3295, + "epoch": 0.15, + "grad_norm": 9.966673851013184, + "learning_rate": 1.901769652344894e-05, + "loss": 1.9566, "step": 1175 }, { - "epoch": 0.35, - "grad_norm": 12.090683937072754, - "learning_rate": 1.7644582539841637e-05, - "loss": 2.4796, + "epoch": 0.15, + "grad_norm": 16.340253829956055, + "learning_rate": 1.9016859808392255e-05, + "loss": 3.069, "step": 1176 }, { - "epoch": 0.35, - "grad_norm": 13.724775314331055, - "learning_rate": 1.7642577929237247e-05, - "loss": 2.9697, + "epoch": 0.15, + "grad_norm": 26.096837997436523, + "learning_rate": 1.901602309333557e-05, + "loss": 4.3065, "step": 1177 }, { - "epoch": 0.35, - "grad_norm": 26.366907119750977, - "learning_rate": 1.7640573318632857e-05, - "loss": 3.2335, + "epoch": 0.15, + "grad_norm": 13.248631477355957, + "learning_rate": 1.901518637827888e-05, + "loss": 3.2177, "step": 1178 }, { - "epoch": 0.35, - "grad_norm": 10.779017448425293, - "learning_rate": 1.7638568708028467e-05, - "loss": 2.4018, + "epoch": 0.15, + "grad_norm": 14.375895500183105, + "learning_rate": 1.9014349663222192e-05, + "loss": 2.944, "step": 1179 }, { - "epoch": 0.35, - "grad_norm": 22.231727600097656, - "learning_rate": 1.7636564097424078e-05, - "loss": 3.2377, + "epoch": 0.15, + "grad_norm": 19.621803283691406, + "learning_rate": 1.9013512948165506e-05, + "loss": 2.5023, "step": 1180 }, { - "epoch": 0.36, - "grad_norm": 18.77667236328125, - "learning_rate": 1.7634559486819688e-05, - "loss": 3.0199, + "epoch": 0.15, + "grad_norm": 15.87476634979248, + "learning_rate": 1.9012676233108816e-05, + "loss": 4.5875, "step": 1181 }, { - "epoch": 0.36, - "grad_norm": 16.207422256469727, - "learning_rate": 1.7632554876215298e-05, - "loss": 2.7055, + "epoch": 0.15, + "grad_norm": 10.495246887207031, + "learning_rate": 1.901183951805213e-05, + "loss": 3.1892, "step": 1182 }, { - "epoch": 0.36, - "grad_norm": 16.809715270996094, - "learning_rate": 1.7630550265610908e-05, - "loss": 2.9821, + "epoch": 0.15, + "grad_norm": 23.24948501586914, + "learning_rate": 1.901100280299544e-05, + "loss": 3.406, "step": 1183 }, { - "epoch": 0.36, - "grad_norm": 16.3651065826416, - "learning_rate": 1.7628545655006518e-05, - "loss": 2.1699, + "epoch": 0.15, + "grad_norm": 11.072179794311523, + "learning_rate": 1.9010166087938754e-05, + "loss": 2.5812, "step": 1184 }, { - "epoch": 0.36, - "grad_norm": 11.468558311462402, - "learning_rate": 1.7626541044402125e-05, - "loss": 2.6325, + "epoch": 0.15, + "grad_norm": 12.650174140930176, + "learning_rate": 1.9009329372882068e-05, + "loss": 3.1434, "step": 1185 }, { - "epoch": 0.36, - "grad_norm": 16.051881790161133, - "learning_rate": 1.7624536433797738e-05, - "loss": 2.3443, + "epoch": 0.15, + "grad_norm": 15.346962928771973, + "learning_rate": 1.9008492657825378e-05, + "loss": 3.4276, "step": 1186 }, { - "epoch": 0.36, - "grad_norm": 12.057586669921875, - "learning_rate": 1.7622531823193345e-05, - "loss": 2.7623, + "epoch": 0.15, + "grad_norm": 13.713033676147461, + "learning_rate": 1.900765594276869e-05, + "loss": 2.194, "step": 1187 }, { - "epoch": 0.36, - "grad_norm": 17.24924659729004, - "learning_rate": 1.7620527212588955e-05, - "loss": 3.3553, + "epoch": 0.15, + "grad_norm": 33.21901321411133, + "learning_rate": 1.9006819227712002e-05, + "loss": 3.0766, "step": 1188 }, { - "epoch": 0.36, - "grad_norm": 12.06871223449707, - "learning_rate": 1.761852260198457e-05, - "loss": 1.9598, + "epoch": 0.15, + "grad_norm": 13.535443305969238, + "learning_rate": 1.9005982512655315e-05, + "loss": 3.4499, "step": 1189 }, { - "epoch": 0.36, - "grad_norm": 20.307418823242188, - "learning_rate": 1.7616517991380175e-05, - "loss": 3.1637, + "epoch": 0.15, + "grad_norm": 10.614287376403809, + "learning_rate": 1.900514579759863e-05, + "loss": 1.821, "step": 1190 }, { - "epoch": 0.36, - "grad_norm": 11.176687240600586, - "learning_rate": 1.7614513380775785e-05, - "loss": 2.5982, + "epoch": 0.15, + "grad_norm": 11.230908393859863, + "learning_rate": 1.9004309082541943e-05, + "loss": 1.0304, "step": 1191 }, { - "epoch": 0.36, - "grad_norm": 14.596364974975586, - "learning_rate": 1.7612508770171395e-05, - "loss": 2.5782, + "epoch": 0.15, + "grad_norm": 7.948458194732666, + "learning_rate": 1.9003472367485253e-05, + "loss": 2.5713, "step": 1192 }, { - "epoch": 0.36, - "grad_norm": 16.369272232055664, - "learning_rate": 1.7610504159567005e-05, - "loss": 2.7876, + "epoch": 0.15, + "grad_norm": 19.880294799804688, + "learning_rate": 1.9002635652428567e-05, + "loss": 3.7412, "step": 1193 }, { - "epoch": 0.36, - "grad_norm": 16.84731101989746, - "learning_rate": 1.7608499548962615e-05, - "loss": 2.9056, + "epoch": 0.15, + "grad_norm": 21.333202362060547, + "learning_rate": 1.900179893737188e-05, + "loss": 1.8671, "step": 1194 }, { - "epoch": 0.36, - "grad_norm": 51.60824203491211, - "learning_rate": 1.7606494938358225e-05, - "loss": 2.3254, + "epoch": 0.15, + "grad_norm": 10.239437103271484, + "learning_rate": 1.900096222231519e-05, + "loss": 1.6627, "step": 1195 }, { - "epoch": 0.36, - "grad_norm": 18.47427749633789, - "learning_rate": 1.7604490327753836e-05, - "loss": 2.9355, + "epoch": 0.15, + "grad_norm": 9.137307167053223, + "learning_rate": 1.9000125507258504e-05, + "loss": 2.6344, "step": 1196 }, { - "epoch": 0.36, - "grad_norm": 15.123434066772461, - "learning_rate": 1.7602485717149446e-05, - "loss": 2.588, + "epoch": 0.15, + "grad_norm": 13.801033020019531, + "learning_rate": 1.8999288792201818e-05, + "loss": 3.7894, "step": 1197 }, { - "epoch": 0.36, - "grad_norm": 19.959136962890625, - "learning_rate": 1.7600481106545056e-05, - "loss": 2.5238, + "epoch": 0.15, + "grad_norm": 22.311199188232422, + "learning_rate": 1.899845207714513e-05, + "loss": 2.3578, "step": 1198 }, { - "epoch": 0.36, - "grad_norm": 19.62584114074707, - "learning_rate": 1.7598476495940662e-05, - "loss": 3.2818, + "epoch": 0.15, + "grad_norm": 13.077570915222168, + "learning_rate": 1.8997615362088442e-05, + "loss": 2.8371, "step": 1199 }, { - "epoch": 0.36, - "grad_norm": 19.437580108642578, - "learning_rate": 1.7596471885336276e-05, - "loss": 2.8019, + "epoch": 0.15, + "grad_norm": 20.19798469543457, + "learning_rate": 1.8996778647031755e-05, + "loss": 3.2919, "step": 1200 }, { - "epoch": 0.36, - "eval_loss": 0.5124673843383789, - "eval_runtime": 43.4829, - "eval_samples_per_second": 34.013, - "eval_steps_per_second": 34.013, + "epoch": 0.15, + "eval_loss": 0.2365642935037613, + "eval_runtime": 93.584, + "eval_samples_per_second": 37.848, + "eval_steps_per_second": 37.848, "step": 1200 }, { - "epoch": 0.36, - "grad_norm": 41.26815414428711, - "learning_rate": 1.7594467274731886e-05, - "loss": 3.0088, + "epoch": 0.15, + "grad_norm": 13.819417953491211, + "learning_rate": 1.899594193197507e-05, + "loss": 3.3004, "step": 1201 }, { - "epoch": 0.36, - "grad_norm": 23.42255973815918, - "learning_rate": 1.7592462664127493e-05, - "loss": 3.0811, + "epoch": 0.15, + "grad_norm": 22.852203369140625, + "learning_rate": 1.899510521691838e-05, + "loss": 3.276, "step": 1202 }, { - "epoch": 0.36, - "grad_norm": 24.135406494140625, - "learning_rate": 1.7590458053523106e-05, - "loss": 3.4351, + "epoch": 0.15, + "grad_norm": 16.19291877746582, + "learning_rate": 1.8994268501861693e-05, + "loss": 3.67, "step": 1203 }, { - "epoch": 0.36, - "grad_norm": 11.820728302001953, - "learning_rate": 1.7588453442918713e-05, - "loss": 2.2131, + "epoch": 0.15, + "grad_norm": 16.76343536376953, + "learning_rate": 1.8993431786805007e-05, + "loss": 1.8594, "step": 1204 }, { - "epoch": 0.36, - "grad_norm": 12.589012145996094, - "learning_rate": 1.7586448832314323e-05, - "loss": 2.5815, + "epoch": 0.15, + "grad_norm": 6.684631824493408, + "learning_rate": 1.899259507174832e-05, + "loss": 0.4694, "step": 1205 }, { - "epoch": 0.36, - "grad_norm": 14.668851852416992, - "learning_rate": 1.7584444221709936e-05, - "loss": 2.9765, + "epoch": 0.15, + "grad_norm": 12.731599807739258, + "learning_rate": 1.899175835669163e-05, + "loss": 2.2129, "step": 1206 }, { - "epoch": 0.36, - "grad_norm": 13.122230529785156, - "learning_rate": 1.7582439611105543e-05, - "loss": 2.3476, + "epoch": 0.15, + "grad_norm": 42.011207580566406, + "learning_rate": 1.8990921641634944e-05, + "loss": 4.4189, "step": 1207 }, { - "epoch": 0.36, - "grad_norm": 17.157947540283203, - "learning_rate": 1.7580435000501153e-05, - "loss": 2.8408, + "epoch": 0.15, + "grad_norm": 67.61883544921875, + "learning_rate": 1.8990084926578254e-05, + "loss": 3.1967, "step": 1208 }, { - "epoch": 0.36, - "grad_norm": 21.18256950378418, - "learning_rate": 1.7578430389896763e-05, - "loss": 2.5619, + "epoch": 0.15, + "grad_norm": 15.998286247253418, + "learning_rate": 1.8989248211521568e-05, + "loss": 3.8409, "step": 1209 }, { - "epoch": 0.36, - "grad_norm": 19.1740779876709, - "learning_rate": 1.7576425779292373e-05, - "loss": 4.0597, + "epoch": 0.15, + "grad_norm": 9.8434419631958, + "learning_rate": 1.8988411496464882e-05, + "loss": 2.0819, "step": 1210 }, { - "epoch": 0.36, - "grad_norm": 20.59099769592285, - "learning_rate": 1.7574421168687983e-05, - "loss": 1.8192, + "epoch": 0.15, + "grad_norm": 148.34994506835938, + "learning_rate": 1.8987574781408192e-05, + "loss": 2.4031, "step": 1211 }, { - "epoch": 0.36, - "grad_norm": 31.291309356689453, - "learning_rate": 1.7572416558083593e-05, - "loss": 3.6861, + "epoch": 0.15, + "grad_norm": 44.0875358581543, + "learning_rate": 1.8986738066351506e-05, + "loss": 2.2609, "step": 1212 }, { - "epoch": 0.36, - "grad_norm": 11.655035018920898, - "learning_rate": 1.7570411947479204e-05, - "loss": 2.3163, + "epoch": 0.15, + "grad_norm": 13.564834594726562, + "learning_rate": 1.898590135129482e-05, + "loss": 3.4062, "step": 1213 }, { - "epoch": 0.37, - "grad_norm": 27.356048583984375, - "learning_rate": 1.7568407336874814e-05, - "loss": 3.9249, + "epoch": 0.15, + "grad_norm": 13.418842315673828, + "learning_rate": 1.898506463623813e-05, + "loss": 2.1331, "step": 1214 }, { - "epoch": 0.37, - "grad_norm": 22.89837646484375, - "learning_rate": 1.7566402726270424e-05, - "loss": 2.6824, + "epoch": 0.15, + "grad_norm": 11.270611763000488, + "learning_rate": 1.8984227921181443e-05, + "loss": 2.806, "step": 1215 }, { - "epoch": 0.37, - "grad_norm": 16.46586799621582, - "learning_rate": 1.7564398115666034e-05, - "loss": 2.5388, + "epoch": 0.15, + "grad_norm": 34.41801452636719, + "learning_rate": 1.8983391206124753e-05, + "loss": 4.1672, "step": 1216 }, { - "epoch": 0.37, - "grad_norm": 18.528961181640625, - "learning_rate": 1.7562393505061644e-05, - "loss": 2.435, + "epoch": 0.15, + "grad_norm": 17.68085289001465, + "learning_rate": 1.8982554491068067e-05, + "loss": 3.34, "step": 1217 }, { - "epoch": 0.37, - "grad_norm": 10.361041069030762, - "learning_rate": 1.756038889445725e-05, - "loss": 2.5633, + "epoch": 0.15, + "grad_norm": 18.969608306884766, + "learning_rate": 1.898171777601138e-05, + "loss": 1.0799, "step": 1218 }, { - "epoch": 0.37, - "grad_norm": 17.265287399291992, - "learning_rate": 1.7558384283852864e-05, - "loss": 2.3523, + "epoch": 0.15, + "grad_norm": 9.720231056213379, + "learning_rate": 1.8980881060954694e-05, + "loss": 2.2871, "step": 1219 }, { - "epoch": 0.37, - "grad_norm": 18.783292770385742, - "learning_rate": 1.7556379673248474e-05, - "loss": 2.3425, + "epoch": 0.15, + "grad_norm": 13.65944766998291, + "learning_rate": 1.8980044345898005e-05, + "loss": 1.9212, "step": 1220 }, { - "epoch": 0.37, - "grad_norm": 21.62091827392578, - "learning_rate": 1.755437506264408e-05, - "loss": 2.7889, + "epoch": 0.15, + "grad_norm": 10.61152172088623, + "learning_rate": 1.8979207630841318e-05, + "loss": 1.9664, "step": 1221 }, { - "epoch": 0.37, - "grad_norm": 19.435802459716797, - "learning_rate": 1.7552370452039694e-05, - "loss": 3.4363, + "epoch": 0.15, + "grad_norm": 18.863014221191406, + "learning_rate": 1.8978370915784632e-05, + "loss": 3.9371, "step": 1222 }, { - "epoch": 0.37, - "grad_norm": 13.962774276733398, - "learning_rate": 1.75503658414353e-05, - "loss": 1.6731, + "epoch": 0.15, + "grad_norm": 16.500232696533203, + "learning_rate": 1.8977534200727942e-05, + "loss": 4.1268, "step": 1223 }, { - "epoch": 0.37, - "grad_norm": 10.637103080749512, - "learning_rate": 1.754836123083091e-05, - "loss": 1.9421, + "epoch": 0.15, + "grad_norm": 11.271100044250488, + "learning_rate": 1.8976697485671256e-05, + "loss": 2.2966, "step": 1224 }, { - "epoch": 0.37, - "grad_norm": 12.985700607299805, - "learning_rate": 1.7546356620226525e-05, - "loss": 2.2144, + "epoch": 0.15, + "grad_norm": 13.12723445892334, + "learning_rate": 1.897586077061457e-05, + "loss": 2.3253, "step": 1225 }, { - "epoch": 0.37, - "grad_norm": 16.512508392333984, - "learning_rate": 1.754435200962213e-05, - "loss": 2.4662, + "epoch": 0.15, + "grad_norm": 11.523720741271973, + "learning_rate": 1.8975024055557883e-05, + "loss": 2.9859, "step": 1226 }, { - "epoch": 0.37, - "grad_norm": 20.497188568115234, - "learning_rate": 1.754234739901774e-05, - "loss": 2.0858, + "epoch": 0.15, + "grad_norm": 12.031673431396484, + "learning_rate": 1.8974187340501193e-05, + "loss": 2.6248, "step": 1227 }, { - "epoch": 0.37, - "grad_norm": 32.291175842285156, - "learning_rate": 1.754034278841335e-05, - "loss": 3.3415, + "epoch": 0.15, + "grad_norm": 7.525265216827393, + "learning_rate": 1.8973350625444507e-05, + "loss": 1.3119, "step": 1228 }, { - "epoch": 0.37, - "grad_norm": 16.312284469604492, - "learning_rate": 1.753833817780896e-05, - "loss": 2.2323, + "epoch": 0.15, + "grad_norm": 14.541187286376953, + "learning_rate": 1.897251391038782e-05, + "loss": 4.317, "step": 1229 }, { - "epoch": 0.37, - "grad_norm": 23.16538429260254, - "learning_rate": 1.753633356720457e-05, - "loss": 3.7011, + "epoch": 0.15, + "grad_norm": 8.85118293762207, + "learning_rate": 1.897167719533113e-05, + "loss": 1.5031, "step": 1230 }, { - "epoch": 0.37, - "grad_norm": 9.652095794677734, - "learning_rate": 1.7534328956600182e-05, - "loss": 2.5907, + "epoch": 0.15, + "grad_norm": 24.35873794555664, + "learning_rate": 1.8970840480274445e-05, + "loss": 2.8286, "step": 1231 }, { - "epoch": 0.37, - "grad_norm": 16.361186981201172, - "learning_rate": 1.7532324345995792e-05, - "loss": 3.1489, + "epoch": 0.15, + "grad_norm": 18.828296661376953, + "learning_rate": 1.8970003765217758e-05, + "loss": 3.0941, "step": 1232 }, { - "epoch": 0.37, - "grad_norm": 24.299665451049805, - "learning_rate": 1.7530319735391402e-05, - "loss": 2.751, + "epoch": 0.15, + "grad_norm": 89.44055938720703, + "learning_rate": 1.8969167050161072e-05, + "loss": 2.4806, "step": 1233 }, { - "epoch": 0.37, - "grad_norm": 11.663421630859375, - "learning_rate": 1.7528315124787012e-05, - "loss": 2.2756, + "epoch": 0.15, + "grad_norm": 7.748295783996582, + "learning_rate": 1.8968330335104382e-05, + "loss": 0.6644, "step": 1234 }, { - "epoch": 0.37, - "grad_norm": 16.520639419555664, - "learning_rate": 1.7526310514182622e-05, - "loss": 1.8753, + "epoch": 0.15, + "grad_norm": 7.684282302856445, + "learning_rate": 1.8967493620047696e-05, + "loss": 0.4885, "step": 1235 }, { - "epoch": 0.37, - "grad_norm": 30.01593780517578, - "learning_rate": 1.7524305903578232e-05, - "loss": 1.1438, + "epoch": 0.16, + "grad_norm": 16.291973114013672, + "learning_rate": 1.8966656904991006e-05, + "loss": 3.0854, "step": 1236 }, { - "epoch": 0.37, - "grad_norm": 28.729244232177734, - "learning_rate": 1.7522301292973842e-05, - "loss": 3.4131, + "epoch": 0.16, + "grad_norm": 11.370176315307617, + "learning_rate": 1.896582018993432e-05, + "loss": 4.0331, "step": 1237 }, { - "epoch": 0.37, - "grad_norm": 18.10856819152832, - "learning_rate": 1.7520296682369452e-05, - "loss": 2.8804, + "epoch": 0.16, + "grad_norm": 19.60243034362793, + "learning_rate": 1.8964983474877633e-05, + "loss": 2.9217, "step": 1238 }, { - "epoch": 0.37, - "grad_norm": 18.52988052368164, - "learning_rate": 1.7518292071765062e-05, - "loss": 2.57, + "epoch": 0.16, + "grad_norm": 10.804051399230957, + "learning_rate": 1.8964146759820944e-05, + "loss": 2.2526, "step": 1239 }, { - "epoch": 0.37, - "grad_norm": 18.071670532226562, - "learning_rate": 1.751628746116067e-05, - "loss": 2.5888, + "epoch": 0.16, + "grad_norm": 13.438443183898926, + "learning_rate": 1.8963310044764257e-05, + "loss": 3.4662, "step": 1240 }, { - "epoch": 0.37, - "grad_norm": 9.437912940979004, - "learning_rate": 1.7514282850556283e-05, - "loss": 2.1698, + "epoch": 0.16, + "grad_norm": 11.870586395263672, + "learning_rate": 1.8962473329707568e-05, + "loss": 2.2085, "step": 1241 }, { - "epoch": 0.37, - "grad_norm": 11.90031623840332, - "learning_rate": 1.751227823995189e-05, - "loss": 2.5627, + "epoch": 0.16, + "grad_norm": 14.314711570739746, + "learning_rate": 1.896163661465088e-05, + "loss": 3.4219, "step": 1242 }, { - "epoch": 0.37, - "grad_norm": 12.437149047851562, - "learning_rate": 1.75102736293475e-05, - "loss": 2.2992, + "epoch": 0.16, + "grad_norm": 14.737003326416016, + "learning_rate": 1.8960799899594195e-05, + "loss": 1.5509, "step": 1243 }, { - "epoch": 0.37, - "grad_norm": 27.860820770263672, - "learning_rate": 1.7508269018743113e-05, - "loss": 3.5747, + "epoch": 0.16, + "grad_norm": 21.78973388671875, + "learning_rate": 1.8959963184537505e-05, + "loss": 1.1379, "step": 1244 }, { - "epoch": 0.37, - "grad_norm": 26.30794334411621, - "learning_rate": 1.750626440813872e-05, - "loss": 3.8743, + "epoch": 0.16, + "grad_norm": 19.68343734741211, + "learning_rate": 1.895912646948082e-05, + "loss": 4.0445, "step": 1245 }, { - "epoch": 0.37, - "grad_norm": 17.703901290893555, - "learning_rate": 1.750425979753433e-05, - "loss": 2.338, + "epoch": 0.16, + "grad_norm": 14.491902351379395, + "learning_rate": 1.8958289754424132e-05, + "loss": 3.4564, "step": 1246 }, { - "epoch": 0.37, - "grad_norm": 10.545022010803223, - "learning_rate": 1.750225518692994e-05, - "loss": 1.9899, + "epoch": 0.16, + "grad_norm": 10.514927864074707, + "learning_rate": 1.8957453039367446e-05, + "loss": 2.9613, "step": 1247 }, { - "epoch": 0.38, - "grad_norm": 24.279394149780273, - "learning_rate": 1.750025057632555e-05, - "loss": 2.9463, + "epoch": 0.16, + "grad_norm": 17.163677215576172, + "learning_rate": 1.8956616324310756e-05, + "loss": 3.3785, "step": 1248 }, { - "epoch": 0.38, - "grad_norm": 21.58678436279297, - "learning_rate": 1.749824596572116e-05, - "loss": 3.4896, + "epoch": 0.16, + "grad_norm": 7.404734134674072, + "learning_rate": 1.895577960925407e-05, + "loss": 2.0438, "step": 1249 }, { - "epoch": 0.38, - "grad_norm": 13.904131889343262, - "learning_rate": 1.749624135511677e-05, - "loss": 3.1997, + "epoch": 0.16, + "grad_norm": 14.131943702697754, + "learning_rate": 1.8954942894197384e-05, + "loss": 1.5331, "step": 1250 }, { - "epoch": 0.38, - "grad_norm": 10.544210433959961, - "learning_rate": 1.749423674451238e-05, - "loss": 1.9443, + "epoch": 0.16, + "grad_norm": 15.94304084777832, + "learning_rate": 1.8954106179140694e-05, + "loss": 2.2588, "step": 1251 }, { - "epoch": 0.38, - "grad_norm": 16.909954071044922, - "learning_rate": 1.749223213390799e-05, - "loss": 3.1345, + "epoch": 0.16, + "grad_norm": 16.104053497314453, + "learning_rate": 1.8953269464084008e-05, + "loss": 2.8138, "step": 1252 }, { - "epoch": 0.38, - "grad_norm": 18.661376953125, - "learning_rate": 1.74902275233036e-05, - "loss": 3.7376, + "epoch": 0.16, + "grad_norm": 12.07219123840332, + "learning_rate": 1.895243274902732e-05, + "loss": 3.3369, "step": 1253 }, { - "epoch": 0.38, - "grad_norm": 16.603628158569336, - "learning_rate": 1.748822291269921e-05, - "loss": 2.108, + "epoch": 0.16, + "grad_norm": 21.96693229675293, + "learning_rate": 1.895159603397063e-05, + "loss": 3.9521, "step": 1254 }, { - "epoch": 0.38, - "grad_norm": 12.670201301574707, - "learning_rate": 1.748621830209482e-05, - "loss": 3.4042, + "epoch": 0.16, + "grad_norm": 16.473649978637695, + "learning_rate": 1.8950759318913945e-05, + "loss": 3.1058, "step": 1255 }, { - "epoch": 0.38, - "grad_norm": 12.029052734375, - "learning_rate": 1.748421369149043e-05, - "loss": 2.9556, + "epoch": 0.16, + "grad_norm": 34.58412551879883, + "learning_rate": 1.894992260385726e-05, + "loss": 2.8908, "step": 1256 }, { - "epoch": 0.38, - "grad_norm": 28.61595916748047, - "learning_rate": 1.748220908088604e-05, - "loss": 2.0854, + "epoch": 0.16, + "grad_norm": 13.569806098937988, + "learning_rate": 1.8949085888800572e-05, + "loss": 1.7347, "step": 1257 }, { - "epoch": 0.38, - "grad_norm": 22.586305618286133, - "learning_rate": 1.748020447028165e-05, - "loss": 2.461, + "epoch": 0.16, + "grad_norm": 17.082672119140625, + "learning_rate": 1.8948249173743883e-05, + "loss": 1.3953, "step": 1258 }, { - "epoch": 0.38, - "grad_norm": 14.331897735595703, - "learning_rate": 1.7478199859677257e-05, - "loss": 3.5532, + "epoch": 0.16, + "grad_norm": 10.1463041305542, + "learning_rate": 1.8947412458687196e-05, + "loss": 1.7843, "step": 1259 }, { - "epoch": 0.38, - "grad_norm": 18.208744049072266, - "learning_rate": 1.747619524907287e-05, - "loss": 2.59, + "epoch": 0.16, + "grad_norm": 45.11806869506836, + "learning_rate": 1.894657574363051e-05, + "loss": 3.1561, "step": 1260 }, { - "epoch": 0.38, - "grad_norm": 26.50733757019043, - "learning_rate": 1.7474190638468477e-05, - "loss": 3.567, + "epoch": 0.16, + "grad_norm": 9.12941837310791, + "learning_rate": 1.894573902857382e-05, + "loss": 1.6232, "step": 1261 }, { - "epoch": 0.38, - "grad_norm": 17.948020935058594, - "learning_rate": 1.7472186027864088e-05, - "loss": 2.5004, + "epoch": 0.16, + "grad_norm": 17.47677993774414, + "learning_rate": 1.8944902313517134e-05, + "loss": 4.1126, "step": 1262 }, { - "epoch": 0.38, - "grad_norm": 11.927952766418457, - "learning_rate": 1.74701814172597e-05, - "loss": 2.1265, + "epoch": 0.16, + "grad_norm": 12.587291717529297, + "learning_rate": 1.8944065598460447e-05, + "loss": 2.5521, "step": 1263 }, { - "epoch": 0.38, - "grad_norm": 26.811809539794922, - "learning_rate": 1.7468176806655308e-05, - "loss": 2.7967, + "epoch": 0.16, + "grad_norm": 18.3865966796875, + "learning_rate": 1.8943228883403758e-05, + "loss": 4.6634, "step": 1264 }, { - "epoch": 0.38, - "grad_norm": 82.8468246459961, - "learning_rate": 1.7466172196050918e-05, - "loss": 2.652, + "epoch": 0.16, + "grad_norm": 12.10347843170166, + "learning_rate": 1.894239216834707e-05, + "loss": 3.8171, "step": 1265 }, { - "epoch": 0.38, - "grad_norm": 13.454938888549805, - "learning_rate": 1.7464167585446528e-05, - "loss": 2.282, + "epoch": 0.16, + "grad_norm": 9.453478813171387, + "learning_rate": 1.8941555453290385e-05, + "loss": 1.9866, "step": 1266 }, { - "epoch": 0.38, - "grad_norm": 20.55266571044922, - "learning_rate": 1.7462162974842138e-05, - "loss": 2.3296, + "epoch": 0.16, + "grad_norm": 13.832612037658691, + "learning_rate": 1.8940718738233695e-05, + "loss": 3.8677, "step": 1267 }, { - "epoch": 0.38, - "grad_norm": 10.279245376586914, - "learning_rate": 1.7460158364237748e-05, - "loss": 1.3453, + "epoch": 0.16, + "grad_norm": 10.849949836730957, + "learning_rate": 1.893988202317701e-05, + "loss": 2.711, "step": 1268 }, { - "epoch": 0.38, - "grad_norm": 11.651695251464844, - "learning_rate": 1.7458153753633358e-05, - "loss": 2.5631, + "epoch": 0.16, + "grad_norm": 11.43234634399414, + "learning_rate": 1.893904530812032e-05, + "loss": 2.7987, "step": 1269 }, { - "epoch": 0.38, - "grad_norm": 12.197574615478516, - "learning_rate": 1.7456149143028968e-05, - "loss": 3.0425, + "epoch": 0.16, + "grad_norm": 10.973919868469238, + "learning_rate": 1.8938208593063633e-05, + "loss": 3.1193, "step": 1270 }, { - "epoch": 0.38, - "grad_norm": 24.782493591308594, - "learning_rate": 1.7454144532424578e-05, - "loss": 3.1392, + "epoch": 0.16, + "grad_norm": 12.14253044128418, + "learning_rate": 1.8937371878006947e-05, + "loss": 3.347, "step": 1271 }, { - "epoch": 0.38, - "grad_norm": 10.844775199890137, - "learning_rate": 1.745213992182019e-05, - "loss": 2.9512, + "epoch": 0.16, + "grad_norm": 14.993719100952148, + "learning_rate": 1.8936535162950257e-05, + "loss": 0.8552, "step": 1272 }, { - "epoch": 0.38, - "grad_norm": 13.511686325073242, - "learning_rate": 1.7450135311215795e-05, - "loss": 2.8739, + "epoch": 0.16, + "grad_norm": 11.494132041931152, + "learning_rate": 1.893569844789357e-05, + "loss": 2.2367, "step": 1273 }, { - "epoch": 0.38, - "grad_norm": 21.19719886779785, - "learning_rate": 1.744813070061141e-05, - "loss": 2.8494, + "epoch": 0.16, + "grad_norm": 28.249052047729492, + "learning_rate": 1.8934861732836884e-05, + "loss": 1.8822, "step": 1274 }, { - "epoch": 0.38, - "grad_norm": 17.959545135498047, - "learning_rate": 1.744612609000702e-05, - "loss": 2.8389, + "epoch": 0.16, + "grad_norm": 15.573881149291992, + "learning_rate": 1.8934025017780194e-05, + "loss": 4.3407, "step": 1275 }, { - "epoch": 0.38, - "grad_norm": 17.719860076904297, - "learning_rate": 1.7444121479402625e-05, - "loss": 2.7844, + "epoch": 0.16, + "grad_norm": 9.760501861572266, + "learning_rate": 1.8933188302723508e-05, + "loss": 0.9577, "step": 1276 }, { - "epoch": 0.38, - "grad_norm": 15.047807693481445, - "learning_rate": 1.744211686879824e-05, - "loss": 2.5462, + "epoch": 0.16, + "grad_norm": 30.468326568603516, + "learning_rate": 1.893235158766682e-05, + "loss": 2.6084, "step": 1277 }, { - "epoch": 0.38, - "grad_norm": 18.358489990234375, - "learning_rate": 1.7440112258193846e-05, - "loss": 2.8936, + "epoch": 0.16, + "grad_norm": 18.64023780822754, + "learning_rate": 1.8931514872610135e-05, + "loss": 4.2978, "step": 1278 }, { - "epoch": 0.38, - "grad_norm": 20.895231246948242, - "learning_rate": 1.743810764758946e-05, - "loss": 3.1691, + "epoch": 0.16, + "grad_norm": 10.93867301940918, + "learning_rate": 1.8930678157553446e-05, + "loss": 3.2249, "step": 1279 }, { - "epoch": 0.38, - "grad_norm": 10.4979248046875, - "learning_rate": 1.743610303698507e-05, - "loss": 1.5841, + "epoch": 0.16, + "grad_norm": 51.95521545410156, + "learning_rate": 1.892984144249676e-05, + "loss": 2.4306, "step": 1280 }, { - "epoch": 0.39, - "grad_norm": 14.42455768585205, - "learning_rate": 1.7434098426380676e-05, - "loss": 2.7656, + "epoch": 0.16, + "grad_norm": 19.859397888183594, + "learning_rate": 1.8929004727440073e-05, + "loss": 3.4901, "step": 1281 }, { - "epoch": 0.39, - "grad_norm": 11.873161315917969, - "learning_rate": 1.743209381577629e-05, - "loss": 2.9719, + "epoch": 0.16, + "grad_norm": 19.423580169677734, + "learning_rate": 1.8928168012383383e-05, + "loss": 3.4095, "step": 1282 }, { - "epoch": 0.39, - "grad_norm": 23.232919692993164, - "learning_rate": 1.7430089205171896e-05, - "loss": 2.5282, + "epoch": 0.16, + "grad_norm": 14.165048599243164, + "learning_rate": 1.8927331297326697e-05, + "loss": 3.5648, "step": 1283 }, { - "epoch": 0.39, - "grad_norm": 15.954151153564453, - "learning_rate": 1.7428084594567506e-05, - "loss": 3.3602, + "epoch": 0.16, + "grad_norm": 10.209765434265137, + "learning_rate": 1.892649458227001e-05, + "loss": 3.1595, "step": 1284 }, { - "epoch": 0.39, - "grad_norm": 15.332113265991211, - "learning_rate": 1.7426079983963116e-05, - "loss": 3.1367, + "epoch": 0.16, + "grad_norm": 23.574323654174805, + "learning_rate": 1.8925657867213324e-05, + "loss": 2.8012, "step": 1285 }, { - "epoch": 0.39, - "grad_norm": 17.88947296142578, - "learning_rate": 1.7424075373358726e-05, - "loss": 2.8707, + "epoch": 0.16, + "grad_norm": 8.216002464294434, + "learning_rate": 1.8924821152156634e-05, + "loss": 1.5007, "step": 1286 }, { - "epoch": 0.39, - "grad_norm": 14.455220222473145, - "learning_rate": 1.7422070762754336e-05, - "loss": 2.7743, + "epoch": 0.16, + "grad_norm": 20.161785125732422, + "learning_rate": 1.8923984437099948e-05, + "loss": 3.0664, "step": 1287 }, { - "epoch": 0.39, - "grad_norm": 12.738212585449219, - "learning_rate": 1.7420066152149946e-05, - "loss": 2.4288, + "epoch": 0.16, + "grad_norm": 11.79356575012207, + "learning_rate": 1.892314772204326e-05, + "loss": 2.6138, "step": 1288 }, { - "epoch": 0.39, - "grad_norm": 17.96286392211914, - "learning_rate": 1.7418061541545556e-05, - "loss": 2.1277, + "epoch": 0.16, + "grad_norm": 13.224976539611816, + "learning_rate": 1.8922311006986572e-05, + "loss": 4.1931, "step": 1289 }, { - "epoch": 0.39, - "grad_norm": 10.881392478942871, - "learning_rate": 1.7416056930941167e-05, - "loss": 3.3985, + "epoch": 0.16, + "grad_norm": 11.43084716796875, + "learning_rate": 1.8921474291929886e-05, + "loss": 3.2066, "step": 1290 }, { - "epoch": 0.39, - "grad_norm": 9.816652297973633, - "learning_rate": 1.7414052320336777e-05, - "loss": 1.9979, + "epoch": 0.16, + "grad_norm": 14.025525093078613, + "learning_rate": 1.89206375768732e-05, + "loss": 3.5726, "step": 1291 }, { - "epoch": 0.39, - "grad_norm": 14.846339225769043, - "learning_rate": 1.7412047709732387e-05, - "loss": 3.2407, + "epoch": 0.16, + "grad_norm": 10.88570785522461, + "learning_rate": 1.891980086181651e-05, + "loss": 2.0154, "step": 1292 }, { - "epoch": 0.39, - "grad_norm": 24.287704467773438, - "learning_rate": 1.7410043099127997e-05, - "loss": 3.1519, + "epoch": 0.16, + "grad_norm": 14.027239799499512, + "learning_rate": 1.8918964146759823e-05, + "loss": 2.4733, "step": 1293 }, { - "epoch": 0.39, - "grad_norm": 18.814102172851562, - "learning_rate": 1.7408038488523607e-05, - "loss": 3.9634, + "epoch": 0.16, + "grad_norm": 9.714178085327148, + "learning_rate": 1.8918127431703133e-05, + "loss": 1.456, "step": 1294 }, { - "epoch": 0.39, - "grad_norm": 21.42952537536621, - "learning_rate": 1.7406033877919214e-05, - "loss": 3.3032, + "epoch": 0.16, + "grad_norm": 19.824508666992188, + "learning_rate": 1.8917290716646447e-05, + "loss": 4.0455, "step": 1295 }, { - "epoch": 0.39, - "grad_norm": 14.19246768951416, - "learning_rate": 1.7404029267314827e-05, - "loss": 2.3004, + "epoch": 0.16, + "grad_norm": 1605.49951171875, + "learning_rate": 1.891645400158976e-05, + "loss": 1.6104, "step": 1296 }, { - "epoch": 0.39, - "grad_norm": 32.35289001464844, - "learning_rate": 1.7402024656710434e-05, - "loss": 4.9394, + "epoch": 0.16, + "grad_norm": 12.466790199279785, + "learning_rate": 1.891561728653307e-05, + "loss": 2.2651, "step": 1297 }, { - "epoch": 0.39, - "grad_norm": 14.341626167297363, - "learning_rate": 1.7400020046106044e-05, - "loss": 2.9232, + "epoch": 0.16, + "grad_norm": 17.17475128173828, + "learning_rate": 1.8914780571476385e-05, + "loss": 3.2729, "step": 1298 }, { - "epoch": 0.39, - "grad_norm": 13.265176773071289, - "learning_rate": 1.7398015435501657e-05, - "loss": 3.7512, + "epoch": 0.16, + "grad_norm": 16.722414016723633, + "learning_rate": 1.8913943856419698e-05, + "loss": 1.8439, "step": 1299 }, { - "epoch": 0.39, - "grad_norm": 15.265084266662598, - "learning_rate": 1.7396010824897264e-05, - "loss": 3.0586, + "epoch": 0.16, + "grad_norm": 7.308617115020752, + "learning_rate": 1.891310714136301e-05, + "loss": 1.2779, "step": 1300 }, { - "epoch": 0.39, - "grad_norm": 19.14535903930664, - "learning_rate": 1.7394006214292874e-05, - "loss": 3.9271, + "epoch": 0.16, + "grad_norm": 12.488905906677246, + "learning_rate": 1.8912270426306322e-05, + "loss": 3.6065, "step": 1301 }, { - "epoch": 0.39, - "grad_norm": 19.937034606933594, - "learning_rate": 1.7392001603688484e-05, - "loss": 1.9253, + "epoch": 0.16, + "grad_norm": 11.767340660095215, + "learning_rate": 1.8911433711249636e-05, + "loss": 2.3138, "step": 1302 }, { - "epoch": 0.39, - "grad_norm": 15.879293441772461, - "learning_rate": 1.7389996993084094e-05, - "loss": 2.5909, + "epoch": 0.16, + "grad_norm": 12.223610877990723, + "learning_rate": 1.8910596996192946e-05, + "loss": 4.5897, "step": 1303 }, { - "epoch": 0.39, - "grad_norm": 22.646526336669922, - "learning_rate": 1.7387992382479704e-05, - "loss": 2.1147, + "epoch": 0.16, + "grad_norm": 8.791810989379883, + "learning_rate": 1.890976028113626e-05, + "loss": 0.9778, "step": 1304 }, { - "epoch": 0.39, - "grad_norm": 19.122434616088867, - "learning_rate": 1.7385987771875314e-05, - "loss": 2.6576, + "epoch": 0.16, + "grad_norm": 11.121882438659668, + "learning_rate": 1.8908923566079573e-05, + "loss": 2.1943, "step": 1305 }, { - "epoch": 0.39, - "grad_norm": 16.39354705810547, - "learning_rate": 1.7383983161270924e-05, - "loss": 2.3048, + "epoch": 0.16, + "grad_norm": 15.698457717895508, + "learning_rate": 1.8908086851022887e-05, + "loss": 2.093, "step": 1306 }, { - "epoch": 0.39, - "grad_norm": 21.781461715698242, - "learning_rate": 1.7381978550666535e-05, - "loss": 2.5542, + "epoch": 0.16, + "grad_norm": 14.07865047454834, + "learning_rate": 1.8907250135966197e-05, + "loss": 3.0307, "step": 1307 }, { - "epoch": 0.39, - "grad_norm": 13.316234588623047, - "learning_rate": 1.7379973940062145e-05, - "loss": 2.669, + "epoch": 0.16, + "grad_norm": 14.06962776184082, + "learning_rate": 1.890641342090951e-05, + "loss": 3.8779, "step": 1308 }, { - "epoch": 0.39, - "grad_norm": 11.472354888916016, - "learning_rate": 1.7377969329457755e-05, - "loss": 2.7657, + "epoch": 0.16, + "grad_norm": 9.522274017333984, + "learning_rate": 1.8905576705852825e-05, + "loss": 2.6916, "step": 1309 }, { - "epoch": 0.39, - "grad_norm": 25.965810775756836, - "learning_rate": 1.7375964718853365e-05, - "loss": 2.8167, + "epoch": 0.16, + "grad_norm": 9.4116849899292, + "learning_rate": 1.8904739990796135e-05, + "loss": 1.5084, "step": 1310 }, { - "epoch": 0.39, - "grad_norm": 17.545780181884766, - "learning_rate": 1.7373960108248975e-05, - "loss": 3.5917, + "epoch": 0.16, + "grad_norm": 16.273611068725586, + "learning_rate": 1.890390327573945e-05, + "loss": 4.6093, "step": 1311 }, { - "epoch": 0.39, - "grad_norm": 22.960935592651367, - "learning_rate": 1.7371955497644585e-05, - "loss": 3.4562, + "epoch": 0.16, + "grad_norm": 16.295133590698242, + "learning_rate": 1.8903066560682762e-05, + "loss": 2.3653, "step": 1312 }, { - "epoch": 0.39, - "grad_norm": 11.906625747680664, - "learning_rate": 1.7369950887040195e-05, - "loss": 3.293, + "epoch": 0.16, + "grad_norm": 24.667890548706055, + "learning_rate": 1.8902229845626076e-05, + "loss": 3.506, "step": 1313 }, { - "epoch": 0.4, - "grad_norm": 14.520302772521973, - "learning_rate": 1.7367946276435802e-05, - "loss": 2.368, + "epoch": 0.16, + "grad_norm": 17.03211212158203, + "learning_rate": 1.8901393130569386e-05, + "loss": 2.8704, "step": 1314 }, { - "epoch": 0.4, - "grad_norm": 9.339329719543457, - "learning_rate": 1.7365941665831415e-05, - "loss": 2.1992, + "epoch": 0.17, + "grad_norm": 13.481083869934082, + "learning_rate": 1.89005564155127e-05, + "loss": 1.1945, "step": 1315 }, { - "epoch": 0.4, - "grad_norm": 19.66999626159668, - "learning_rate": 1.7363937055227022e-05, - "loss": 3.1317, + "epoch": 0.17, + "grad_norm": 13.902771949768066, + "learning_rate": 1.8899719700456013e-05, + "loss": 4.1782, "step": 1316 }, { - "epoch": 0.4, - "grad_norm": 19.995849609375, - "learning_rate": 1.7361932444622632e-05, - "loss": 2.606, + "epoch": 0.17, + "grad_norm": 15.049631118774414, + "learning_rate": 1.8898882985399324e-05, + "loss": 2.4135, "step": 1317 }, { - "epoch": 0.4, - "grad_norm": 16.427093505859375, - "learning_rate": 1.7359927834018245e-05, - "loss": 1.7637, + "epoch": 0.17, + "grad_norm": 13.669821739196777, + "learning_rate": 1.8898046270342637e-05, + "loss": 3.7981, "step": 1318 }, { - "epoch": 0.4, - "grad_norm": 14.198533058166504, - "learning_rate": 1.7357923223413852e-05, - "loss": 2.904, + "epoch": 0.17, + "grad_norm": 9.362961769104004, + "learning_rate": 1.8897209555285947e-05, + "loss": 1.0743, "step": 1319 }, { - "epoch": 0.4, - "grad_norm": 12.009779930114746, - "learning_rate": 1.7355918612809462e-05, - "loss": 2.2005, - "step": 1320 - }, - { - "epoch": 0.4, - "eval_loss": 0.39250195026397705, - "eval_runtime": 43.6691, - "eval_samples_per_second": 33.868, - "eval_steps_per_second": 33.868, + "epoch": 0.17, + "grad_norm": 14.232973098754883, + "learning_rate": 1.889637284022926e-05, + "loss": 2.9732, "step": 1320 }, { - "epoch": 0.4, - "grad_norm": 11.49736499786377, - "learning_rate": 1.7353914002205072e-05, - "loss": 2.9098, + "epoch": 0.17, + "grad_norm": 14.265029907226562, + "learning_rate": 1.8895536125172575e-05, + "loss": 2.7579, "step": 1321 }, { - "epoch": 0.4, - "grad_norm": 12.25181770324707, - "learning_rate": 1.7351909391600682e-05, - "loss": 2.83, + "epoch": 0.17, + "grad_norm": 12.623398780822754, + "learning_rate": 1.8894699410115885e-05, + "loss": 1.7141, "step": 1322 }, { - "epoch": 0.4, - "grad_norm": 17.131059646606445, - "learning_rate": 1.7349904780996293e-05, - "loss": 2.6729, + "epoch": 0.17, + "grad_norm": 22.552587509155273, + "learning_rate": 1.88938626950592e-05, + "loss": 4.1298, "step": 1323 }, { - "epoch": 0.4, - "grad_norm": 16.65376853942871, - "learning_rate": 1.7347900170391903e-05, - "loss": 2.9689, + "epoch": 0.17, + "grad_norm": 13.751575469970703, + "learning_rate": 1.8893025980002512e-05, + "loss": 2.101, "step": 1324 }, { - "epoch": 0.4, - "grad_norm": 13.16373062133789, - "learning_rate": 1.7345895559787513e-05, - "loss": 2.5196, + "epoch": 0.17, + "grad_norm": 10.549819946289062, + "learning_rate": 1.8892189264945823e-05, + "loss": 2.7987, "step": 1325 }, { - "epoch": 0.4, - "grad_norm": 15.137256622314453, - "learning_rate": 1.7343890949183123e-05, - "loss": 3.2273, + "epoch": 0.17, + "grad_norm": 10.506394386291504, + "learning_rate": 1.8891352549889136e-05, + "loss": 4.0793, "step": 1326 }, { - "epoch": 0.4, - "grad_norm": 12.256842613220215, - "learning_rate": 1.7341886338578733e-05, - "loss": 3.0272, + "epoch": 0.17, + "grad_norm": 12.712037086486816, + "learning_rate": 1.889051583483245e-05, + "loss": 3.0174, "step": 1327 }, { - "epoch": 0.4, - "grad_norm": 46.50150680541992, - "learning_rate": 1.7339881727974343e-05, - "loss": 3.4704, + "epoch": 0.17, + "grad_norm": 13.093432426452637, + "learning_rate": 1.888967911977576e-05, + "loss": 1.6325, "step": 1328 }, { - "epoch": 0.4, - "grad_norm": 23.8785400390625, - "learning_rate": 1.7337877117369953e-05, - "loss": 3.1262, + "epoch": 0.17, + "grad_norm": 10.350591659545898, + "learning_rate": 1.8888842404719074e-05, + "loss": 0.7694, "step": 1329 }, { - "epoch": 0.4, - "grad_norm": 17.71267318725586, - "learning_rate": 1.7335872506765563e-05, - "loss": 2.1541, + "epoch": 0.17, + "grad_norm": 10.909330368041992, + "learning_rate": 1.8888005689662387e-05, + "loss": 1.5713, "step": 1330 }, { - "epoch": 0.4, - "grad_norm": 22.225250244140625, - "learning_rate": 1.7333867896161173e-05, - "loss": 1.5919, + "epoch": 0.17, + "grad_norm": 14.973225593566895, + "learning_rate": 1.8887168974605698e-05, + "loss": 3.2756, "step": 1331 }, { - "epoch": 0.4, - "grad_norm": 16.465585708618164, - "learning_rate": 1.7331863285556783e-05, - "loss": 2.3168, + "epoch": 0.17, + "grad_norm": 9.871883392333984, + "learning_rate": 1.888633225954901e-05, + "loss": 1.5585, "step": 1332 }, { - "epoch": 0.4, - "grad_norm": 24.607051849365234, - "learning_rate": 1.732985867495239e-05, - "loss": 3.1243, + "epoch": 0.17, + "grad_norm": 15.263958930969238, + "learning_rate": 1.8885495544492325e-05, + "loss": 1.3708, "step": 1333 }, { - "epoch": 0.4, - "grad_norm": 19.95473861694336, - "learning_rate": 1.7327854064348003e-05, - "loss": 2.5079, + "epoch": 0.17, + "grad_norm": 15.60806655883789, + "learning_rate": 1.888465882943564e-05, + "loss": 2.9389, "step": 1334 }, { - "epoch": 0.4, - "grad_norm": 19.784517288208008, - "learning_rate": 1.7325849453743614e-05, - "loss": 3.3156, + "epoch": 0.17, + "grad_norm": 13.39920425415039, + "learning_rate": 1.888382211437895e-05, + "loss": 3.3344, "step": 1335 }, { - "epoch": 0.4, - "grad_norm": 13.695775032043457, - "learning_rate": 1.732384484313922e-05, - "loss": 3.1165, + "epoch": 0.17, + "grad_norm": 9.534605026245117, + "learning_rate": 1.8882985399322263e-05, + "loss": 2.4543, "step": 1336 }, { - "epoch": 0.4, - "grad_norm": 11.951885223388672, - "learning_rate": 1.7321840232534834e-05, - "loss": 2.9321, + "epoch": 0.17, + "grad_norm": 11.92212200164795, + "learning_rate": 1.8882148684265576e-05, + "loss": 2.1972, "step": 1337 }, { - "epoch": 0.4, - "grad_norm": 13.507436752319336, - "learning_rate": 1.731983562193044e-05, - "loss": 2.4538, + "epoch": 0.17, + "grad_norm": 12.673669815063477, + "learning_rate": 1.8881311969208886e-05, + "loss": 2.1967, "step": 1338 }, { - "epoch": 0.4, - "grad_norm": 11.957839965820312, - "learning_rate": 1.731783101132605e-05, - "loss": 3.1245, + "epoch": 0.17, + "grad_norm": 15.573685646057129, + "learning_rate": 1.88804752541522e-05, + "loss": 3.0234, "step": 1339 }, { - "epoch": 0.4, - "grad_norm": 14.829548835754395, - "learning_rate": 1.731582640072166e-05, - "loss": 2.3632, + "epoch": 0.17, + "grad_norm": 23.35232162475586, + "learning_rate": 1.8879638539095514e-05, + "loss": 2.9469, "step": 1340 }, { - "epoch": 0.4, - "grad_norm": 14.587291717529297, - "learning_rate": 1.731382179011727e-05, - "loss": 2.8577, + "epoch": 0.17, + "grad_norm": 20.39870834350586, + "learning_rate": 1.8878801824038827e-05, + "loss": 4.6102, "step": 1341 }, { - "epoch": 0.4, - "grad_norm": 13.556631088256836, - "learning_rate": 1.731181717951288e-05, - "loss": 2.0475, + "epoch": 0.17, + "grad_norm": 18.56085205078125, + "learning_rate": 1.8877965108982138e-05, + "loss": 3.432, "step": 1342 }, { - "epoch": 0.4, - "grad_norm": 13.010260581970215, - "learning_rate": 1.730981256890849e-05, - "loss": 2.6603, + "epoch": 0.17, + "grad_norm": 15.179999351501465, + "learning_rate": 1.887712839392545e-05, + "loss": 4.0547, "step": 1343 }, { - "epoch": 0.4, - "grad_norm": 11.09408187866211, - "learning_rate": 1.73078079583041e-05, - "loss": 2.5755, + "epoch": 0.17, + "grad_norm": 11.116223335266113, + "learning_rate": 1.8876291678868765e-05, + "loss": 1.5694, "step": 1344 }, { - "epoch": 0.4, - "grad_norm": 13.849974632263184, - "learning_rate": 1.730580334769971e-05, - "loss": 2.8472, + "epoch": 0.17, + "grad_norm": 12.547582626342773, + "learning_rate": 1.8875454963812075e-05, + "loss": 4.2236, "step": 1345 }, { - "epoch": 0.4, - "grad_norm": 21.211544036865234, - "learning_rate": 1.730379873709532e-05, - "loss": 2.7465, + "epoch": 0.17, + "grad_norm": 10.222655296325684, + "learning_rate": 1.887461824875539e-05, + "loss": 4.0636, "step": 1346 }, { - "epoch": 0.4, - "grad_norm": 15.509568214416504, - "learning_rate": 1.730179412649093e-05, - "loss": 3.4552, + "epoch": 0.17, + "grad_norm": 8.905462265014648, + "learning_rate": 1.88737815336987e-05, + "loss": 2.6271, "step": 1347 }, { - "epoch": 0.41, - "grad_norm": 13.904120445251465, - "learning_rate": 1.729978951588654e-05, - "loss": 2.9963, + "epoch": 0.17, + "grad_norm": 12.223560333251953, + "learning_rate": 1.8872944818642013e-05, + "loss": 2.2671, "step": 1348 }, { - "epoch": 0.41, - "grad_norm": 17.5559024810791, - "learning_rate": 1.729778490528215e-05, - "loss": 2.4842, + "epoch": 0.17, + "grad_norm": 16.041154861450195, + "learning_rate": 1.8872108103585326e-05, + "loss": 3.0192, "step": 1349 }, { - "epoch": 0.41, - "grad_norm": 15.476587295532227, - "learning_rate": 1.729578029467776e-05, - "loss": 2.5928, + "epoch": 0.17, + "grad_norm": 24.999361038208008, + "learning_rate": 1.8871271388528637e-05, + "loss": 2.1426, "step": 1350 }, { - "epoch": 0.41, - "grad_norm": 15.32288932800293, - "learning_rate": 1.729377568407337e-05, - "loss": 3.4517, + "epoch": 0.17, + "grad_norm": 12.167609214782715, + "learning_rate": 1.887043467347195e-05, + "loss": 2.9919, "step": 1351 }, { - "epoch": 0.41, - "grad_norm": 15.49992561340332, - "learning_rate": 1.7291771073468978e-05, - "loss": 3.042, + "epoch": 0.17, + "grad_norm": 11.232583045959473, + "learning_rate": 1.886959795841526e-05, + "loss": 3.7923, "step": 1352 }, { - "epoch": 0.41, - "grad_norm": 13.955615043640137, - "learning_rate": 1.728976646286459e-05, - "loss": 2.5861, + "epoch": 0.17, + "grad_norm": 11.472769737243652, + "learning_rate": 1.8868761243358574e-05, + "loss": 2.5913, "step": 1353 }, { - "epoch": 0.41, - "grad_norm": 12.550683975219727, - "learning_rate": 1.7287761852260202e-05, - "loss": 3.6951, + "epoch": 0.17, + "grad_norm": 24.40163230895996, + "learning_rate": 1.8867924528301888e-05, + "loss": 3.6598, "step": 1354 }, { - "epoch": 0.41, - "grad_norm": 22.899044036865234, - "learning_rate": 1.728575724165581e-05, - "loss": 2.9966, + "epoch": 0.17, + "grad_norm": 12.834221839904785, + "learning_rate": 1.88670878132452e-05, + "loss": 2.6269, "step": 1355 }, { - "epoch": 0.41, - "grad_norm": 12.468682289123535, - "learning_rate": 1.7283752631051422e-05, - "loss": 2.1763, + "epoch": 0.17, + "grad_norm": 14.640671730041504, + "learning_rate": 1.8866251098188512e-05, + "loss": 3.9581, "step": 1356 }, { - "epoch": 0.41, - "grad_norm": 22.829370498657227, - "learning_rate": 1.728174802044703e-05, - "loss": 3.2563, + "epoch": 0.17, + "grad_norm": 14.906844139099121, + "learning_rate": 1.8865414383131825e-05, + "loss": 3.0435, "step": 1357 }, { - "epoch": 0.41, - "grad_norm": 27.643447875976562, - "learning_rate": 1.727974340984264e-05, - "loss": 2.7474, + "epoch": 0.17, + "grad_norm": 11.10807991027832, + "learning_rate": 1.886457766807514e-05, + "loss": 3.1026, "step": 1358 }, { - "epoch": 0.41, - "grad_norm": 17.19843101501465, - "learning_rate": 1.727773879923825e-05, - "loss": 2.9442, + "epoch": 0.17, + "grad_norm": 16.644826889038086, + "learning_rate": 1.886374095301845e-05, + "loss": 1.8461, "step": 1359 }, { - "epoch": 0.41, - "grad_norm": 13.734451293945312, - "learning_rate": 1.727573418863386e-05, - "loss": 2.8518, + "epoch": 0.17, + "grad_norm": 11.831157684326172, + "learning_rate": 1.8862904237961763e-05, + "loss": 1.422, "step": 1360 }, { - "epoch": 0.41, - "grad_norm": 11.024998664855957, - "learning_rate": 1.727372957802947e-05, - "loss": 2.2833, + "epoch": 0.17, + "grad_norm": 19.2175350189209, + "learning_rate": 1.8862067522905077e-05, + "loss": 2.5451, "step": 1361 }, { - "epoch": 0.41, - "grad_norm": 19.57155990600586, - "learning_rate": 1.727172496742508e-05, - "loss": 2.9668, + "epoch": 0.17, + "grad_norm": 15.262014389038086, + "learning_rate": 1.886123080784839e-05, + "loss": 2.6348, "step": 1362 }, { - "epoch": 0.41, - "grad_norm": 24.14830207824707, - "learning_rate": 1.726972035682069e-05, - "loss": 2.2259, + "epoch": 0.17, + "grad_norm": 10.91917896270752, + "learning_rate": 1.88603940927917e-05, + "loss": 2.6969, "step": 1363 }, { - "epoch": 0.41, - "grad_norm": 13.887985229492188, - "learning_rate": 1.72677157462163e-05, - "loss": 2.4764, + "epoch": 0.17, + "grad_norm": 11.875226020812988, + "learning_rate": 1.8859557377735014e-05, + "loss": 2.2805, "step": 1364 }, { - "epoch": 0.41, - "grad_norm": 11.10314655303955, - "learning_rate": 1.726571113561191e-05, - "loss": 2.1367, + "epoch": 0.17, + "grad_norm": 21.78172492980957, + "learning_rate": 1.8858720662678328e-05, + "loss": 2.5054, "step": 1365 }, { - "epoch": 0.41, - "grad_norm": 11.575247764587402, - "learning_rate": 1.726370652500752e-05, - "loss": 2.396, + "epoch": 0.17, + "grad_norm": 14.415453910827637, + "learning_rate": 1.8857883947621638e-05, + "loss": 2.788, "step": 1366 }, { - "epoch": 0.41, - "grad_norm": 17.140932083129883, - "learning_rate": 1.726170191440313e-05, - "loss": 3.0233, + "epoch": 0.17, + "grad_norm": 9.646689414978027, + "learning_rate": 1.8857047232564952e-05, + "loss": 1.9306, "step": 1367 }, { - "epoch": 0.41, - "grad_norm": 18.503704071044922, - "learning_rate": 1.725969730379874e-05, - "loss": 3.2255, + "epoch": 0.17, + "grad_norm": 11.422354698181152, + "learning_rate": 1.8856210517508265e-05, + "loss": 1.9116, "step": 1368 }, { - "epoch": 0.41, - "grad_norm": 14.695094108581543, - "learning_rate": 1.7257692693194346e-05, - "loss": 3.1332, + "epoch": 0.17, + "grad_norm": 19.322690963745117, + "learning_rate": 1.885537380245158e-05, + "loss": 4.9606, "step": 1369 }, { - "epoch": 0.41, - "grad_norm": 12.079935073852539, - "learning_rate": 1.725568808258996e-05, - "loss": 2.4825, + "epoch": 0.17, + "grad_norm": 11.154664993286133, + "learning_rate": 1.885453708739489e-05, + "loss": 3.4789, "step": 1370 }, { - "epoch": 0.41, - "grad_norm": 23.080753326416016, - "learning_rate": 1.7253683471985566e-05, - "loss": 2.7521, + "epoch": 0.17, + "grad_norm": 16.747013092041016, + "learning_rate": 1.8853700372338203e-05, + "loss": 3.9088, "step": 1371 }, { - "epoch": 0.41, - "grad_norm": 13.837876319885254, - "learning_rate": 1.7251678861381176e-05, - "loss": 2.5238, + "epoch": 0.17, + "grad_norm": 13.053373336791992, + "learning_rate": 1.8852863657281513e-05, + "loss": 3.8792, "step": 1372 }, { - "epoch": 0.41, - "grad_norm": 19.977251052856445, - "learning_rate": 1.724967425077679e-05, - "loss": 3.2955, + "epoch": 0.17, + "grad_norm": 10.83530044555664, + "learning_rate": 1.8852026942224827e-05, + "loss": 3.0796, "step": 1373 }, { - "epoch": 0.41, - "grad_norm": 13.569607734680176, - "learning_rate": 1.7247669640172397e-05, - "loss": 2.7767, + "epoch": 0.17, + "grad_norm": 15.559985160827637, + "learning_rate": 1.885119022716814e-05, + "loss": 4.2939, "step": 1374 }, { - "epoch": 0.41, - "grad_norm": 14.404850959777832, - "learning_rate": 1.7245665029568007e-05, - "loss": 2.6134, + "epoch": 0.17, + "grad_norm": 11.467248916625977, + "learning_rate": 1.885035351211145e-05, + "loss": 1.7113, "step": 1375 }, { - "epoch": 0.41, - "grad_norm": 16.575212478637695, - "learning_rate": 1.7243660418963617e-05, - "loss": 1.7531, + "epoch": 0.17, + "grad_norm": 9.12670612335205, + "learning_rate": 1.8849516797054764e-05, + "loss": 2.0589, "step": 1376 }, { - "epoch": 0.41, - "grad_norm": 21.60881233215332, - "learning_rate": 1.7241655808359227e-05, - "loss": 2.9619, + "epoch": 0.17, + "grad_norm": 9.2252197265625, + "learning_rate": 1.8848680081998078e-05, + "loss": 1.4876, "step": 1377 }, { - "epoch": 0.41, - "grad_norm": 17.051956176757812, - "learning_rate": 1.7239651197754837e-05, - "loss": 3.0801, + "epoch": 0.17, + "grad_norm": 12.349713325500488, + "learning_rate": 1.884784336694139e-05, + "loss": 1.7325, "step": 1378 }, { - "epoch": 0.41, - "grad_norm": 37.58320999145508, - "learning_rate": 1.7237646587150447e-05, - "loss": 2.3241, + "epoch": 0.17, + "grad_norm": 16.01396369934082, + "learning_rate": 1.8847006651884702e-05, + "loss": 2.7937, "step": 1379 }, { - "epoch": 0.41, - "grad_norm": 12.96102237701416, - "learning_rate": 1.7235641976546057e-05, - "loss": 2.2837, + "epoch": 0.17, + "grad_norm": 13.64192008972168, + "learning_rate": 1.8846169936828012e-05, + "loss": 2.941, "step": 1380 }, { - "epoch": 0.42, - "grad_norm": 34.093780517578125, - "learning_rate": 1.7233637365941667e-05, - "loss": 2.8085, + "epoch": 0.17, + "grad_norm": 16.084604263305664, + "learning_rate": 1.8845333221771326e-05, + "loss": 2.1773, "step": 1381 }, { - "epoch": 0.42, - "grad_norm": 17.705703735351562, - "learning_rate": 1.7231632755337277e-05, - "loss": 3.4047, + "epoch": 0.17, + "grad_norm": 20.62796401977539, + "learning_rate": 1.884449650671464e-05, + "loss": 1.8308, "step": 1382 }, { - "epoch": 0.42, - "grad_norm": 16.184062957763672, - "learning_rate": 1.7229628144732887e-05, - "loss": 3.4847, + "epoch": 0.17, + "grad_norm": 18.79698371887207, + "learning_rate": 1.8843659791657953e-05, + "loss": 2.6251, "step": 1383 }, { - "epoch": 0.42, - "grad_norm": 12.74220085144043, - "learning_rate": 1.7227623534128497e-05, - "loss": 1.8471, + "epoch": 0.17, + "grad_norm": 27.760475158691406, + "learning_rate": 1.8842823076601263e-05, + "loss": 5.3229, "step": 1384 }, { - "epoch": 0.42, - "grad_norm": 26.621505737304688, - "learning_rate": 1.7225618923524108e-05, - "loss": 3.4508, + "epoch": 0.17, + "grad_norm": 11.442704200744629, + "learning_rate": 1.8841986361544577e-05, + "loss": 2.6164, "step": 1385 }, { - "epoch": 0.42, - "grad_norm": 23.22762107849121, - "learning_rate": 1.7223614312919718e-05, - "loss": 2.8567, + "epoch": 0.17, + "grad_norm": 21.72187042236328, + "learning_rate": 1.884114964648789e-05, + "loss": 2.2087, "step": 1386 }, { - "epoch": 0.42, - "grad_norm": 13.235843658447266, - "learning_rate": 1.7221609702315328e-05, - "loss": 2.368, + "epoch": 0.17, + "grad_norm": 18.298065185546875, + "learning_rate": 1.88403129314312e-05, + "loss": 2.0928, "step": 1387 }, { - "epoch": 0.42, - "grad_norm": 10.793066024780273, - "learning_rate": 1.7219605091710934e-05, - "loss": 1.9442, + "epoch": 0.17, + "grad_norm": 14.109087944030762, + "learning_rate": 1.8839476216374515e-05, + "loss": 2.6915, "step": 1388 }, { - "epoch": 0.42, - "grad_norm": 18.45147705078125, - "learning_rate": 1.7217600481106548e-05, - "loss": 3.3398, + "epoch": 0.17, + "grad_norm": 25.29339027404785, + "learning_rate": 1.883863950131783e-05, + "loss": 3.4751, "step": 1389 }, { - "epoch": 0.42, - "grad_norm": 14.921577453613281, - "learning_rate": 1.7215595870502155e-05, - "loss": 2.1594, + "epoch": 0.17, + "grad_norm": 14.645580291748047, + "learning_rate": 1.8837802786261142e-05, + "loss": 2.9767, "step": 1390 }, { - "epoch": 0.42, - "grad_norm": 14.961502075195312, - "learning_rate": 1.7213591259897765e-05, - "loss": 3.3974, + "epoch": 0.17, + "grad_norm": 9.96379566192627, + "learning_rate": 1.8836966071204452e-05, + "loss": 4.0804, "step": 1391 }, { - "epoch": 0.42, - "grad_norm": 19.07111167907715, - "learning_rate": 1.7211586649293378e-05, - "loss": 3.1362, + "epoch": 0.17, + "grad_norm": 16.089723587036133, + "learning_rate": 1.8836129356147766e-05, + "loss": 2.7156, "step": 1392 }, { - "epoch": 0.42, - "grad_norm": 10.949179649353027, - "learning_rate": 1.7209582038688985e-05, - "loss": 2.0993, + "epoch": 0.17, + "grad_norm": 42.31618118286133, + "learning_rate": 1.883529264109108e-05, + "loss": 2.8604, "step": 1393 }, { - "epoch": 0.42, - "grad_norm": 12.562166213989258, - "learning_rate": 1.7207577428084595e-05, - "loss": 2.0554, + "epoch": 0.17, + "grad_norm": 12.02553653717041, + "learning_rate": 1.883445592603439e-05, + "loss": 3.5257, "step": 1394 }, { - "epoch": 0.42, - "grad_norm": 11.35824966430664, - "learning_rate": 1.7205572817480205e-05, - "loss": 2.1912, + "epoch": 0.18, + "grad_norm": 24.402360916137695, + "learning_rate": 1.8833619210977703e-05, + "loss": 3.3099, "step": 1395 }, { - "epoch": 0.42, - "grad_norm": 16.764873504638672, - "learning_rate": 1.7203568206875815e-05, - "loss": 2.5602, + "epoch": 0.18, + "grad_norm": 13.76604175567627, + "learning_rate": 1.8832782495921017e-05, + "loss": 2.6733, "step": 1396 }, { - "epoch": 0.42, - "grad_norm": 14.467211723327637, - "learning_rate": 1.7201563596271425e-05, - "loss": 2.6329, + "epoch": 0.18, + "grad_norm": 15.612774848937988, + "learning_rate": 1.883194578086433e-05, + "loss": 2.981, "step": 1397 }, { - "epoch": 0.42, - "grad_norm": 11.931891441345215, - "learning_rate": 1.7199558985667035e-05, - "loss": 2.2821, + "epoch": 0.18, + "grad_norm": 14.17855453491211, + "learning_rate": 1.883110906580764e-05, + "loss": 1.7441, "step": 1398 }, { - "epoch": 0.42, - "grad_norm": 22.51726722717285, - "learning_rate": 1.7197554375062645e-05, - "loss": 2.7927, + "epoch": 0.18, + "grad_norm": 18.615697860717773, + "learning_rate": 1.8830272350750955e-05, + "loss": 2.8956, "step": 1399 }, { - "epoch": 0.42, - "grad_norm": 10.18954086303711, - "learning_rate": 1.7195549764458255e-05, - "loss": 1.9007, + "epoch": 0.18, + "grad_norm": 33.42338562011719, + "learning_rate": 1.8829435635694265e-05, + "loss": 3.1567, "step": 1400 }, { - "epoch": 0.42, - "grad_norm": 15.76696491241455, - "learning_rate": 1.7193545153853866e-05, - "loss": 2.0431, + "epoch": 0.18, + "grad_norm": 14.481989860534668, + "learning_rate": 1.882859892063758e-05, + "loss": 2.4805, "step": 1401 }, { - "epoch": 0.42, - "grad_norm": 11.40339183807373, - "learning_rate": 1.7191540543249476e-05, - "loss": 2.5742, + "epoch": 0.18, + "grad_norm": 15.88669490814209, + "learning_rate": 1.8827762205580892e-05, + "loss": 1.9319, "step": 1402 }, { - "epoch": 0.42, - "grad_norm": 12.021610260009766, - "learning_rate": 1.7189535932645086e-05, - "loss": 3.6196, + "epoch": 0.18, + "grad_norm": 21.62628173828125, + "learning_rate": 1.8826925490524202e-05, + "loss": 1.8743, "step": 1403 }, { - "epoch": 0.42, - "grad_norm": 19.796781539916992, - "learning_rate": 1.7187531322040696e-05, - "loss": 2.8469, + "epoch": 0.18, + "grad_norm": 15.85845947265625, + "learning_rate": 1.8826088775467516e-05, + "loss": 3.7623, "step": 1404 }, { - "epoch": 0.42, - "grad_norm": 14.170371055603027, - "learning_rate": 1.7185526711436306e-05, - "loss": 2.3962, + "epoch": 0.18, + "grad_norm": 12.276082038879395, + "learning_rate": 1.8825252060410826e-05, + "loss": 1.6724, "step": 1405 }, { - "epoch": 0.42, - "grad_norm": 11.088910102844238, - "learning_rate": 1.7183522100831916e-05, - "loss": 2.6827, + "epoch": 0.18, + "grad_norm": 17.465612411499023, + "learning_rate": 1.882441534535414e-05, + "loss": 3.3751, "step": 1406 }, { - "epoch": 0.42, - "grad_norm": 136.58908081054688, - "learning_rate": 1.7181517490227523e-05, - "loss": 2.3091, + "epoch": 0.18, + "grad_norm": 6.7463765144348145, + "learning_rate": 1.8823578630297454e-05, + "loss": 1.2683, "step": 1407 }, { - "epoch": 0.42, - "grad_norm": 12.442429542541504, - "learning_rate": 1.7179512879623136e-05, - "loss": 2.4494, + "epoch": 0.18, + "grad_norm": 17.999980926513672, + "learning_rate": 1.8822741915240764e-05, + "loss": 1.6499, "step": 1408 }, { - "epoch": 0.42, - "grad_norm": 14.868009567260742, - "learning_rate": 1.7177508269018746e-05, - "loss": 2.5188, + "epoch": 0.18, + "grad_norm": 8.493429183959961, + "learning_rate": 1.8821905200184078e-05, + "loss": 2.7235, "step": 1409 }, { - "epoch": 0.42, - "grad_norm": 12.164691925048828, - "learning_rate": 1.7175503658414353e-05, - "loss": 2.378, + "epoch": 0.18, + "grad_norm": 68.3912124633789, + "learning_rate": 1.882106848512739e-05, + "loss": 4.2291, "step": 1410 }, { - "epoch": 0.42, - "grad_norm": 15.50973129272461, - "learning_rate": 1.7173499047809966e-05, - "loss": 4.2229, + "epoch": 0.18, + "grad_norm": 6.770368576049805, + "learning_rate": 1.8820231770070705e-05, + "loss": 0.8155, "step": 1411 }, { - "epoch": 0.42, - "grad_norm": 16.49388885498047, - "learning_rate": 1.7171494437205573e-05, - "loss": 2.5579, + "epoch": 0.18, + "grad_norm": 16.140487670898438, + "learning_rate": 1.8819395055014015e-05, + "loss": 2.7073, "step": 1412 }, { - "epoch": 0.42, - "grad_norm": 14.063419342041016, - "learning_rate": 1.7169489826601183e-05, - "loss": 3.0148, + "epoch": 0.18, + "grad_norm": 24.126846313476562, + "learning_rate": 1.881855833995733e-05, + "loss": 2.2317, "step": 1413 }, { - "epoch": 0.43, - "grad_norm": 37.99775314331055, - "learning_rate": 1.7167485215996793e-05, - "loss": 2.9612, + "epoch": 0.18, + "grad_norm": 10.037046432495117, + "learning_rate": 1.8817721624900642e-05, + "loss": 2.8032, "step": 1414 }, { - "epoch": 0.43, - "grad_norm": 18.0521183013916, - "learning_rate": 1.7165480605392403e-05, - "loss": 2.871, + "epoch": 0.18, + "grad_norm": 9.873966217041016, + "learning_rate": 1.8816884909843953e-05, + "loss": 2.6386, "step": 1415 }, { - "epoch": 0.43, - "grad_norm": 17.04564094543457, - "learning_rate": 1.7163475994788013e-05, - "loss": 2.6688, + "epoch": 0.18, + "grad_norm": 16.003158569335938, + "learning_rate": 1.8816048194787266e-05, + "loss": 2.1387, "step": 1416 }, { - "epoch": 0.43, - "grad_norm": 13.493517875671387, - "learning_rate": 1.7161471384183624e-05, - "loss": 2.1012, + "epoch": 0.18, + "grad_norm": 14.439873695373535, + "learning_rate": 1.881521147973058e-05, + "loss": 2.8796, "step": 1417 }, { - "epoch": 0.43, - "grad_norm": 13.597625732421875, - "learning_rate": 1.7159466773579234e-05, - "loss": 3.3529, + "epoch": 0.18, + "grad_norm": 24.364248275756836, + "learning_rate": 1.8814374764673894e-05, + "loss": 2.6972, "step": 1418 }, { - "epoch": 0.43, - "grad_norm": 17.114131927490234, - "learning_rate": 1.7157462162974844e-05, - "loss": 3.2373, + "epoch": 0.18, + "grad_norm": 19.170869827270508, + "learning_rate": 1.8813538049617204e-05, + "loss": 3.3193, "step": 1419 }, { - "epoch": 0.43, - "grad_norm": 18.538379669189453, - "learning_rate": 1.7155457552370454e-05, - "loss": 2.4973, + "epoch": 0.18, + "grad_norm": 13.805689811706543, + "learning_rate": 1.8812701334560518e-05, + "loss": 4.1914, "step": 1420 }, { - "epoch": 0.43, - "grad_norm": 23.284835815429688, - "learning_rate": 1.7153452941766064e-05, - "loss": 3.6339, + "epoch": 0.18, + "grad_norm": 17.165374755859375, + "learning_rate": 1.881186461950383e-05, + "loss": 2.8776, "step": 1421 }, { - "epoch": 0.43, - "grad_norm": 19.382251739501953, - "learning_rate": 1.7151448331161674e-05, - "loss": 3.1989, + "epoch": 0.18, + "grad_norm": 11.152364730834961, + "learning_rate": 1.881102790444714e-05, + "loss": 2.1746, "step": 1422 }, { - "epoch": 0.43, - "grad_norm": 17.688274383544922, - "learning_rate": 1.7149443720557284e-05, - "loss": 2.1218, + "epoch": 0.18, + "grad_norm": 19.310604095458984, + "learning_rate": 1.8810191189390455e-05, + "loss": 1.6934, "step": 1423 }, { - "epoch": 0.43, - "grad_norm": 24.966703414916992, - "learning_rate": 1.7147439109952894e-05, - "loss": 2.9954, + "epoch": 0.18, + "grad_norm": 25.408475875854492, + "learning_rate": 1.880935447433377e-05, + "loss": 2.5396, "step": 1424 }, { - "epoch": 0.43, - "grad_norm": 31.188905715942383, - "learning_rate": 1.7145434499348504e-05, - "loss": 2.9035, + "epoch": 0.18, + "grad_norm": 18.383811950683594, + "learning_rate": 1.880851775927708e-05, + "loss": 2.7541, "step": 1425 }, { - "epoch": 0.43, - "grad_norm": 13.262977600097656, - "learning_rate": 1.714342988874411e-05, - "loss": 1.3303, + "epoch": 0.18, + "grad_norm": 11.278987884521484, + "learning_rate": 1.8807681044220393e-05, + "loss": 1.5525, "step": 1426 }, { - "epoch": 0.43, - "grad_norm": 13.84818172454834, - "learning_rate": 1.7141425278139724e-05, - "loss": 2.3282, + "epoch": 0.18, + "grad_norm": 10.96129322052002, + "learning_rate": 1.8806844329163706e-05, + "loss": 1.641, "step": 1427 }, { - "epoch": 0.43, - "grad_norm": 55.73940658569336, - "learning_rate": 1.7139420667535334e-05, - "loss": 3.5851, + "epoch": 0.18, + "grad_norm": 21.37160873413086, + "learning_rate": 1.8806007614107017e-05, + "loss": 2.2955, "step": 1428 }, { - "epoch": 0.43, - "grad_norm": 11.50109577178955, - "learning_rate": 1.713741605693094e-05, - "loss": 2.6434, + "epoch": 0.18, + "grad_norm": 13.814507484436035, + "learning_rate": 1.880517089905033e-05, + "loss": 3.0471, "step": 1429 }, { - "epoch": 0.43, - "grad_norm": 17.894203186035156, - "learning_rate": 1.7135411446326555e-05, - "loss": 2.5986, + "epoch": 0.18, + "grad_norm": 12.956681251525879, + "learning_rate": 1.880433418399364e-05, + "loss": 1.7278, "step": 1430 }, { - "epoch": 0.43, - "grad_norm": 33.426055908203125, - "learning_rate": 1.713340683572216e-05, - "loss": 2.7803, + "epoch": 0.18, + "grad_norm": 12.984116554260254, + "learning_rate": 1.8803497468936954e-05, + "loss": 2.5082, "step": 1431 }, { - "epoch": 0.43, - "grad_norm": 10.518821716308594, - "learning_rate": 1.713140222511777e-05, - "loss": 2.1774, + "epoch": 0.18, + "grad_norm": 14.6563138961792, + "learning_rate": 1.8802660753880268e-05, + "loss": 3.9653, "step": 1432 }, { - "epoch": 0.43, - "grad_norm": 13.691694259643555, - "learning_rate": 1.712939761451338e-05, - "loss": 2.4883, + "epoch": 0.18, + "grad_norm": 6.648709297180176, + "learning_rate": 1.8801824038823578e-05, + "loss": 0.3524, "step": 1433 }, { - "epoch": 0.43, - "grad_norm": 19.365102767944336, - "learning_rate": 1.712739300390899e-05, - "loss": 2.6138, + "epoch": 0.18, + "grad_norm": 17.727685928344727, + "learning_rate": 1.8800987323766892e-05, + "loss": 2.1727, "step": 1434 }, { - "epoch": 0.43, - "grad_norm": 31.23158836364746, - "learning_rate": 1.71253883933046e-05, - "loss": 2.905, + "epoch": 0.18, + "grad_norm": 23.565282821655273, + "learning_rate": 1.8800150608710205e-05, + "loss": 2.1334, "step": 1435 }, { - "epoch": 0.43, - "grad_norm": 23.67169761657715, - "learning_rate": 1.7123383782700212e-05, - "loss": 3.4515, + "epoch": 0.18, + "grad_norm": 13.650922775268555, + "learning_rate": 1.8799313893653516e-05, + "loss": 2.2225, "step": 1436 }, { - "epoch": 0.43, - "grad_norm": 11.991448402404785, - "learning_rate": 1.7121379172095822e-05, - "loss": 2.562, + "epoch": 0.18, + "grad_norm": 10.054190635681152, + "learning_rate": 1.879847717859683e-05, + "loss": 3.5198, "step": 1437 }, { - "epoch": 0.43, - "grad_norm": 34.79066467285156, - "learning_rate": 1.7119374561491432e-05, - "loss": 3.1665, + "epoch": 0.18, + "grad_norm": 11.719369888305664, + "learning_rate": 1.8797640463540143e-05, + "loss": 1.1041, "step": 1438 }, { - "epoch": 0.43, - "grad_norm": 24.111236572265625, - "learning_rate": 1.7117369950887042e-05, - "loss": 3.2707, + "epoch": 0.18, + "grad_norm": 16.8707218170166, + "learning_rate": 1.8796803748483457e-05, + "loss": 2.3575, "step": 1439 }, { - "epoch": 0.43, - "grad_norm": 36.61920928955078, - "learning_rate": 1.7115365340282652e-05, - "loss": 2.3113, - "step": 1440 - }, - { - "epoch": 0.43, - "eval_loss": 0.4355357885360718, - "eval_runtime": 43.4342, - "eval_samples_per_second": 34.051, - "eval_steps_per_second": 34.051, + "epoch": 0.18, + "grad_norm": 13.826940536499023, + "learning_rate": 1.8795967033426767e-05, + "loss": 2.2994, "step": 1440 }, { - "epoch": 0.43, - "grad_norm": 25.45790672302246, - "learning_rate": 1.7113360729678262e-05, - "loss": 2.8492, + "epoch": 0.18, + "grad_norm": 13.495528221130371, + "learning_rate": 1.879513031837008e-05, + "loss": 1.6467, "step": 1441 }, { - "epoch": 0.43, - "grad_norm": 8.575899124145508, - "learning_rate": 1.7111356119073872e-05, - "loss": 2.934, + "epoch": 0.18, + "grad_norm": 8.298262596130371, + "learning_rate": 1.8794293603313394e-05, + "loss": 2.4304, "step": 1442 }, { - "epoch": 0.43, - "grad_norm": 14.703217506408691, - "learning_rate": 1.710935150846948e-05, - "loss": 2.8837, + "epoch": 0.18, + "grad_norm": 11.978004455566406, + "learning_rate": 1.8793456888256704e-05, + "loss": 1.8029, "step": 1443 }, { - "epoch": 0.43, - "grad_norm": 45.12884521484375, - "learning_rate": 1.7107346897865092e-05, - "loss": 4.0134, + "epoch": 0.18, + "grad_norm": 17.661035537719727, + "learning_rate": 1.8792620173200018e-05, + "loss": 1.3806, "step": 1444 }, { - "epoch": 0.43, - "grad_norm": 17.04209327697754, - "learning_rate": 1.71053422872607e-05, - "loss": 2.9208, + "epoch": 0.18, + "grad_norm": 16.51836585998535, + "learning_rate": 1.879178345814333e-05, + "loss": 2.9111, "step": 1445 }, { - "epoch": 0.43, - "grad_norm": 11.436484336853027, - "learning_rate": 1.710333767665631e-05, - "loss": 2.5288, + "epoch": 0.18, + "grad_norm": 40.740848541259766, + "learning_rate": 1.8790946743086645e-05, + "loss": 2.2159, "step": 1446 }, { - "epoch": 0.44, - "grad_norm": 11.852707862854004, - "learning_rate": 1.7101333066051923e-05, - "loss": 2.4064, + "epoch": 0.18, + "grad_norm": 12.385976791381836, + "learning_rate": 1.8790110028029956e-05, + "loss": 2.6437, "step": 1447 }, { - "epoch": 0.44, - "grad_norm": 18.86704444885254, - "learning_rate": 1.709932845544753e-05, - "loss": 2.924, + "epoch": 0.18, + "grad_norm": 19.24806022644043, + "learning_rate": 1.878927331297327e-05, + "loss": 2.9698, "step": 1448 }, { - "epoch": 0.44, - "grad_norm": 26.129491806030273, - "learning_rate": 1.7097323844843143e-05, - "loss": 2.4179, + "epoch": 0.18, + "grad_norm": 31.226917266845703, + "learning_rate": 1.8788436597916583e-05, + "loss": 4.2987, "step": 1449 }, { - "epoch": 0.44, - "grad_norm": 13.224143028259277, - "learning_rate": 1.709531923423875e-05, - "loss": 3.0095, + "epoch": 0.18, + "grad_norm": 13.335999488830566, + "learning_rate": 1.8787599882859893e-05, + "loss": 3.1112, "step": 1450 }, { - "epoch": 0.44, - "grad_norm": 12.908326148986816, - "learning_rate": 1.709331462363436e-05, - "loss": 2.6045, + "epoch": 0.18, + "grad_norm": 13.00089168548584, + "learning_rate": 1.8786763167803207e-05, + "loss": 5.4781, "step": 1451 }, { - "epoch": 0.44, - "grad_norm": 17.967126846313477, - "learning_rate": 1.7091310013029973e-05, - "loss": 1.9252, + "epoch": 0.18, + "grad_norm": 17.49352264404297, + "learning_rate": 1.878592645274652e-05, + "loss": 3.781, "step": 1452 }, { - "epoch": 0.44, - "grad_norm": 15.611589431762695, - "learning_rate": 1.708930540242558e-05, - "loss": 3.118, + "epoch": 0.18, + "grad_norm": 12.117816925048828, + "learning_rate": 1.878508973768983e-05, + "loss": 2.2466, "step": 1453 }, { - "epoch": 0.44, - "grad_norm": 19.359302520751953, - "learning_rate": 1.708730079182119e-05, - "loss": 2.8996, + "epoch": 0.18, + "grad_norm": 18.171384811401367, + "learning_rate": 1.8784253022633144e-05, + "loss": 2.1827, "step": 1454 }, { - "epoch": 0.44, - "grad_norm": 15.188103675842285, - "learning_rate": 1.70852961812168e-05, - "loss": 3.3204, + "epoch": 0.18, + "grad_norm": 14.677664756774902, + "learning_rate": 1.8783416307576458e-05, + "loss": 4.503, "step": 1455 }, { - "epoch": 0.44, - "grad_norm": 24.560422897338867, - "learning_rate": 1.708329157061241e-05, - "loss": 2.7835, + "epoch": 0.18, + "grad_norm": 7.858498573303223, + "learning_rate": 1.8782579592519768e-05, + "loss": 1.752, "step": 1456 }, { - "epoch": 0.44, - "grad_norm": 13.218994140625, - "learning_rate": 1.708128696000802e-05, - "loss": 2.3175, + "epoch": 0.18, + "grad_norm": 16.817955017089844, + "learning_rate": 1.8781742877463082e-05, + "loss": 1.8967, "step": 1457 }, { - "epoch": 0.44, - "grad_norm": 13.680419921875, - "learning_rate": 1.707928234940363e-05, - "loss": 2.2045, + "epoch": 0.18, + "grad_norm": 9.136285781860352, + "learning_rate": 1.8780906162406392e-05, + "loss": 2.2303, "step": 1458 }, { - "epoch": 0.44, - "grad_norm": 15.42464828491211, - "learning_rate": 1.707727773879924e-05, - "loss": 2.7285, + "epoch": 0.18, + "grad_norm": 10.163810729980469, + "learning_rate": 1.8780069447349706e-05, + "loss": 4.6567, "step": 1459 }, { - "epoch": 0.44, - "grad_norm": 25.709156036376953, - "learning_rate": 1.707527312819485e-05, - "loss": 3.6841, + "epoch": 0.18, + "grad_norm": 15.327980995178223, + "learning_rate": 1.877923273229302e-05, + "loss": 2.154, "step": 1460 }, { - "epoch": 0.44, - "grad_norm": 15.961677551269531, - "learning_rate": 1.707326851759046e-05, - "loss": 3.4276, + "epoch": 0.18, + "grad_norm": 17.83934783935547, + "learning_rate": 1.877839601723633e-05, + "loss": 2.4298, "step": 1461 }, { - "epoch": 0.44, - "grad_norm": 15.101017951965332, - "learning_rate": 1.7071263906986067e-05, - "loss": 2.935, + "epoch": 0.18, + "grad_norm": 11.144344329833984, + "learning_rate": 1.8777559302179643e-05, + "loss": 1.4427, "step": 1462 }, { - "epoch": 0.44, - "grad_norm": 15.92037582397461, - "learning_rate": 1.706925929638168e-05, - "loss": 2.3067, + "epoch": 0.18, + "grad_norm": 13.788614273071289, + "learning_rate": 1.8776722587122957e-05, + "loss": 2.1013, "step": 1463 }, { - "epoch": 0.44, - "grad_norm": 17.06378173828125, - "learning_rate": 1.7067254685777287e-05, - "loss": 2.2178, + "epoch": 0.18, + "grad_norm": 11.165741920471191, + "learning_rate": 1.8775885872066267e-05, + "loss": 3.3155, "step": 1464 }, { - "epoch": 0.44, - "grad_norm": 15.061570167541504, - "learning_rate": 1.7065250075172897e-05, - "loss": 2.2647, + "epoch": 0.18, + "grad_norm": 16.798688888549805, + "learning_rate": 1.877504915700958e-05, + "loss": 3.3293, "step": 1465 }, { - "epoch": 0.44, - "grad_norm": 11.610605239868164, - "learning_rate": 1.706324546456851e-05, - "loss": 2.2437, + "epoch": 0.18, + "grad_norm": 16.24199867248535, + "learning_rate": 1.8774212441952895e-05, + "loss": 3.1168, "step": 1466 }, { - "epoch": 0.44, - "grad_norm": 9.191241264343262, - "learning_rate": 1.7061240853964118e-05, - "loss": 2.011, + "epoch": 0.18, + "grad_norm": 8.398725509643555, + "learning_rate": 1.8773375726896208e-05, + "loss": 1.4716, "step": 1467 }, { - "epoch": 0.44, - "grad_norm": 29.908700942993164, - "learning_rate": 1.7059236243359728e-05, - "loss": 3.4803, + "epoch": 0.18, + "grad_norm": 12.215764045715332, + "learning_rate": 1.877253901183952e-05, + "loss": 3.7943, "step": 1468 }, { - "epoch": 0.44, - "grad_norm": 31.88568115234375, - "learning_rate": 1.7057231632755338e-05, - "loss": 1.611, + "epoch": 0.18, + "grad_norm": 30.421791076660156, + "learning_rate": 1.8771702296782832e-05, + "loss": 4.2063, "step": 1469 }, { - "epoch": 0.44, - "grad_norm": 18.888980865478516, - "learning_rate": 1.7055227022150948e-05, - "loss": 3.0609, + "epoch": 0.18, + "grad_norm": 8.843764305114746, + "learning_rate": 1.8770865581726146e-05, + "loss": 0.7433, "step": 1470 }, { - "epoch": 0.44, - "grad_norm": 15.416746139526367, - "learning_rate": 1.7053222411546558e-05, - "loss": 2.4115, + "epoch": 0.18, + "grad_norm": 14.626254081726074, + "learning_rate": 1.8770028866669456e-05, + "loss": 2.1776, "step": 1471 }, { - "epoch": 0.44, - "grad_norm": 16.257118225097656, - "learning_rate": 1.7051217800942168e-05, - "loss": 3.6617, + "epoch": 0.18, + "grad_norm": 12.44121265411377, + "learning_rate": 1.876919215161277e-05, + "loss": 3.0034, "step": 1472 }, { - "epoch": 0.44, - "grad_norm": 14.70289134979248, - "learning_rate": 1.7049213190337778e-05, - "loss": 2.0395, + "epoch": 0.18, + "grad_norm": 31.2838077545166, + "learning_rate": 1.8768355436556083e-05, + "loss": 3.8373, "step": 1473 }, { - "epoch": 0.44, - "grad_norm": 14.966999053955078, - "learning_rate": 1.7047208579733388e-05, - "loss": 2.7337, + "epoch": 0.18, + "grad_norm": 15.197158813476562, + "learning_rate": 1.8767518721499397e-05, + "loss": 3.3477, "step": 1474 }, { - "epoch": 0.44, - "grad_norm": 19.118864059448242, - "learning_rate": 1.7045203969128998e-05, - "loss": 2.5427, + "epoch": 0.19, + "grad_norm": 13.24867057800293, + "learning_rate": 1.8766682006442707e-05, + "loss": 4.5144, "step": 1475 }, { - "epoch": 0.44, - "grad_norm": 21.44363021850586, - "learning_rate": 1.704319935852461e-05, - "loss": 3.0899, + "epoch": 0.19, + "grad_norm": 8.047499656677246, + "learning_rate": 1.876584529138602e-05, + "loss": 0.7997, "step": 1476 }, { - "epoch": 0.44, - "grad_norm": 16.222488403320312, - "learning_rate": 1.704119474792022e-05, - "loss": 2.3837, + "epoch": 0.19, + "grad_norm": 12.983343124389648, + "learning_rate": 1.8765008576329335e-05, + "loss": 1.9454, "step": 1477 }, { - "epoch": 0.44, - "grad_norm": 13.13603401184082, - "learning_rate": 1.703919013731583e-05, - "loss": 2.4675, + "epoch": 0.19, + "grad_norm": 16.986835479736328, + "learning_rate": 1.8764171861272645e-05, + "loss": 2.1344, "step": 1478 }, { - "epoch": 0.44, - "grad_norm": 20.3343563079834, - "learning_rate": 1.703718552671144e-05, - "loss": 2.9424, + "epoch": 0.19, + "grad_norm": 11.096803665161133, + "learning_rate": 1.876333514621596e-05, + "loss": 2.1649, "step": 1479 }, { - "epoch": 0.44, - "grad_norm": 19.545503616333008, - "learning_rate": 1.703518091610705e-05, - "loss": 2.9953, + "epoch": 0.19, + "grad_norm": 16.026342391967773, + "learning_rate": 1.8762498431159272e-05, + "loss": 2.6944, "step": 1480 }, { - "epoch": 0.45, - "grad_norm": 18.053640365600586, - "learning_rate": 1.7033176305502655e-05, - "loss": 2.11, + "epoch": 0.19, + "grad_norm": 17.424779891967773, + "learning_rate": 1.8761661716102582e-05, + "loss": 2.2067, "step": 1481 }, { - "epoch": 0.45, - "grad_norm": 18.4859561920166, - "learning_rate": 1.703117169489827e-05, - "loss": 3.0607, + "epoch": 0.19, + "grad_norm": 22.919485092163086, + "learning_rate": 1.8760825001045896e-05, + "loss": 2.8243, "step": 1482 }, { - "epoch": 0.45, - "grad_norm": 13.820666313171387, - "learning_rate": 1.702916708429388e-05, - "loss": 2.8445, + "epoch": 0.19, + "grad_norm": 11.948049545288086, + "learning_rate": 1.8759988285989206e-05, + "loss": 3.5818, "step": 1483 }, { - "epoch": 0.45, - "grad_norm": 15.42257022857666, - "learning_rate": 1.7027162473689486e-05, - "loss": 2.8965, + "epoch": 0.19, + "grad_norm": 6.547436714172363, + "learning_rate": 1.875915157093252e-05, + "loss": 0.3148, "step": 1484 }, { - "epoch": 0.45, - "grad_norm": 27.479591369628906, - "learning_rate": 1.70251578630851e-05, - "loss": 2.9632, + "epoch": 0.19, + "grad_norm": 10.880529403686523, + "learning_rate": 1.8758314855875834e-05, + "loss": 2.3092, "step": 1485 }, { - "epoch": 0.45, - "grad_norm": 21.065073013305664, - "learning_rate": 1.7023153252480706e-05, - "loss": 2.7442, + "epoch": 0.19, + "grad_norm": 27.949281692504883, + "learning_rate": 1.8757478140819144e-05, + "loss": 3.5417, "step": 1486 }, { - "epoch": 0.45, - "grad_norm": 7.838624477386475, - "learning_rate": 1.7021148641876316e-05, - "loss": 2.0983, + "epoch": 0.19, + "grad_norm": 11.91715145111084, + "learning_rate": 1.8756641425762457e-05, + "loss": 1.8293, "step": 1487 }, { - "epoch": 0.45, - "grad_norm": 17.929758071899414, - "learning_rate": 1.7019144031271926e-05, - "loss": 2.3848, + "epoch": 0.19, + "grad_norm": 12.612098693847656, + "learning_rate": 1.875580471070577e-05, + "loss": 1.8214, "step": 1488 }, { - "epoch": 0.45, - "grad_norm": 10.071520805358887, - "learning_rate": 1.7017139420667536e-05, - "loss": 1.5338, + "epoch": 0.19, + "grad_norm": 14.226095199584961, + "learning_rate": 1.875496799564908e-05, + "loss": 2.8455, "step": 1489 }, { - "epoch": 0.45, - "grad_norm": 15.892718315124512, - "learning_rate": 1.7015134810063146e-05, - "loss": 2.5901, + "epoch": 0.19, + "grad_norm": 12.34013557434082, + "learning_rate": 1.8754131280592395e-05, + "loss": 2.5628, "step": 1490 }, { - "epoch": 0.45, - "grad_norm": 13.212607383728027, - "learning_rate": 1.7013130199458756e-05, - "loss": 3.0064, + "epoch": 0.19, + "grad_norm": 14.521828651428223, + "learning_rate": 1.875329456553571e-05, + "loss": 2.5909, "step": 1491 }, { - "epoch": 0.45, - "grad_norm": 21.533849716186523, - "learning_rate": 1.7011125588854366e-05, - "loss": 2.9632, + "epoch": 0.19, + "grad_norm": 10.721088409423828, + "learning_rate": 1.875245785047902e-05, + "loss": 3.2115, "step": 1492 }, { - "epoch": 0.45, - "grad_norm": 18.14595603942871, - "learning_rate": 1.7009120978249976e-05, - "loss": 1.7862, + "epoch": 0.19, + "grad_norm": 28.61941909790039, + "learning_rate": 1.8751621135422333e-05, + "loss": 3.6909, "step": 1493 }, { - "epoch": 0.45, - "grad_norm": 12.642379760742188, - "learning_rate": 1.7007116367645586e-05, - "loss": 2.7063, + "epoch": 0.19, + "grad_norm": 9.875435829162598, + "learning_rate": 1.8750784420365646e-05, + "loss": 1.6156, "step": 1494 }, { - "epoch": 0.45, - "grad_norm": 18.708799362182617, - "learning_rate": 1.7005111757041197e-05, - "loss": 2.7772, + "epoch": 0.19, + "grad_norm": 13.883271217346191, + "learning_rate": 1.874994770530896e-05, + "loss": 0.3683, "step": 1495 }, { - "epoch": 0.45, - "grad_norm": 29.240921020507812, - "learning_rate": 1.7003107146436807e-05, - "loss": 1.7695, + "epoch": 0.19, + "grad_norm": 46.34034729003906, + "learning_rate": 1.874911099025227e-05, + "loss": 3.1741, "step": 1496 }, { - "epoch": 0.45, - "grad_norm": 18.173198699951172, - "learning_rate": 1.7001102535832417e-05, - "loss": 2.528, + "epoch": 0.19, + "grad_norm": 17.740985870361328, + "learning_rate": 1.8748274275195584e-05, + "loss": 3.2501, "step": 1497 }, { - "epoch": 0.45, - "grad_norm": 10.611312866210938, - "learning_rate": 1.6999097925228027e-05, - "loss": 2.3301, + "epoch": 0.19, + "grad_norm": 11.90066146850586, + "learning_rate": 1.8747437560138897e-05, + "loss": 3.7449, "step": 1498 }, { - "epoch": 0.45, - "grad_norm": 17.193771362304688, - "learning_rate": 1.6997093314623637e-05, - "loss": 1.8752, + "epoch": 0.19, + "grad_norm": 26.554466247558594, + "learning_rate": 1.8746600845082208e-05, + "loss": 2.4212, "step": 1499 }, { - "epoch": 0.45, - "grad_norm": 17.25593376159668, - "learning_rate": 1.6995088704019244e-05, - "loss": 3.6294, + "epoch": 0.19, + "grad_norm": 19.78257179260254, + "learning_rate": 1.874576413002552e-05, + "loss": 4.9226, "step": 1500 }, { - "epoch": 0.45, - "grad_norm": 18.063823699951172, - "learning_rate": 1.6993084093414857e-05, - "loss": 3.0548, + "epoch": 0.19, + "grad_norm": 18.166656494140625, + "learning_rate": 1.8744927414968835e-05, + "loss": 2.0438, "step": 1501 }, { - "epoch": 0.45, - "grad_norm": 22.38393783569336, - "learning_rate": 1.6991079482810467e-05, - "loss": 2.6282, + "epoch": 0.19, + "grad_norm": 9.067051887512207, + "learning_rate": 1.874409069991215e-05, + "loss": 0.8948, "step": 1502 }, { - "epoch": 0.45, - "grad_norm": 11.480286598205566, - "learning_rate": 1.6989074872206074e-05, - "loss": 1.8335, + "epoch": 0.19, + "grad_norm": 9.333136558532715, + "learning_rate": 1.874325398485546e-05, + "loss": 1.4343, "step": 1503 }, { - "epoch": 0.45, - "grad_norm": 26.893537521362305, - "learning_rate": 1.6987070261601687e-05, - "loss": 2.7152, + "epoch": 0.19, + "grad_norm": 20.27508544921875, + "learning_rate": 1.8742417269798773e-05, + "loss": 2.5113, "step": 1504 }, { - "epoch": 0.45, - "grad_norm": 11.621118545532227, - "learning_rate": 1.6985065650997294e-05, - "loss": 2.3662, + "epoch": 0.19, + "grad_norm": 18.887235641479492, + "learning_rate": 1.8741580554742086e-05, + "loss": 2.838, "step": 1505 }, { - "epoch": 0.45, - "grad_norm": 15.238231658935547, - "learning_rate": 1.6983061040392904e-05, - "loss": 2.4623, + "epoch": 0.19, + "grad_norm": 19.680187225341797, + "learning_rate": 1.8740743839685396e-05, + "loss": 1.8419, "step": 1506 }, { - "epoch": 0.45, - "grad_norm": 20.844465255737305, - "learning_rate": 1.6981056429788514e-05, - "loss": 2.5909, + "epoch": 0.19, + "grad_norm": 9.750263214111328, + "learning_rate": 1.873990712462871e-05, + "loss": 3.3147, "step": 1507 }, { - "epoch": 0.45, - "grad_norm": 11.749519348144531, - "learning_rate": 1.6979051819184124e-05, - "loss": 1.8395, + "epoch": 0.19, + "grad_norm": 36.575958251953125, + "learning_rate": 1.8739070409572024e-05, + "loss": 2.0897, "step": 1508 }, { - "epoch": 0.45, - "grad_norm": 15.4244384765625, - "learning_rate": 1.6977047208579734e-05, - "loss": 3.7336, + "epoch": 0.19, + "grad_norm": 33.13447952270508, + "learning_rate": 1.8738233694515334e-05, + "loss": 4.701, "step": 1509 }, { - "epoch": 0.45, - "grad_norm": 11.57870864868164, - "learning_rate": 1.6975042597975344e-05, - "loss": 2.3881, + "epoch": 0.19, + "grad_norm": 10.510684967041016, + "learning_rate": 1.8737396979458648e-05, + "loss": 1.1409, "step": 1510 }, { - "epoch": 0.45, - "grad_norm": 21.28474235534668, - "learning_rate": 1.6973037987370954e-05, - "loss": 2.8733, + "epoch": 0.19, + "grad_norm": 18.95481300354004, + "learning_rate": 1.8736560264401958e-05, + "loss": 1.5948, "step": 1511 }, { - "epoch": 0.45, - "grad_norm": 13.479619026184082, - "learning_rate": 1.6971033376766565e-05, - "loss": 1.693, + "epoch": 0.19, + "grad_norm": 18.029176712036133, + "learning_rate": 1.873572354934527e-05, + "loss": 2.1726, "step": 1512 }, { - "epoch": 0.45, - "grad_norm": 17.223281860351562, - "learning_rate": 1.6969028766162175e-05, - "loss": 2.2971, + "epoch": 0.19, + "grad_norm": 13.098097801208496, + "learning_rate": 1.8734886834288585e-05, + "loss": 3.0375, "step": 1513 }, { - "epoch": 0.46, - "grad_norm": 31.056154251098633, - "learning_rate": 1.6967024155557785e-05, - "loss": 2.5655, + "epoch": 0.19, + "grad_norm": 8.929461479187012, + "learning_rate": 1.8734050119231896e-05, + "loss": 2.3063, "step": 1514 }, { - "epoch": 0.46, - "grad_norm": 60.43313980102539, - "learning_rate": 1.6965019544953395e-05, - "loss": 3.5182, + "epoch": 0.19, + "grad_norm": 16.9406795501709, + "learning_rate": 1.873321340417521e-05, + "loss": 2.5458, "step": 1515 }, { - "epoch": 0.46, - "grad_norm": 15.120966911315918, - "learning_rate": 1.6963014934349005e-05, - "loss": 2.7201, + "epoch": 0.19, + "grad_norm": 15.653889656066895, + "learning_rate": 1.8732376689118523e-05, + "loss": 3.3106, "step": 1516 }, { - "epoch": 0.46, - "grad_norm": 19.221208572387695, - "learning_rate": 1.6961010323744615e-05, - "loss": 3.3837, + "epoch": 0.19, + "grad_norm": 11.295975685119629, + "learning_rate": 1.8731539974061833e-05, + "loss": 0.8551, "step": 1517 }, { - "epoch": 0.46, - "grad_norm": 13.70907974243164, - "learning_rate": 1.6959005713140225e-05, - "loss": 1.8185, + "epoch": 0.19, + "grad_norm": 10.627366065979004, + "learning_rate": 1.8730703259005147e-05, + "loss": 3.8078, "step": 1518 }, { - "epoch": 0.46, - "grad_norm": 25.266483306884766, - "learning_rate": 1.6957001102535832e-05, - "loss": 3.0797, + "epoch": 0.19, + "grad_norm": 17.634016036987305, + "learning_rate": 1.872986654394846e-05, + "loss": 3.6376, "step": 1519 }, { - "epoch": 0.46, - "grad_norm": 16.34473419189453, - "learning_rate": 1.6954996491931445e-05, - "loss": 2.689, + "epoch": 0.19, + "grad_norm": 8.201330184936523, + "learning_rate": 1.872902982889177e-05, + "loss": 1.2747, "step": 1520 }, { - "epoch": 0.46, - "grad_norm": 21.672119140625, - "learning_rate": 1.6952991881327055e-05, - "loss": 2.4117, + "epoch": 0.19, + "grad_norm": 30.411375045776367, + "learning_rate": 1.8728193113835084e-05, + "loss": 3.7582, "step": 1521 }, { - "epoch": 0.46, - "grad_norm": 12.567007064819336, - "learning_rate": 1.6950987270722662e-05, - "loss": 2.5169, + "epoch": 0.19, + "grad_norm": 9.317015647888184, + "learning_rate": 1.8727356398778398e-05, + "loss": 2.8275, "step": 1522 }, { - "epoch": 0.46, - "grad_norm": 54.59273147583008, - "learning_rate": 1.6948982660118275e-05, - "loss": 3.4834, + "epoch": 0.19, + "grad_norm": 14.676838874816895, + "learning_rate": 1.872651968372171e-05, + "loss": 3.5349, "step": 1523 }, { - "epoch": 0.46, - "grad_norm": 18.03861427307129, - "learning_rate": 1.6946978049513882e-05, - "loss": 2.824, + "epoch": 0.19, + "grad_norm": 17.38626480102539, + "learning_rate": 1.8725682968665022e-05, + "loss": 3.3976, "step": 1524 }, { - "epoch": 0.46, - "grad_norm": 40.20027542114258, - "learning_rate": 1.6944973438909492e-05, - "loss": 2.5567, + "epoch": 0.19, + "grad_norm": 20.100765228271484, + "learning_rate": 1.8724846253608335e-05, + "loss": 3.4542, "step": 1525 }, { - "epoch": 0.46, - "grad_norm": 12.565523147583008, - "learning_rate": 1.6942968828305106e-05, - "loss": 3.1967, + "epoch": 0.19, + "grad_norm": 86.41800689697266, + "learning_rate": 1.872400953855165e-05, + "loss": 3.4961, "step": 1526 }, { - "epoch": 0.46, - "grad_norm": 12.132319450378418, - "learning_rate": 1.6940964217700712e-05, - "loss": 2.5713, + "epoch": 0.19, + "grad_norm": 11.995071411132812, + "learning_rate": 1.872317282349496e-05, + "loss": 3.2023, "step": 1527 }, { - "epoch": 0.46, - "grad_norm": 28.399211883544922, - "learning_rate": 1.6938959607096323e-05, - "loss": 2.7361, + "epoch": 0.19, + "grad_norm": 43.101261138916016, + "learning_rate": 1.8722336108438273e-05, + "loss": 1.917, "step": 1528 }, { - "epoch": 0.46, - "grad_norm": 20.132675170898438, - "learning_rate": 1.6936954996491933e-05, - "loss": 2.6361, + "epoch": 0.19, + "grad_norm": 7.935921669006348, + "learning_rate": 1.8721499393381587e-05, + "loss": 1.2631, "step": 1529 }, { - "epoch": 0.46, - "grad_norm": 19.281675338745117, - "learning_rate": 1.6934950385887543e-05, - "loss": 2.8946, + "epoch": 0.19, + "grad_norm": 14.268749237060547, + "learning_rate": 1.87206626783249e-05, + "loss": 4.7786, "step": 1530 }, { - "epoch": 0.46, - "grad_norm": 11.40804672241211, - "learning_rate": 1.6932945775283153e-05, - "loss": 2.4466, + "epoch": 0.19, + "grad_norm": 13.571290016174316, + "learning_rate": 1.871982596326821e-05, + "loss": 1.4386, "step": 1531 }, { - "epoch": 0.46, - "grad_norm": 19.21536636352539, - "learning_rate": 1.6930941164678763e-05, - "loss": 2.9093, + "epoch": 0.19, + "grad_norm": 28.266124725341797, + "learning_rate": 1.8718989248211524e-05, + "loss": 3.2817, "step": 1532 }, { - "epoch": 0.46, - "grad_norm": 13.203646659851074, - "learning_rate": 1.6928936554074373e-05, - "loss": 2.3911, + "epoch": 0.19, + "grad_norm": 10.189460754394531, + "learning_rate": 1.8718152533154838e-05, + "loss": 2.2712, "step": 1533 }, { - "epoch": 0.46, - "grad_norm": 11.949692726135254, - "learning_rate": 1.6926931943469983e-05, - "loss": 2.8228, + "epoch": 0.19, + "grad_norm": 8.047140121459961, + "learning_rate": 1.8717315818098148e-05, + "loss": 2.6007, "step": 1534 }, { - "epoch": 0.46, - "grad_norm": 14.0568208694458, - "learning_rate": 1.6924927332865593e-05, - "loss": 2.7099, + "epoch": 0.19, + "grad_norm": 12.582521438598633, + "learning_rate": 1.8716479103041462e-05, + "loss": 2.3067, "step": 1535 }, { - "epoch": 0.46, - "grad_norm": 55.63922882080078, - "learning_rate": 1.69229227222612e-05, - "loss": 2.8945, + "epoch": 0.19, + "grad_norm": 21.55182647705078, + "learning_rate": 1.8715642387984772e-05, + "loss": 3.2436, "step": 1536 }, { - "epoch": 0.46, - "grad_norm": 9.444130897521973, - "learning_rate": 1.6920918111656813e-05, - "loss": 1.714, + "epoch": 0.19, + "grad_norm": 9.425176620483398, + "learning_rate": 1.8714805672928086e-05, + "loss": 3.6856, "step": 1537 }, { - "epoch": 0.46, - "grad_norm": 12.79077434539795, - "learning_rate": 1.6918913501052423e-05, - "loss": 3.4578, + "epoch": 0.19, + "grad_norm": 15.825239181518555, + "learning_rate": 1.87139689578714e-05, + "loss": 0.7055, "step": 1538 }, { - "epoch": 0.46, - "grad_norm": 12.505064010620117, - "learning_rate": 1.691690889044803e-05, - "loss": 1.7876, + "epoch": 0.19, + "grad_norm": 11.436142921447754, + "learning_rate": 1.871313224281471e-05, + "loss": 1.6424, "step": 1539 }, { - "epoch": 0.46, - "grad_norm": 19.741012573242188, - "learning_rate": 1.6914904279843644e-05, - "loss": 2.2345, + "epoch": 0.19, + "grad_norm": 25.982975006103516, + "learning_rate": 1.8712295527758023e-05, + "loss": 2.9231, "step": 1540 }, { - "epoch": 0.46, - "grad_norm": 26.9569149017334, - "learning_rate": 1.691289966923925e-05, - "loss": 2.3699, + "epoch": 0.19, + "grad_norm": 21.5576171875, + "learning_rate": 1.8711458812701337e-05, + "loss": 2.6003, "step": 1541 }, { - "epoch": 0.46, - "grad_norm": 21.60605812072754, - "learning_rate": 1.691089505863486e-05, - "loss": 3.1069, + "epoch": 0.19, + "grad_norm": 12.502641677856445, + "learning_rate": 1.8710622097644647e-05, + "loss": 2.9228, "step": 1542 }, { - "epoch": 0.46, - "grad_norm": 14.085561752319336, - "learning_rate": 1.690889044803047e-05, - "loss": 2.5667, + "epoch": 0.19, + "grad_norm": 9.891007423400879, + "learning_rate": 1.870978538258796e-05, + "loss": 3.1422, "step": 1543 }, { - "epoch": 0.46, - "grad_norm": 19.00534439086914, - "learning_rate": 1.690688583742608e-05, - "loss": 2.816, + "epoch": 0.19, + "grad_norm": 16.10243034362793, + "learning_rate": 1.8708948667531274e-05, + "loss": 2.2476, "step": 1544 }, { - "epoch": 0.46, - "grad_norm": 14.58003044128418, - "learning_rate": 1.690488122682169e-05, - "loss": 3.1388, + "epoch": 0.19, + "grad_norm": 12.385344505310059, + "learning_rate": 1.8708111952474585e-05, + "loss": 3.9671, "step": 1545 }, { - "epoch": 0.46, - "grad_norm": 23.24366569519043, - "learning_rate": 1.69028766162173e-05, - "loss": 1.9925, + "epoch": 0.19, + "grad_norm": 22.93207359313965, + "learning_rate": 1.87072752374179e-05, + "loss": 3.9058, "step": 1546 }, { - "epoch": 0.47, - "grad_norm": 18.700740814208984, - "learning_rate": 1.690087200561291e-05, - "loss": 1.9935, + "epoch": 0.19, + "grad_norm": 11.850655555725098, + "learning_rate": 1.8706438522361212e-05, + "loss": 3.4488, "step": 1547 }, { - "epoch": 0.47, - "grad_norm": 20.79172134399414, - "learning_rate": 1.689886739500852e-05, - "loss": 3.1285, + "epoch": 0.19, + "grad_norm": 27.660612106323242, + "learning_rate": 1.8705601807304522e-05, + "loss": 3.3126, "step": 1548 }, { - "epoch": 0.47, - "grad_norm": 22.56122589111328, - "learning_rate": 1.689686278440413e-05, - "loss": 3.1273, + "epoch": 0.19, + "grad_norm": 13.83594036102295, + "learning_rate": 1.8704765092247836e-05, + "loss": 3.2726, "step": 1549 }, { - "epoch": 0.47, - "grad_norm": 12.774810791015625, - "learning_rate": 1.689485817379974e-05, - "loss": 2.8262, + "epoch": 0.19, + "grad_norm": 6.115479946136475, + "learning_rate": 1.870392837719115e-05, + "loss": 2.2482, "step": 1550 }, { - "epoch": 0.47, - "grad_norm": 16.529850006103516, - "learning_rate": 1.689285356319535e-05, - "loss": 2.3473, + "epoch": 0.19, + "grad_norm": 16.649349212646484, + "learning_rate": 1.8703091662134463e-05, + "loss": 3.2705, "step": 1551 }, { - "epoch": 0.47, - "grad_norm": 20.23556137084961, - "learning_rate": 1.689084895259096e-05, - "loss": 1.8937, + "epoch": 0.19, + "grad_norm": 26.15545654296875, + "learning_rate": 1.8702254947077774e-05, + "loss": 4.2139, "step": 1552 }, { - "epoch": 0.47, - "grad_norm": 23.965484619140625, - "learning_rate": 1.688884434198657e-05, - "loss": 2.5148, + "epoch": 0.19, + "grad_norm": 9.073019981384277, + "learning_rate": 1.8701418232021087e-05, + "loss": 1.6893, "step": 1553 }, { - "epoch": 0.47, - "grad_norm": 11.394997596740723, - "learning_rate": 1.688683973138218e-05, - "loss": 2.7078, + "epoch": 0.2, + "grad_norm": 13.878929138183594, + "learning_rate": 1.87005815169644e-05, + "loss": 2.7366, "step": 1554 }, { - "epoch": 0.47, - "grad_norm": 14.20688247680664, - "learning_rate": 1.6884835120777788e-05, - "loss": 1.9273, + "epoch": 0.2, + "grad_norm": 15.587919235229492, + "learning_rate": 1.869974480190771e-05, + "loss": 2.722, "step": 1555 }, { - "epoch": 0.47, - "grad_norm": 8.693448066711426, - "learning_rate": 1.68828305101734e-05, - "loss": 1.7773, + "epoch": 0.2, + "grad_norm": 14.054646492004395, + "learning_rate": 1.8698908086851025e-05, + "loss": 3.5987, "step": 1556 }, { - "epoch": 0.47, - "grad_norm": 14.1410493850708, - "learning_rate": 1.688082589956901e-05, - "loss": 2.5604, + "epoch": 0.2, + "grad_norm": 21.463497161865234, + "learning_rate": 1.869807137179434e-05, + "loss": 2.3904, "step": 1557 }, { - "epoch": 0.47, - "grad_norm": 29.748794555664062, - "learning_rate": 1.6878821288964618e-05, - "loss": 2.5162, + "epoch": 0.2, + "grad_norm": 13.603147506713867, + "learning_rate": 1.8697234656737652e-05, + "loss": 3.0553, "step": 1558 }, { - "epoch": 0.47, - "grad_norm": 10.367297172546387, - "learning_rate": 1.6876816678360232e-05, - "loss": 3.0466, + "epoch": 0.2, + "grad_norm": 6.39517068862915, + "learning_rate": 1.8696397941680962e-05, + "loss": 2.7938, "step": 1559 }, { - "epoch": 0.47, - "grad_norm": 16.686874389648438, - "learning_rate": 1.687481206775584e-05, - "loss": 3.304, - "step": 1560 - }, - { - "epoch": 0.47, - "eval_loss": 0.3973664343357086, - "eval_runtime": 43.4098, - "eval_samples_per_second": 34.071, - "eval_steps_per_second": 34.071, + "epoch": 0.2, + "grad_norm": 17.777820587158203, + "learning_rate": 1.8695561226624276e-05, + "loss": 3.9284, "step": 1560 }, { - "epoch": 0.47, - "grad_norm": 14.685885429382324, - "learning_rate": 1.687280745715145e-05, - "loss": 2.3338, + "epoch": 0.2, + "grad_norm": 6.8158111572265625, + "learning_rate": 1.869472451156759e-05, + "loss": 0.7039, "step": 1561 }, { - "epoch": 0.47, - "grad_norm": 20.013853073120117, - "learning_rate": 1.687080284654706e-05, - "loss": 2.3406, + "epoch": 0.2, + "grad_norm": 16.261743545532227, + "learning_rate": 1.86938877965109e-05, + "loss": 3.0032, "step": 1562 }, { - "epoch": 0.47, - "grad_norm": 14.868475914001465, - "learning_rate": 1.686879823594267e-05, - "loss": 1.8712, + "epoch": 0.2, + "grad_norm": 11.026827812194824, + "learning_rate": 1.8693051081454213e-05, + "loss": 2.5343, "step": 1563 }, { - "epoch": 0.47, - "grad_norm": 20.89446449279785, - "learning_rate": 1.686679362533828e-05, - "loss": 2.7493, + "epoch": 0.2, + "grad_norm": 17.33503532409668, + "learning_rate": 1.8692214366397524e-05, + "loss": 1.871, "step": 1564 }, { - "epoch": 0.47, - "grad_norm": 34.56494903564453, - "learning_rate": 1.686478901473389e-05, - "loss": 2.8977, + "epoch": 0.2, + "grad_norm": 11.103533744812012, + "learning_rate": 1.8691377651340837e-05, + "loss": 3.0134, "step": 1565 }, { - "epoch": 0.47, - "grad_norm": 14.068147659301758, - "learning_rate": 1.68627844041295e-05, - "loss": 2.5127, + "epoch": 0.2, + "grad_norm": 8.425973892211914, + "learning_rate": 1.869054093628415e-05, + "loss": 0.9428, "step": 1566 }, { - "epoch": 0.47, - "grad_norm": 18.625877380371094, - "learning_rate": 1.686077979352511e-05, - "loss": 2.8985, + "epoch": 0.2, + "grad_norm": 18.52934455871582, + "learning_rate": 1.868970422122746e-05, + "loss": 2.4734, "step": 1567 }, { - "epoch": 0.47, - "grad_norm": 11.287114143371582, - "learning_rate": 1.685877518292072e-05, - "loss": 1.6863, + "epoch": 0.2, + "grad_norm": 17.02704429626465, + "learning_rate": 1.8688867506170775e-05, + "loss": 3.5376, "step": 1568 }, { - "epoch": 0.47, - "grad_norm": 19.42438316345215, - "learning_rate": 1.685677057231633e-05, - "loss": 2.5056, + "epoch": 0.2, + "grad_norm": 14.429878234863281, + "learning_rate": 1.8688030791114085e-05, + "loss": 3.239, "step": 1569 }, { - "epoch": 0.47, - "grad_norm": 13.997237205505371, - "learning_rate": 1.685476596171194e-05, - "loss": 2.6668, + "epoch": 0.2, + "grad_norm": 11.238717079162598, + "learning_rate": 1.86871940760574e-05, + "loss": 1.6583, "step": 1570 }, { - "epoch": 0.47, - "grad_norm": 10.12904167175293, - "learning_rate": 1.685276135110755e-05, - "loss": 2.0919, + "epoch": 0.2, + "grad_norm": 11.936284065246582, + "learning_rate": 1.8686357361000713e-05, + "loss": 4.6975, "step": 1571 }, { - "epoch": 0.47, - "grad_norm": 23.484573364257812, - "learning_rate": 1.685075674050316e-05, - "loss": 2.9543, + "epoch": 0.2, + "grad_norm": 19.51447105407715, + "learning_rate": 1.8685520645944026e-05, + "loss": 1.3431, "step": 1572 }, { - "epoch": 0.47, - "grad_norm": 21.227386474609375, - "learning_rate": 1.684875212989877e-05, - "loss": 2.0336, + "epoch": 0.2, + "grad_norm": 17.191984176635742, + "learning_rate": 1.8684683930887336e-05, + "loss": 1.7212, "step": 1573 }, { - "epoch": 0.47, - "grad_norm": 30.29522132873535, - "learning_rate": 1.6846747519294376e-05, - "loss": 2.6453, + "epoch": 0.2, + "grad_norm": 12.201562881469727, + "learning_rate": 1.868384721583065e-05, + "loss": 3.2341, "step": 1574 }, { - "epoch": 0.47, - "grad_norm": 25.412912368774414, - "learning_rate": 1.684474290868999e-05, - "loss": 3.1794, + "epoch": 0.2, + "grad_norm": 18.343507766723633, + "learning_rate": 1.8683010500773964e-05, + "loss": 3.3134, "step": 1575 }, { - "epoch": 0.47, - "grad_norm": 26.188493728637695, - "learning_rate": 1.68427382980856e-05, - "loss": 2.9667, + "epoch": 0.2, + "grad_norm": 13.997854232788086, + "learning_rate": 1.8682173785717274e-05, + "loss": 2.8495, "step": 1576 }, { - "epoch": 0.47, - "grad_norm": 15.816750526428223, - "learning_rate": 1.6840733687481206e-05, - "loss": 2.0286, + "epoch": 0.2, + "grad_norm": 9.156922340393066, + "learning_rate": 1.8681337070660588e-05, + "loss": 0.344, "step": 1577 }, { - "epoch": 0.47, - "grad_norm": 17.47620964050293, - "learning_rate": 1.683872907687682e-05, - "loss": 3.0095, + "epoch": 0.2, + "grad_norm": 17.772275924682617, + "learning_rate": 1.86805003556039e-05, + "loss": 3.108, "step": 1578 }, { - "epoch": 0.47, - "grad_norm": 14.732656478881836, - "learning_rate": 1.6836724466272427e-05, - "loss": 3.2241, + "epoch": 0.2, + "grad_norm": 15.730620384216309, + "learning_rate": 1.8679663640547215e-05, + "loss": 3.0182, "step": 1579 }, { - "epoch": 0.48, - "grad_norm": 15.800749778747559, - "learning_rate": 1.6834719855668037e-05, - "loss": 2.4581, + "epoch": 0.2, + "grad_norm": 145.39260864257812, + "learning_rate": 1.8678826925490525e-05, + "loss": 2.7213, "step": 1580 }, { - "epoch": 0.48, - "grad_norm": 14.257530212402344, - "learning_rate": 1.683271524506365e-05, - "loss": 2.6594, + "epoch": 0.2, + "grad_norm": 15.215822219848633, + "learning_rate": 1.867799021043384e-05, + "loss": 2.2526, "step": 1581 }, { - "epoch": 0.48, - "grad_norm": 18.350427627563477, - "learning_rate": 1.6830710634459257e-05, - "loss": 2.8644, + "epoch": 0.2, + "grad_norm": 52.26152038574219, + "learning_rate": 1.8677153495377152e-05, + "loss": 2.0332, "step": 1582 }, { - "epoch": 0.48, - "grad_norm": 17.27747344970703, - "learning_rate": 1.6828706023854867e-05, - "loss": 2.8629, + "epoch": 0.2, + "grad_norm": 9.461394309997559, + "learning_rate": 1.8676316780320463e-05, + "loss": 0.6025, "step": 1583 }, { - "epoch": 0.48, - "grad_norm": 13.865232467651367, - "learning_rate": 1.6826701413250477e-05, - "loss": 2.115, + "epoch": 0.2, + "grad_norm": 14.551637649536133, + "learning_rate": 1.8675480065263776e-05, + "loss": 1.4883, "step": 1584 }, { - "epoch": 0.48, - "grad_norm": 26.224536895751953, - "learning_rate": 1.6824696802646087e-05, - "loss": 3.4931, + "epoch": 0.2, + "grad_norm": 29.688762664794922, + "learning_rate": 1.867464335020709e-05, + "loss": 2.9508, "step": 1585 }, { - "epoch": 0.48, - "grad_norm": 10.468141555786133, - "learning_rate": 1.6822692192041697e-05, - "loss": 1.9319, + "epoch": 0.2, + "grad_norm": 11.327323913574219, + "learning_rate": 1.8673806635150404e-05, + "loss": 3.3841, "step": 1586 }, { - "epoch": 0.48, - "grad_norm": 10.993379592895508, - "learning_rate": 1.6820687581437307e-05, - "loss": 2.8885, + "epoch": 0.2, + "grad_norm": 18.282663345336914, + "learning_rate": 1.8672969920093714e-05, + "loss": 3.8014, "step": 1587 }, { - "epoch": 0.48, - "grad_norm": 18.79608726501465, - "learning_rate": 1.6818682970832917e-05, - "loss": 3.4616, + "epoch": 0.2, + "grad_norm": 10.740460395812988, + "learning_rate": 1.8672133205037028e-05, + "loss": 3.5532, "step": 1588 }, { - "epoch": 0.48, - "grad_norm": 23.80633544921875, - "learning_rate": 1.6816678360228528e-05, - "loss": 3.5015, + "epoch": 0.2, + "grad_norm": 23.3198184967041, + "learning_rate": 1.8671296489980338e-05, + "loss": 3.5516, "step": 1589 }, { - "epoch": 0.48, - "grad_norm": 13.126861572265625, - "learning_rate": 1.6814673749624138e-05, - "loss": 2.2426, + "epoch": 0.2, + "grad_norm": 15.05379581451416, + "learning_rate": 1.867045977492365e-05, + "loss": 3.7878, "step": 1590 }, { - "epoch": 0.48, - "grad_norm": 18.61846923828125, - "learning_rate": 1.6812669139019748e-05, - "loss": 2.8252, + "epoch": 0.2, + "grad_norm": 19.40207290649414, + "learning_rate": 1.8669623059866965e-05, + "loss": 4.2777, "step": 1591 }, { - "epoch": 0.48, - "grad_norm": 9.052312850952148, - "learning_rate": 1.6810664528415358e-05, - "loss": 2.1735, + "epoch": 0.2, + "grad_norm": 13.312829971313477, + "learning_rate": 1.8668786344810275e-05, + "loss": 3.4725, "step": 1592 }, { - "epoch": 0.48, - "grad_norm": 9.982820510864258, - "learning_rate": 1.6808659917810964e-05, - "loss": 1.8825, + "epoch": 0.2, + "grad_norm": 10.412389755249023, + "learning_rate": 1.866794962975359e-05, + "loss": 1.704, "step": 1593 }, { - "epoch": 0.48, - "grad_norm": 18.984390258789062, - "learning_rate": 1.6806655307206578e-05, - "loss": 3.0772, + "epoch": 0.2, + "grad_norm": 13.629739761352539, + "learning_rate": 1.86671129146969e-05, + "loss": 1.4708, "step": 1594 }, { - "epoch": 0.48, - "grad_norm": 19.849212646484375, - "learning_rate": 1.6804650696602188e-05, - "loss": 2.8981, + "epoch": 0.2, + "grad_norm": 10.407073974609375, + "learning_rate": 1.8666276199640213e-05, + "loss": 1.7156, "step": 1595 }, { - "epoch": 0.48, - "grad_norm": 15.359475135803223, - "learning_rate": 1.6802646085997795e-05, - "loss": 2.7745, + "epoch": 0.2, + "grad_norm": 13.543862342834473, + "learning_rate": 1.8665439484583527e-05, + "loss": 4.2424, "step": 1596 }, { - "epoch": 0.48, - "grad_norm": 13.44256591796875, - "learning_rate": 1.6800641475393408e-05, - "loss": 2.4322, + "epoch": 0.2, + "grad_norm": 11.773777961730957, + "learning_rate": 1.8664602769526837e-05, + "loss": 1.0708, "step": 1597 }, { - "epoch": 0.48, - "grad_norm": 19.221418380737305, - "learning_rate": 1.6798636864789015e-05, - "loss": 1.8762, + "epoch": 0.2, + "grad_norm": 13.88772201538086, + "learning_rate": 1.866376605447015e-05, + "loss": 1.8847, "step": 1598 }, { - "epoch": 0.48, - "grad_norm": 20.303268432617188, - "learning_rate": 1.6796632254184625e-05, - "loss": 1.7788, + "epoch": 0.2, + "grad_norm": 32.566158294677734, + "learning_rate": 1.8662929339413464e-05, + "loss": 4.1873, "step": 1599 }, { - "epoch": 0.48, - "grad_norm": 19.06624412536621, - "learning_rate": 1.679462764358024e-05, - "loss": 3.7968, + "epoch": 0.2, + "grad_norm": 23.395357131958008, + "learning_rate": 1.8662092624356778e-05, + "loss": 3.2244, "step": 1600 }, { - "epoch": 0.48, - "grad_norm": 10.663858413696289, - "learning_rate": 1.6792623032975845e-05, - "loss": 1.1838, + "epoch": 0.2, + "eval_loss": 0.21643704175949097, + "eval_runtime": 95.2668, + "eval_samples_per_second": 37.18, + "eval_steps_per_second": 37.18, + "step": 1600 + }, + { + "epoch": 0.2, + "grad_norm": 14.963095664978027, + "learning_rate": 1.8661255909300088e-05, + "loss": 1.789, "step": 1601 }, { - "epoch": 0.48, - "grad_norm": 16.027814865112305, - "learning_rate": 1.6790618422371455e-05, - "loss": 2.7883, + "epoch": 0.2, + "grad_norm": 10.993432998657227, + "learning_rate": 1.8660419194243402e-05, + "loss": 2.4976, "step": 1602 }, { - "epoch": 0.48, - "grad_norm": 15.941658020019531, - "learning_rate": 1.6788613811767065e-05, - "loss": 2.9455, + "epoch": 0.2, + "grad_norm": 13.061546325683594, + "learning_rate": 1.8659582479186715e-05, + "loss": 2.7292, "step": 1603 }, { - "epoch": 0.48, - "grad_norm": 16.636554718017578, - "learning_rate": 1.6786609201162675e-05, - "loss": 2.8941, + "epoch": 0.2, + "grad_norm": 17.390151977539062, + "learning_rate": 1.8658745764130026e-05, + "loss": 3.6218, "step": 1604 }, { - "epoch": 0.48, - "grad_norm": 11.56485366821289, - "learning_rate": 1.6784604590558285e-05, - "loss": 2.961, + "epoch": 0.2, + "grad_norm": 15.30650520324707, + "learning_rate": 1.865790904907334e-05, + "loss": 5.6267, "step": 1605 }, { - "epoch": 0.48, - "grad_norm": 13.037918090820312, - "learning_rate": 1.6782599979953896e-05, - "loss": 2.292, + "epoch": 0.2, + "grad_norm": 19.716659545898438, + "learning_rate": 1.8657072334016653e-05, + "loss": 3.9792, "step": 1606 }, { - "epoch": 0.48, - "grad_norm": 16.472829818725586, - "learning_rate": 1.6780595369349506e-05, - "loss": 3.5172, + "epoch": 0.2, + "grad_norm": 18.685670852661133, + "learning_rate": 1.8656235618959963e-05, + "loss": 2.7784, "step": 1607 }, { - "epoch": 0.48, - "grad_norm": 21.085201263427734, - "learning_rate": 1.6778590758745116e-05, - "loss": 2.8983, + "epoch": 0.2, + "grad_norm": 12.650303840637207, + "learning_rate": 1.8655398903903277e-05, + "loss": 2.6199, "step": 1608 }, { - "epoch": 0.48, - "grad_norm": 43.585323333740234, - "learning_rate": 1.6776586148140726e-05, - "loss": 2.7387, + "epoch": 0.2, + "grad_norm": 9.990423202514648, + "learning_rate": 1.865456218884659e-05, + "loss": 1.2948, "step": 1609 }, { - "epoch": 0.48, - "grad_norm": 12.80986213684082, - "learning_rate": 1.6774581537536332e-05, - "loss": 2.1192, + "epoch": 0.2, + "grad_norm": 12.272200584411621, + "learning_rate": 1.8653725473789904e-05, + "loss": 1.7085, "step": 1610 }, { - "epoch": 0.48, - "grad_norm": 24.946006774902344, - "learning_rate": 1.6772576926931946e-05, - "loss": 3.2683, + "epoch": 0.2, + "grad_norm": 9.659276008605957, + "learning_rate": 1.8652888758733214e-05, + "loss": 3.2644, "step": 1611 }, { - "epoch": 0.48, - "grad_norm": 10.049246788024902, - "learning_rate": 1.6770572316327556e-05, - "loss": 2.5447, + "epoch": 0.2, + "grad_norm": 8.444182395935059, + "learning_rate": 1.8652052043676528e-05, + "loss": 1.3415, "step": 1612 }, { - "epoch": 0.48, - "grad_norm": 13.763422012329102, - "learning_rate": 1.6768567705723163e-05, - "loss": 2.1387, + "epoch": 0.2, + "grad_norm": 10.08949089050293, + "learning_rate": 1.8651215328619842e-05, + "loss": 1.0016, "step": 1613 }, { - "epoch": 0.49, - "grad_norm": 18.731260299682617, - "learning_rate": 1.6766563095118776e-05, - "loss": 1.9977, + "epoch": 0.2, + "grad_norm": 11.300054550170898, + "learning_rate": 1.8650378613563152e-05, + "loss": 2.5986, "step": 1614 }, { - "epoch": 0.49, - "grad_norm": 11.16000747680664, - "learning_rate": 1.6764558484514383e-05, - "loss": 1.9304, + "epoch": 0.2, + "grad_norm": 13.937424659729004, + "learning_rate": 1.8649541898506466e-05, + "loss": 1.9747, "step": 1615 }, { - "epoch": 0.49, - "grad_norm": 15.650193214416504, - "learning_rate": 1.6762553873909993e-05, - "loss": 3.0744, + "epoch": 0.2, + "grad_norm": 12.886364936828613, + "learning_rate": 1.864870518344978e-05, + "loss": 1.5003, "step": 1616 }, { - "epoch": 0.49, - "grad_norm": 19.635541915893555, - "learning_rate": 1.6760549263305603e-05, - "loss": 2.1045, + "epoch": 0.2, + "grad_norm": 14.680170059204102, + "learning_rate": 1.864786846839309e-05, + "loss": 3.1874, "step": 1617 }, { - "epoch": 0.49, - "grad_norm": 17.894126892089844, - "learning_rate": 1.6758544652701213e-05, - "loss": 2.6789, + "epoch": 0.2, + "grad_norm": 16.69063377380371, + "learning_rate": 1.8647031753336403e-05, + "loss": 3.7762, "step": 1618 }, { - "epoch": 0.49, - "grad_norm": 20.0896053314209, - "learning_rate": 1.6756540042096823e-05, - "loss": 2.6357, + "epoch": 0.2, + "grad_norm": 19.271764755249023, + "learning_rate": 1.8646195038279717e-05, + "loss": 2.1444, "step": 1619 }, { - "epoch": 0.49, - "grad_norm": 18.415935516357422, - "learning_rate": 1.6754535431492433e-05, - "loss": 2.9713, + "epoch": 0.2, + "grad_norm": 11.025174140930176, + "learning_rate": 1.8645358323223027e-05, + "loss": 2.8204, "step": 1620 }, { - "epoch": 0.49, - "grad_norm": 22.75372314453125, - "learning_rate": 1.6752530820888043e-05, - "loss": 3.3434, + "epoch": 0.2, + "grad_norm": 12.710426330566406, + "learning_rate": 1.864452160816634e-05, + "loss": 1.4955, "step": 1621 }, { - "epoch": 0.49, - "grad_norm": 20.71872901916504, - "learning_rate": 1.6750526210283654e-05, - "loss": 2.466, + "epoch": 0.2, + "grad_norm": 47.94896697998047, + "learning_rate": 1.864368489310965e-05, + "loss": 1.9602, "step": 1622 }, { - "epoch": 0.49, - "grad_norm": 15.37236499786377, - "learning_rate": 1.6748521599679264e-05, - "loss": 3.1419, + "epoch": 0.2, + "grad_norm": 13.462227821350098, + "learning_rate": 1.8642848178052965e-05, + "loss": 2.2038, "step": 1623 }, { - "epoch": 0.49, - "grad_norm": 16.39763641357422, - "learning_rate": 1.6746516989074874e-05, - "loss": 2.6307, + "epoch": 0.2, + "grad_norm": 15.98450756072998, + "learning_rate": 1.8642011462996278e-05, + "loss": 4.13, "step": 1624 }, { - "epoch": 0.49, - "grad_norm": 12.221854209899902, - "learning_rate": 1.6744512378470484e-05, - "loss": 3.121, + "epoch": 0.2, + "grad_norm": 9.845216751098633, + "learning_rate": 1.864117474793959e-05, + "loss": 0.5645, "step": 1625 }, { - "epoch": 0.49, - "grad_norm": 12.343154907226562, - "learning_rate": 1.6742507767866094e-05, - "loss": 2.0893, + "epoch": 0.2, + "grad_norm": 16.83765411376953, + "learning_rate": 1.8640338032882902e-05, + "loss": 1.9015, "step": 1626 }, { - "epoch": 0.49, - "grad_norm": 15.398106575012207, - "learning_rate": 1.6740503157261704e-05, - "loss": 2.126, + "epoch": 0.2, + "grad_norm": 21.475820541381836, + "learning_rate": 1.8639501317826216e-05, + "loss": 3.4947, "step": 1627 }, { - "epoch": 0.49, - "grad_norm": 16.43120574951172, - "learning_rate": 1.6738498546657314e-05, - "loss": 3.3734, + "epoch": 0.2, + "grad_norm": 20.425395965576172, + "learning_rate": 1.8638664602769526e-05, + "loss": 2.7783, "step": 1628 }, { - "epoch": 0.49, - "grad_norm": 11.764989852905273, - "learning_rate": 1.673649393605292e-05, - "loss": 1.5352, + "epoch": 0.2, + "grad_norm": 9.8743257522583, + "learning_rate": 1.863782788771284e-05, + "loss": 2.3828, "step": 1629 }, { - "epoch": 0.49, - "grad_norm": 24.872587203979492, - "learning_rate": 1.6734489325448534e-05, - "loss": 2.9974, + "epoch": 0.2, + "grad_norm": 9.016057014465332, + "learning_rate": 1.8636991172656153e-05, + "loss": 1.0653, "step": 1630 }, { - "epoch": 0.49, - "grad_norm": 16.981983184814453, - "learning_rate": 1.6732484714844144e-05, - "loss": 2.1175, + "epoch": 0.2, + "grad_norm": 23.619979858398438, + "learning_rate": 1.8636154457599467e-05, + "loss": 2.5996, "step": 1631 }, { - "epoch": 0.49, - "grad_norm": 20.261165618896484, - "learning_rate": 1.673048010423975e-05, - "loss": 3.5279, + "epoch": 0.2, + "grad_norm": 19.225311279296875, + "learning_rate": 1.8635317742542777e-05, + "loss": 3.6662, "step": 1632 }, { - "epoch": 0.49, - "grad_norm": 26.34723663330078, - "learning_rate": 1.6728475493635364e-05, - "loss": 3.3491, + "epoch": 0.2, + "grad_norm": 26.433652877807617, + "learning_rate": 1.863448102748609e-05, + "loss": 2.8623, "step": 1633 }, { - "epoch": 0.49, - "grad_norm": 18.659168243408203, - "learning_rate": 1.672647088303097e-05, - "loss": 2.9663, + "epoch": 0.21, + "grad_norm": 228.86788940429688, + "learning_rate": 1.8633644312429405e-05, + "loss": 3.3586, "step": 1634 }, { - "epoch": 0.49, - "grad_norm": 11.186785697937012, - "learning_rate": 1.672446627242658e-05, - "loss": 2.1397, + "epoch": 0.21, + "grad_norm": 9.119266510009766, + "learning_rate": 1.8632807597372715e-05, + "loss": 1.3587, "step": 1635 }, { - "epoch": 0.49, - "grad_norm": 9.327524185180664, - "learning_rate": 1.672246166182219e-05, - "loss": 1.744, + "epoch": 0.21, + "grad_norm": 29.663616180419922, + "learning_rate": 1.863197088231603e-05, + "loss": 4.7198, "step": 1636 }, { - "epoch": 0.49, - "grad_norm": 17.69675064086914, - "learning_rate": 1.67204570512178e-05, - "loss": 2.9865, + "epoch": 0.21, + "grad_norm": 20.09743309020996, + "learning_rate": 1.8631134167259342e-05, + "loss": 2.0186, "step": 1637 }, { - "epoch": 0.49, - "grad_norm": 21.239151000976562, - "learning_rate": 1.671845244061341e-05, - "loss": 2.8212, + "epoch": 0.21, + "grad_norm": 18.866987228393555, + "learning_rate": 1.8630297452202656e-05, + "loss": 1.6415, "step": 1638 }, { - "epoch": 0.49, - "grad_norm": 41.4120979309082, - "learning_rate": 1.671644783000902e-05, - "loss": 2.7785, + "epoch": 0.21, + "grad_norm": 9.930262565612793, + "learning_rate": 1.8629460737145966e-05, + "loss": 1.4622, "step": 1639 }, { - "epoch": 0.49, - "grad_norm": 11.790454864501953, - "learning_rate": 1.671444321940463e-05, - "loss": 2.2797, + "epoch": 0.21, + "grad_norm": 7.938800811767578, + "learning_rate": 1.862862402208928e-05, + "loss": 2.2411, "step": 1640 }, { - "epoch": 0.49, - "grad_norm": 17.779918670654297, - "learning_rate": 1.6712438608800242e-05, - "loss": 3.0112, + "epoch": 0.21, + "grad_norm": 59.838722229003906, + "learning_rate": 1.8627787307032593e-05, + "loss": 1.5835, "step": 1641 }, { - "epoch": 0.49, - "grad_norm": 16.839773178100586, - "learning_rate": 1.6710433998195852e-05, - "loss": 2.4351, + "epoch": 0.21, + "grad_norm": 14.412603378295898, + "learning_rate": 1.8626950591975904e-05, + "loss": 2.4392, "step": 1642 }, { - "epoch": 0.49, - "grad_norm": 15.281671524047852, - "learning_rate": 1.6708429387591462e-05, - "loss": 1.9823, + "epoch": 0.21, + "grad_norm": 12.059946060180664, + "learning_rate": 1.8626113876919217e-05, + "loss": 1.2654, "step": 1643 }, { - "epoch": 0.49, - "grad_norm": 19.457735061645508, - "learning_rate": 1.6706424776987072e-05, - "loss": 3.6049, + "epoch": 0.21, + "grad_norm": 9.609929084777832, + "learning_rate": 1.862527716186253e-05, + "loss": 2.3594, "step": 1644 }, { - "epoch": 0.49, - "grad_norm": 16.624021530151367, - "learning_rate": 1.6704420166382682e-05, - "loss": 3.3101, + "epoch": 0.21, + "grad_norm": 16.069400787353516, + "learning_rate": 1.862444044680584e-05, + "loss": 2.0094, "step": 1645 }, { - "epoch": 0.49, - "grad_norm": 14.08040714263916, - "learning_rate": 1.6702415555778292e-05, - "loss": 2.4127, + "epoch": 0.21, + "grad_norm": 10.690958976745605, + "learning_rate": 1.8623603731749155e-05, + "loss": 1.2511, "step": 1646 }, { - "epoch": 0.5, - "grad_norm": 23.419265747070312, - "learning_rate": 1.6700410945173902e-05, - "loss": 3.5622, + "epoch": 0.21, + "grad_norm": 25.660438537597656, + "learning_rate": 1.8622767016692465e-05, + "loss": 2.8155, "step": 1647 }, { - "epoch": 0.5, - "grad_norm": 12.655960083007812, - "learning_rate": 1.669840633456951e-05, - "loss": 1.7092, + "epoch": 0.21, + "grad_norm": 14.544229507446289, + "learning_rate": 1.862193030163578e-05, + "loss": 2.4869, "step": 1648 }, { - "epoch": 0.5, - "grad_norm": 10.199254989624023, - "learning_rate": 1.6696401723965122e-05, - "loss": 1.9251, + "epoch": 0.21, + "grad_norm": 17.626832962036133, + "learning_rate": 1.8621093586579092e-05, + "loss": 2.6804, "step": 1649 }, { - "epoch": 0.5, - "grad_norm": 30.36039161682129, - "learning_rate": 1.6694397113360732e-05, - "loss": 1.9794, + "epoch": 0.21, + "grad_norm": 19.331235885620117, + "learning_rate": 1.8620256871522403e-05, + "loss": 1.5767, "step": 1650 }, { - "epoch": 0.5, - "grad_norm": 20.610151290893555, - "learning_rate": 1.669239250275634e-05, - "loss": 2.0625, + "epoch": 0.21, + "grad_norm": 18.1611270904541, + "learning_rate": 1.8619420156465716e-05, + "loss": 1.382, "step": 1651 }, { - "epoch": 0.5, - "grad_norm": 11.706425666809082, - "learning_rate": 1.6690387892151953e-05, - "loss": 1.8749, + "epoch": 0.21, + "grad_norm": 16.357955932617188, + "learning_rate": 1.861858344140903e-05, + "loss": 3.0114, "step": 1652 }, { - "epoch": 0.5, - "grad_norm": 9.867399215698242, - "learning_rate": 1.668838328154756e-05, - "loss": 2.4526, + "epoch": 0.21, + "grad_norm": 12.59953784942627, + "learning_rate": 1.861774672635234e-05, + "loss": 2.191, "step": 1653 }, { - "epoch": 0.5, - "grad_norm": 21.617414474487305, - "learning_rate": 1.668637867094317e-05, - "loss": 4.5465, + "epoch": 0.21, + "grad_norm": 14.794685363769531, + "learning_rate": 1.8616910011295654e-05, + "loss": 3.3433, "step": 1654 }, { - "epoch": 0.5, - "grad_norm": 21.315481185913086, - "learning_rate": 1.6684374060338783e-05, - "loss": 2.3748, + "epoch": 0.21, + "grad_norm": 15.138568878173828, + "learning_rate": 1.8616073296238968e-05, + "loss": 0.7934, "step": 1655 }, { - "epoch": 0.5, - "grad_norm": 19.910005569458008, - "learning_rate": 1.668236944973439e-05, - "loss": 2.4007, + "epoch": 0.21, + "grad_norm": 16.43613052368164, + "learning_rate": 1.8615236581182278e-05, + "loss": 3.3455, "step": 1656 }, { - "epoch": 0.5, - "grad_norm": 11.719648361206055, - "learning_rate": 1.668036483913e-05, - "loss": 2.5453, + "epoch": 0.21, + "grad_norm": 24.419034957885742, + "learning_rate": 1.861439986612559e-05, + "loss": 2.7773, "step": 1657 }, { - "epoch": 0.5, - "grad_norm": 14.366843223571777, - "learning_rate": 1.667836022852561e-05, - "loss": 2.7823, + "epoch": 0.21, + "grad_norm": 16.706079483032227, + "learning_rate": 1.8613563151068905e-05, + "loss": 3.6568, "step": 1658 }, { - "epoch": 0.5, - "grad_norm": 20.43113136291504, - "learning_rate": 1.667635561792122e-05, - "loss": 3.0117, + "epoch": 0.21, + "grad_norm": 13.282275199890137, + "learning_rate": 1.861272643601222e-05, + "loss": 2.1749, "step": 1659 }, { - "epoch": 0.5, - "grad_norm": 25.725727081298828, - "learning_rate": 1.667435100731683e-05, - "loss": 1.9852, + "epoch": 0.21, + "grad_norm": 8.940580368041992, + "learning_rate": 1.861188972095553e-05, + "loss": 2.3572, "step": 1660 }, { - "epoch": 0.5, - "grad_norm": 12.890719413757324, - "learning_rate": 1.667234639671244e-05, - "loss": 2.8881, + "epoch": 0.21, + "grad_norm": 11.337531089782715, + "learning_rate": 1.8611053005898843e-05, + "loss": 3.3588, "step": 1661 }, { - "epoch": 0.5, - "grad_norm": 15.555781364440918, - "learning_rate": 1.667034178610805e-05, - "loss": 3.1708, + "epoch": 0.21, + "grad_norm": 18.37208366394043, + "learning_rate": 1.8610216290842156e-05, + "loss": 2.1123, "step": 1662 }, { - "epoch": 0.5, - "grad_norm": 13.397451400756836, - "learning_rate": 1.666833717550366e-05, - "loss": 2.0334, + "epoch": 0.21, + "grad_norm": 14.14362907409668, + "learning_rate": 1.8609379575785467e-05, + "loss": 4.3833, "step": 1663 }, { - "epoch": 0.5, - "grad_norm": 12.852067947387695, - "learning_rate": 1.666633256489927e-05, - "loss": 3.193, + "epoch": 0.21, + "grad_norm": 15.549084663391113, + "learning_rate": 1.860854286072878e-05, + "loss": 3.4541, "step": 1664 }, { - "epoch": 0.5, - "grad_norm": 17.795087814331055, - "learning_rate": 1.666432795429488e-05, - "loss": 2.1889, + "epoch": 0.21, + "grad_norm": 17.050514221191406, + "learning_rate": 1.8607706145672094e-05, + "loss": 2.1293, "step": 1665 }, { - "epoch": 0.5, - "grad_norm": 10.84477710723877, - "learning_rate": 1.666232334369049e-05, - "loss": 2.1249, + "epoch": 0.21, + "grad_norm": 106.90633392333984, + "learning_rate": 1.8606869430615408e-05, + "loss": 2.8493, "step": 1666 }, { - "epoch": 0.5, - "grad_norm": 13.947750091552734, - "learning_rate": 1.66603187330861e-05, - "loss": 2.1903, + "epoch": 0.21, + "grad_norm": 13.605634689331055, + "learning_rate": 1.8606032715558718e-05, + "loss": 1.3126, "step": 1667 }, { - "epoch": 0.5, - "grad_norm": 15.641617774963379, - "learning_rate": 1.665831412248171e-05, - "loss": 1.4892, + "epoch": 0.21, + "grad_norm": 18.58948516845703, + "learning_rate": 1.860519600050203e-05, + "loss": 3.1841, "step": 1668 }, { - "epoch": 0.5, - "grad_norm": 17.841720581054688, - "learning_rate": 1.665630951187732e-05, - "loss": 2.2156, + "epoch": 0.21, + "grad_norm": 12.607535362243652, + "learning_rate": 1.8604359285445345e-05, + "loss": 3.6446, "step": 1669 }, { - "epoch": 0.5, - "grad_norm": 21.81488800048828, - "learning_rate": 1.6654304901272927e-05, - "loss": 2.7059, + "epoch": 0.21, + "grad_norm": 5.925419330596924, + "learning_rate": 1.8603522570388655e-05, + "loss": 0.8371, "step": 1670 }, { - "epoch": 0.5, - "grad_norm": 10.093061447143555, - "learning_rate": 1.665230029066854e-05, - "loss": 2.087, + "epoch": 0.21, + "grad_norm": 13.583077430725098, + "learning_rate": 1.860268585533197e-05, + "loss": 1.5817, "step": 1671 }, { - "epoch": 0.5, - "grad_norm": 12.38084602355957, - "learning_rate": 1.6650295680064148e-05, - "loss": 2.6622, + "epoch": 0.21, + "grad_norm": 17.19635772705078, + "learning_rate": 1.8601849140275283e-05, + "loss": 4.8983, "step": 1672 }, { - "epoch": 0.5, - "grad_norm": 13.34916877746582, - "learning_rate": 1.6648291069459758e-05, - "loss": 3.5172, + "epoch": 0.21, + "grad_norm": 14.493489265441895, + "learning_rate": 1.8601012425218593e-05, + "loss": 3.0678, "step": 1673 }, { - "epoch": 0.5, - "grad_norm": 14.111736297607422, - "learning_rate": 1.664628645885537e-05, - "loss": 2.5334, + "epoch": 0.21, + "grad_norm": 10.430756568908691, + "learning_rate": 1.8600175710161907e-05, + "loss": 5.0035, "step": 1674 }, { - "epoch": 0.5, - "grad_norm": 19.57823944091797, - "learning_rate": 1.6644281848250978e-05, - "loss": 2.8923, + "epoch": 0.21, + "grad_norm": 41.852569580078125, + "learning_rate": 1.8599338995105217e-05, + "loss": 2.7717, "step": 1675 }, { - "epoch": 0.5, - "grad_norm": 20.907072067260742, - "learning_rate": 1.6642277237646588e-05, - "loss": 2.7302, + "epoch": 0.21, + "grad_norm": 26.517459869384766, + "learning_rate": 1.859850228004853e-05, + "loss": 4.5046, "step": 1676 }, { - "epoch": 0.5, - "grad_norm": 12.42391300201416, - "learning_rate": 1.6640272627042198e-05, - "loss": 2.4441, + "epoch": 0.21, + "grad_norm": 18.16859245300293, + "learning_rate": 1.8597665564991844e-05, + "loss": 4.4326, "step": 1677 }, { - "epoch": 0.5, - "grad_norm": 39.446815490722656, - "learning_rate": 1.6638268016437808e-05, - "loss": 3.3596, + "epoch": 0.21, + "grad_norm": 12.79591178894043, + "learning_rate": 1.8596828849935154e-05, + "loss": 2.8772, "step": 1678 }, { - "epoch": 0.5, - "grad_norm": 10.514243125915527, - "learning_rate": 1.6636263405833418e-05, - "loss": 1.9339, + "epoch": 0.21, + "grad_norm": 32.08021926879883, + "learning_rate": 1.8595992134878468e-05, + "loss": 1.3034, "step": 1679 }, { - "epoch": 0.51, - "grad_norm": 22.982282638549805, - "learning_rate": 1.6634258795229028e-05, - "loss": 2.6203, - "step": 1680 - }, - { - "epoch": 0.51, - "eval_loss": 0.421953409910202, - "eval_runtime": 43.6652, - "eval_samples_per_second": 33.871, - "eval_steps_per_second": 33.871, + "epoch": 0.21, + "grad_norm": 23.306074142456055, + "learning_rate": 1.859515541982178e-05, + "loss": 3.1075, "step": 1680 }, { - "epoch": 0.51, - "grad_norm": 31.43738555908203, - "learning_rate": 1.663225418462464e-05, - "loss": 2.5882, + "epoch": 0.21, + "grad_norm": 9.065752983093262, + "learning_rate": 1.8594318704765092e-05, + "loss": 2.7193, "step": 1681 }, { - "epoch": 0.51, - "grad_norm": 10.706387519836426, - "learning_rate": 1.663024957402025e-05, - "loss": 2.1867, + "epoch": 0.21, + "grad_norm": 77.59383392333984, + "learning_rate": 1.8593481989708406e-05, + "loss": 1.0788, "step": 1682 }, { - "epoch": 0.51, - "grad_norm": 15.15072250366211, - "learning_rate": 1.662824496341586e-05, - "loss": 2.9213, + "epoch": 0.21, + "grad_norm": 29.036340713500977, + "learning_rate": 1.859264527465172e-05, + "loss": 3.8931, "step": 1683 }, { - "epoch": 0.51, - "grad_norm": 9.624457359313965, - "learning_rate": 1.6626240352811465e-05, - "loss": 1.8701, + "epoch": 0.21, + "grad_norm": 20.310911178588867, + "learning_rate": 1.859180855959503e-05, + "loss": 2.2261, "step": 1684 }, { - "epoch": 0.51, - "grad_norm": 9.49343490600586, - "learning_rate": 1.662423574220708e-05, - "loss": 1.612, + "epoch": 0.21, + "grad_norm": 14.476409912109375, + "learning_rate": 1.8590971844538343e-05, + "loss": 2.332, "step": 1685 }, { - "epoch": 0.51, - "grad_norm": 28.716997146606445, - "learning_rate": 1.662223113160269e-05, - "loss": 3.3154, + "epoch": 0.21, + "grad_norm": 7.494418621063232, + "learning_rate": 1.8590135129481657e-05, + "loss": 0.7061, "step": 1686 }, { - "epoch": 0.51, - "grad_norm": 9.385660171508789, - "learning_rate": 1.66202265209983e-05, - "loss": 2.5589, + "epoch": 0.21, + "grad_norm": 8.152714729309082, + "learning_rate": 1.858929841442497e-05, + "loss": 2.0858, "step": 1687 }, { - "epoch": 0.51, - "grad_norm": 14.978555679321289, - "learning_rate": 1.661822191039391e-05, - "loss": 2.1595, + "epoch": 0.21, + "grad_norm": 9.176353454589844, + "learning_rate": 1.858846169936828e-05, + "loss": 1.212, "step": 1688 }, { - "epoch": 0.51, - "grad_norm": 18.335002899169922, - "learning_rate": 1.6616217299789516e-05, - "loss": 2.4189, + "epoch": 0.21, + "grad_norm": 23.76990509033203, + "learning_rate": 1.8587624984311594e-05, + "loss": 3.4625, "step": 1689 }, { - "epoch": 0.51, - "grad_norm": 16.582286834716797, - "learning_rate": 1.661421268918513e-05, - "loss": 3.3232, + "epoch": 0.21, + "grad_norm": 19.800321578979492, + "learning_rate": 1.8586788269254908e-05, + "loss": 1.423, "step": 1690 }, { - "epoch": 0.51, - "grad_norm": 13.659370422363281, - "learning_rate": 1.6612208078580736e-05, - "loss": 2.2705, + "epoch": 0.21, + "grad_norm": 12.460290908813477, + "learning_rate": 1.8585951554198218e-05, + "loss": 1.7099, "step": 1691 }, { - "epoch": 0.51, - "grad_norm": 13.425541877746582, - "learning_rate": 1.6610203467976346e-05, - "loss": 2.4391, + "epoch": 0.21, + "grad_norm": 11.062760353088379, + "learning_rate": 1.8585114839141532e-05, + "loss": 2.7525, "step": 1692 }, { - "epoch": 0.51, - "grad_norm": 22.200546264648438, - "learning_rate": 1.660819885737196e-05, - "loss": 2.4271, + "epoch": 0.21, + "grad_norm": 9.447924613952637, + "learning_rate": 1.8584278124084846e-05, + "loss": 2.3637, "step": 1693 }, { - "epoch": 0.51, - "grad_norm": 14.445297241210938, - "learning_rate": 1.6606194246767566e-05, - "loss": 2.3692, + "epoch": 0.21, + "grad_norm": 14.916996955871582, + "learning_rate": 1.858344140902816e-05, + "loss": 2.6405, "step": 1694 }, { - "epoch": 0.51, - "grad_norm": 14.760397911071777, - "learning_rate": 1.6604189636163176e-05, - "loss": 2.3689, + "epoch": 0.21, + "grad_norm": 10.311155319213867, + "learning_rate": 1.858260469397147e-05, + "loss": 2.2143, "step": 1695 }, { - "epoch": 0.51, - "grad_norm": 10.618377685546875, - "learning_rate": 1.6602185025558786e-05, - "loss": 2.1075, + "epoch": 0.21, + "grad_norm": 12.385705947875977, + "learning_rate": 1.8581767978914783e-05, + "loss": 3.1643, "step": 1696 }, { - "epoch": 0.51, - "grad_norm": 17.689512252807617, - "learning_rate": 1.6600180414954396e-05, - "loss": 3.0902, + "epoch": 0.21, + "grad_norm": 15.891322135925293, + "learning_rate": 1.8580931263858097e-05, + "loss": 2.2877, "step": 1697 }, { - "epoch": 0.51, - "grad_norm": 8.896835327148438, - "learning_rate": 1.6598175804350006e-05, - "loss": 2.3301, + "epoch": 0.21, + "grad_norm": 20.978282928466797, + "learning_rate": 1.8580094548801407e-05, + "loss": 1.7149, "step": 1698 }, { - "epoch": 0.51, - "grad_norm": 9.717402458190918, - "learning_rate": 1.6596171193745616e-05, - "loss": 1.9009, + "epoch": 0.21, + "grad_norm": 52.45249557495117, + "learning_rate": 1.857925783374472e-05, + "loss": 2.7727, "step": 1699 }, { - "epoch": 0.51, - "grad_norm": 16.815967559814453, - "learning_rate": 1.6594166583141227e-05, - "loss": 3.0338, + "epoch": 0.21, + "grad_norm": 16.271333694458008, + "learning_rate": 1.857842111868803e-05, + "loss": 3.4275, "step": 1700 }, { - "epoch": 0.51, - "grad_norm": 33.1158447265625, - "learning_rate": 1.6592161972536837e-05, - "loss": 3.0146, + "epoch": 0.21, + "grad_norm": 10.972915649414062, + "learning_rate": 1.8577584403631345e-05, + "loss": 3.8511, "step": 1701 }, { - "epoch": 0.51, - "grad_norm": 12.088573455810547, - "learning_rate": 1.6590157361932447e-05, - "loss": 3.0398, + "epoch": 0.21, + "grad_norm": 19.936786651611328, + "learning_rate": 1.8576747688574658e-05, + "loss": 2.3473, "step": 1702 }, { - "epoch": 0.51, - "grad_norm": 24.571584701538086, - "learning_rate": 1.6588152751328053e-05, - "loss": 3.3241, + "epoch": 0.21, + "grad_norm": 14.996315956115723, + "learning_rate": 1.857591097351797e-05, + "loss": 3.5185, "step": 1703 }, { - "epoch": 0.51, - "grad_norm": 23.808401107788086, - "learning_rate": 1.6586148140723667e-05, - "loss": 3.3574, + "epoch": 0.21, + "grad_norm": 13.889389038085938, + "learning_rate": 1.8575074258461282e-05, + "loss": 4.0015, "step": 1704 }, { - "epoch": 0.51, - "grad_norm": 20.748384475708008, - "learning_rate": 1.6584143530119277e-05, - "loss": 3.0009, + "epoch": 0.21, + "grad_norm": 15.474664688110352, + "learning_rate": 1.8574237543404592e-05, + "loss": 3.308, "step": 1705 }, { - "epoch": 0.51, - "grad_norm": 24.722551345825195, - "learning_rate": 1.6582138919514884e-05, - "loss": 2.8073, + "epoch": 0.21, + "grad_norm": 9.600517272949219, + "learning_rate": 1.8573400828347906e-05, + "loss": 4.5351, "step": 1706 }, { - "epoch": 0.51, - "grad_norm": 16.103742599487305, - "learning_rate": 1.6580134308910497e-05, - "loss": 3.2797, + "epoch": 0.21, + "grad_norm": 11.001420974731445, + "learning_rate": 1.857256411329122e-05, + "loss": 2.574, "step": 1707 }, { - "epoch": 0.51, - "grad_norm": 17.112245559692383, - "learning_rate": 1.6578129698306104e-05, - "loss": 2.8844, + "epoch": 0.21, + "grad_norm": 16.208568572998047, + "learning_rate": 1.8571727398234533e-05, + "loss": 2.8805, "step": 1708 }, { - "epoch": 0.51, - "grad_norm": 12.628554344177246, - "learning_rate": 1.6576125087701714e-05, - "loss": 2.5837, + "epoch": 0.21, + "grad_norm": 9.639992713928223, + "learning_rate": 1.8570890683177844e-05, + "loss": 2.6542, "step": 1709 }, { - "epoch": 0.51, - "grad_norm": 15.136996269226074, - "learning_rate": 1.6574120477097327e-05, - "loss": 2.8925, + "epoch": 0.21, + "grad_norm": 14.849407196044922, + "learning_rate": 1.8570053968121157e-05, + "loss": 1.8041, "step": 1710 }, { - "epoch": 0.51, - "grad_norm": 15.211194038391113, - "learning_rate": 1.6572115866492934e-05, - "loss": 2.2617, + "epoch": 0.21, + "grad_norm": 13.08426570892334, + "learning_rate": 1.856921725306447e-05, + "loss": 2.8605, "step": 1711 }, { - "epoch": 0.51, - "grad_norm": 13.138132095336914, - "learning_rate": 1.6570111255888544e-05, - "loss": 2.1384, + "epoch": 0.21, + "grad_norm": 15.084927558898926, + "learning_rate": 1.856838053800778e-05, + "loss": 2.7685, "step": 1712 }, { - "epoch": 0.52, - "grad_norm": 15.303442001342773, - "learning_rate": 1.6568106645284154e-05, - "loss": 3.7071, + "epoch": 0.21, + "grad_norm": 12.851953506469727, + "learning_rate": 1.8567543822951095e-05, + "loss": 3.5303, "step": 1713 }, { - "epoch": 0.52, - "grad_norm": 16.974143981933594, - "learning_rate": 1.6566102034679764e-05, - "loss": 2.0731, + "epoch": 0.22, + "grad_norm": 11.996140480041504, + "learning_rate": 1.856670710789441e-05, + "loss": 3.8478, "step": 1714 }, { - "epoch": 0.52, - "grad_norm": 22.324352264404297, - "learning_rate": 1.6564097424075374e-05, - "loss": 2.5875, + "epoch": 0.22, + "grad_norm": 8.756845474243164, + "learning_rate": 1.8565870392837722e-05, + "loss": 0.718, "step": 1715 }, { - "epoch": 0.52, - "grad_norm": 17.300701141357422, - "learning_rate": 1.6562092813470984e-05, - "loss": 2.9271, + "epoch": 0.22, + "grad_norm": 16.061248779296875, + "learning_rate": 1.8565033677781032e-05, + "loss": 3.183, "step": 1716 }, { - "epoch": 0.52, - "grad_norm": 20.104694366455078, - "learning_rate": 1.6560088202866595e-05, - "loss": 3.3379, + "epoch": 0.22, + "grad_norm": 12.82740592956543, + "learning_rate": 1.8564196962724346e-05, + "loss": 2.9077, "step": 1717 }, { - "epoch": 0.52, - "grad_norm": 11.9177827835083, - "learning_rate": 1.6558083592262205e-05, - "loss": 2.2438, + "epoch": 0.22, + "grad_norm": 11.336594581604004, + "learning_rate": 1.856336024766766e-05, + "loss": 3.04, "step": 1718 }, { - "epoch": 0.52, - "grad_norm": 12.473027229309082, - "learning_rate": 1.6556078981657815e-05, - "loss": 2.8623, + "epoch": 0.22, + "grad_norm": 9.98063850402832, + "learning_rate": 1.856252353261097e-05, + "loss": 3.3082, "step": 1719 }, { - "epoch": 0.52, - "grad_norm": 16.957218170166016, - "learning_rate": 1.6554074371053425e-05, - "loss": 2.7224, + "epoch": 0.22, + "grad_norm": 14.953742027282715, + "learning_rate": 1.8561686817554284e-05, + "loss": 4.0954, "step": 1720 }, { - "epoch": 0.52, - "grad_norm": 9.168384552001953, - "learning_rate": 1.6552069760449035e-05, - "loss": 1.4801, + "epoch": 0.22, + "grad_norm": 11.000082969665527, + "learning_rate": 1.8560850102497597e-05, + "loss": 2.8741, "step": 1721 }, { - "epoch": 0.52, - "grad_norm": 14.333914756774902, - "learning_rate": 1.655006514984464e-05, - "loss": 1.834, + "epoch": 0.22, + "grad_norm": 31.708345413208008, + "learning_rate": 1.856001338744091e-05, + "loss": 3.8666, "step": 1722 }, { - "epoch": 0.52, - "grad_norm": 8.387497901916504, - "learning_rate": 1.6548060539240255e-05, - "loss": 1.4313, + "epoch": 0.22, + "grad_norm": 21.968839645385742, + "learning_rate": 1.855917667238422e-05, + "loss": 1.3914, "step": 1723 }, { - "epoch": 0.52, - "grad_norm": 10.79047679901123, - "learning_rate": 1.6546055928635865e-05, - "loss": 3.2971, + "epoch": 0.22, + "grad_norm": 13.351322174072266, + "learning_rate": 1.8558339957327535e-05, + "loss": 2.0592, "step": 1724 }, { - "epoch": 0.52, - "grad_norm": 10.988299369812012, - "learning_rate": 1.6544051318031472e-05, - "loss": 2.5413, + "epoch": 0.22, + "grad_norm": 12.113936424255371, + "learning_rate": 1.855750324227085e-05, + "loss": 2.0291, "step": 1725 }, { - "epoch": 0.52, - "grad_norm": 16.327035903930664, - "learning_rate": 1.6542046707427085e-05, - "loss": 2.1365, + "epoch": 0.22, + "grad_norm": 10.533174514770508, + "learning_rate": 1.855666652721416e-05, + "loss": 2.5006, "step": 1726 }, { - "epoch": 0.52, - "grad_norm": 10.375070571899414, - "learning_rate": 1.6540042096822692e-05, - "loss": 2.7831, + "epoch": 0.22, + "grad_norm": 8.11629581451416, + "learning_rate": 1.8555829812157472e-05, + "loss": 2.5132, "step": 1727 }, { - "epoch": 0.52, - "grad_norm": 11.78280258178711, - "learning_rate": 1.6538037486218302e-05, - "loss": 2.6977, + "epoch": 0.22, + "grad_norm": 14.704581260681152, + "learning_rate": 1.8554993097100783e-05, + "loss": 3.4344, "step": 1728 }, { - "epoch": 0.52, - "grad_norm": 25.220386505126953, - "learning_rate": 1.6536032875613916e-05, - "loss": 2.6352, + "epoch": 0.22, + "grad_norm": 12.885751724243164, + "learning_rate": 1.8554156382044096e-05, + "loss": 1.8441, "step": 1729 }, { - "epoch": 0.52, - "grad_norm": 38.015220642089844, - "learning_rate": 1.6534028265009522e-05, - "loss": 2.9953, + "epoch": 0.22, + "grad_norm": 16.6527042388916, + "learning_rate": 1.855331966698741e-05, + "loss": 2.329, "step": 1730 }, { - "epoch": 0.52, - "grad_norm": 16.3960018157959, - "learning_rate": 1.6532023654405132e-05, - "loss": 2.1179, + "epoch": 0.22, + "grad_norm": 15.419515609741211, + "learning_rate": 1.855248295193072e-05, + "loss": 2.7254, "step": 1731 }, { - "epoch": 0.52, - "grad_norm": 17.71184730529785, - "learning_rate": 1.6530019043800742e-05, - "loss": 2.3667, + "epoch": 0.22, + "grad_norm": 16.190248489379883, + "learning_rate": 1.8551646236874034e-05, + "loss": 2.6196, "step": 1732 }, { - "epoch": 0.52, - "grad_norm": 11.10054874420166, - "learning_rate": 1.6528014433196353e-05, - "loss": 1.6067, + "epoch": 0.22, + "grad_norm": 18.080875396728516, + "learning_rate": 1.8550809521817344e-05, + "loss": 0.8927, "step": 1733 }, { - "epoch": 0.52, - "grad_norm": 19.382043838500977, - "learning_rate": 1.6526009822591963e-05, - "loss": 3.0277, + "epoch": 0.22, + "grad_norm": 20.11777687072754, + "learning_rate": 1.8549972806760658e-05, + "loss": 3.1772, "step": 1734 }, { - "epoch": 0.52, - "grad_norm": 64.88998413085938, - "learning_rate": 1.6524005211987573e-05, - "loss": 1.9251, + "epoch": 0.22, + "grad_norm": 9.067489624023438, + "learning_rate": 1.854913609170397e-05, + "loss": 2.5178, "step": 1735 }, { - "epoch": 0.52, - "grad_norm": 16.99798583984375, - "learning_rate": 1.6522000601383183e-05, - "loss": 2.2818, + "epoch": 0.22, + "grad_norm": 10.937751770019531, + "learning_rate": 1.8548299376647285e-05, + "loss": 3.259, "step": 1736 }, { - "epoch": 0.52, - "grad_norm": 11.27112102508545, - "learning_rate": 1.6519995990778793e-05, - "loss": 2.4658, + "epoch": 0.22, + "grad_norm": 10.825780868530273, + "learning_rate": 1.8547462661590595e-05, + "loss": 0.5879, "step": 1737 }, { - "epoch": 0.52, - "grad_norm": 16.193252563476562, - "learning_rate": 1.6517991380174403e-05, - "loss": 2.023, + "epoch": 0.22, + "grad_norm": 30.348739624023438, + "learning_rate": 1.854662594653391e-05, + "loss": 3.1827, "step": 1738 }, { - "epoch": 0.52, - "grad_norm": 11.178709983825684, - "learning_rate": 1.6515986769570013e-05, - "loss": 2.0991, + "epoch": 0.22, + "grad_norm": 19.03845977783203, + "learning_rate": 1.8545789231477223e-05, + "loss": 2.0507, "step": 1739 }, { - "epoch": 0.52, - "grad_norm": 12.193035125732422, - "learning_rate": 1.6513982158965623e-05, - "loss": 2.4835, + "epoch": 0.22, + "grad_norm": 19.138124465942383, + "learning_rate": 1.8544952516420533e-05, + "loss": 2.5662, "step": 1740 }, { - "epoch": 0.52, - "grad_norm": 11.212671279907227, - "learning_rate": 1.6511977548361233e-05, - "loss": 1.8206, + "epoch": 0.22, + "grad_norm": 17.365880966186523, + "learning_rate": 1.8544115801363846e-05, + "loss": 4.8537, "step": 1741 }, { - "epoch": 0.52, - "grad_norm": 7.858899116516113, - "learning_rate": 1.6509972937756843e-05, - "loss": 2.2607, + "epoch": 0.22, + "grad_norm": 14.248538970947266, + "learning_rate": 1.854327908630716e-05, + "loss": 2.9112, "step": 1742 }, { - "epoch": 0.52, - "grad_norm": 34.4583740234375, - "learning_rate": 1.6507968327152453e-05, - "loss": 2.8326, + "epoch": 0.22, + "grad_norm": 11.511013984680176, + "learning_rate": 1.8542442371250474e-05, + "loss": 1.631, "step": 1743 }, { - "epoch": 0.52, - "grad_norm": 15.727729797363281, - "learning_rate": 1.650596371654806e-05, - "loss": 3.1574, + "epoch": 0.22, + "grad_norm": 17.367900848388672, + "learning_rate": 1.8541605656193784e-05, + "loss": 3.3912, "step": 1744 }, { - "epoch": 0.52, - "grad_norm": 25.254718780517578, - "learning_rate": 1.6503959105943674e-05, - "loss": 3.1741, + "epoch": 0.22, + "grad_norm": 10.672541618347168, + "learning_rate": 1.8540768941137098e-05, + "loss": 2.0005, "step": 1745 }, { - "epoch": 0.52, - "grad_norm": 17.325634002685547, - "learning_rate": 1.650195449533928e-05, - "loss": 2.9572, + "epoch": 0.22, + "grad_norm": 27.745975494384766, + "learning_rate": 1.853993222608041e-05, + "loss": 3.8916, "step": 1746 }, { - "epoch": 0.53, - "grad_norm": 23.073179244995117, - "learning_rate": 1.649994988473489e-05, - "loss": 1.7773, + "epoch": 0.22, + "grad_norm": 23.153770446777344, + "learning_rate": 1.853909551102372e-05, + "loss": 2.1451, "step": 1747 }, { - "epoch": 0.53, - "grad_norm": 16.520126342773438, - "learning_rate": 1.6497945274130504e-05, - "loss": 3.4431, + "epoch": 0.22, + "grad_norm": 11.951934814453125, + "learning_rate": 1.8538258795967035e-05, + "loss": 1.9602, "step": 1748 }, { - "epoch": 0.53, - "grad_norm": 18.78542709350586, - "learning_rate": 1.649594066352611e-05, - "loss": 2.6952, + "epoch": 0.22, + "grad_norm": 12.270580291748047, + "learning_rate": 1.853742208091035e-05, + "loss": 1.9256, "step": 1749 }, { - "epoch": 0.53, - "grad_norm": 19.45966911315918, - "learning_rate": 1.649393605292172e-05, - "loss": 2.005, + "epoch": 0.22, + "grad_norm": 19.807544708251953, + "learning_rate": 1.8536585365853663e-05, + "loss": 3.9123, "step": 1750 }, { - "epoch": 0.53, - "grad_norm": 11.350322723388672, - "learning_rate": 1.649193144231733e-05, - "loss": 1.6326, + "epoch": 0.22, + "grad_norm": 13.968215942382812, + "learning_rate": 1.8535748650796973e-05, + "loss": 1.5282, "step": 1751 }, { - "epoch": 0.53, - "grad_norm": 18.4759521484375, - "learning_rate": 1.648992683171294e-05, - "loss": 2.4786, + "epoch": 0.22, + "grad_norm": 9.959256172180176, + "learning_rate": 1.8534911935740286e-05, + "loss": 2.4537, "step": 1752 }, { - "epoch": 0.53, - "grad_norm": 15.755661010742188, - "learning_rate": 1.648792222110855e-05, - "loss": 1.7824, + "epoch": 0.22, + "grad_norm": 10.980196952819824, + "learning_rate": 1.8534075220683597e-05, + "loss": 1.7011, "step": 1753 }, { - "epoch": 0.53, - "grad_norm": 46.813907623291016, - "learning_rate": 1.648591761050416e-05, - "loss": 3.5372, + "epoch": 0.22, + "grad_norm": 13.64299201965332, + "learning_rate": 1.853323850562691e-05, + "loss": 2.0559, "step": 1754 }, { - "epoch": 0.53, - "grad_norm": 11.6588134765625, - "learning_rate": 1.648391299989977e-05, - "loss": 1.7987, + "epoch": 0.22, + "grad_norm": 16.080829620361328, + "learning_rate": 1.8532401790570224e-05, + "loss": 3.1677, "step": 1755 }, { - "epoch": 0.53, - "grad_norm": 15.711779594421387, - "learning_rate": 1.648190838929538e-05, - "loss": 2.7252, + "epoch": 0.22, + "grad_norm": 16.70003890991211, + "learning_rate": 1.8531565075513534e-05, + "loss": 2.5473, "step": 1756 }, { - "epoch": 0.53, - "grad_norm": 9.573955535888672, - "learning_rate": 1.647990377869099e-05, - "loss": 2.4631, + "epoch": 0.22, + "grad_norm": 12.727276802062988, + "learning_rate": 1.8530728360456848e-05, + "loss": 3.6629, "step": 1757 }, { - "epoch": 0.53, - "grad_norm": 9.92371940612793, - "learning_rate": 1.64778991680866e-05, - "loss": 2.0796, + "epoch": 0.22, + "grad_norm": 15.42647933959961, + "learning_rate": 1.8529891645400158e-05, + "loss": 3.0492, "step": 1758 }, { - "epoch": 0.53, - "grad_norm": 26.95187759399414, - "learning_rate": 1.647589455748221e-05, - "loss": 2.8893, + "epoch": 0.22, + "grad_norm": 76.11067962646484, + "learning_rate": 1.8529054930343472e-05, + "loss": 0.5017, "step": 1759 }, { - "epoch": 0.53, - "grad_norm": 9.273636817932129, - "learning_rate": 1.647388994687782e-05, - "loss": 2.0131, + "epoch": 0.22, + "grad_norm": 10.16500186920166, + "learning_rate": 1.8528218215286785e-05, + "loss": 3.383, "step": 1760 }, { - "epoch": 0.53, - "grad_norm": 19.220373153686523, - "learning_rate": 1.647188533627343e-05, - "loss": 3.017, + "epoch": 0.22, + "grad_norm": 12.610782623291016, + "learning_rate": 1.8527381500230096e-05, + "loss": 2.814, "step": 1761 }, { - "epoch": 0.53, - "grad_norm": 34.35186767578125, - "learning_rate": 1.646988072566904e-05, - "loss": 3.3928, + "epoch": 0.22, + "grad_norm": 11.346853256225586, + "learning_rate": 1.852654478517341e-05, + "loss": 1.7856, "step": 1762 }, { - "epoch": 0.53, - "grad_norm": 36.41823959350586, - "learning_rate": 1.6467876115064648e-05, - "loss": 3.6049, + "epoch": 0.22, + "grad_norm": 7.411301612854004, + "learning_rate": 1.8525708070116723e-05, + "loss": 1.8628, "step": 1763 }, { - "epoch": 0.53, - "grad_norm": 14.849743843078613, - "learning_rate": 1.6465871504460262e-05, - "loss": 1.9525, + "epoch": 0.22, + "grad_norm": 14.970037460327148, + "learning_rate": 1.8524871355060037e-05, + "loss": 1.8636, "step": 1764 }, { - "epoch": 0.53, - "grad_norm": 20.26209259033203, - "learning_rate": 1.646386689385587e-05, - "loss": 2.2319, + "epoch": 0.22, + "grad_norm": 24.586057662963867, + "learning_rate": 1.8524034640003347e-05, + "loss": 3.4602, "step": 1765 }, { - "epoch": 0.53, - "grad_norm": 24.784208297729492, - "learning_rate": 1.646186228325148e-05, - "loss": 2.5943, + "epoch": 0.22, + "grad_norm": 6.546807289123535, + "learning_rate": 1.852319792494666e-05, + "loss": 0.668, "step": 1766 }, { - "epoch": 0.53, - "grad_norm": 8.97102165222168, - "learning_rate": 1.6459857672647092e-05, - "loss": 2.0995, + "epoch": 0.22, + "grad_norm": 11.386066436767578, + "learning_rate": 1.8522361209889974e-05, + "loss": 1.7019, "step": 1767 }, { - "epoch": 0.53, - "grad_norm": 12.025717735290527, - "learning_rate": 1.64578530620427e-05, - "loss": 1.3348, + "epoch": 0.22, + "grad_norm": 15.573068618774414, + "learning_rate": 1.8521524494833284e-05, + "loss": 3.1102, "step": 1768 }, { - "epoch": 0.53, - "grad_norm": 22.216552734375, - "learning_rate": 1.645584845143831e-05, - "loss": 2.9476, + "epoch": 0.22, + "grad_norm": 18.69664192199707, + "learning_rate": 1.8520687779776598e-05, + "loss": 1.7954, "step": 1769 }, { - "epoch": 0.53, - "grad_norm": 16.27435874938965, - "learning_rate": 1.645384384083392e-05, - "loss": 2.9931, + "epoch": 0.22, + "grad_norm": 9.728567123413086, + "learning_rate": 1.8519851064719912e-05, + "loss": 1.1618, "step": 1770 }, { - "epoch": 0.53, - "grad_norm": 23.192441940307617, - "learning_rate": 1.645183923022953e-05, - "loss": 3.2747, + "epoch": 0.22, + "grad_norm": 19.483200073242188, + "learning_rate": 1.8519014349663225e-05, + "loss": 3.4606, "step": 1771 }, { - "epoch": 0.53, - "grad_norm": 26.984561920166016, - "learning_rate": 1.644983461962514e-05, - "loss": 3.2546, + "epoch": 0.22, + "grad_norm": 14.560922622680664, + "learning_rate": 1.8518177634606536e-05, + "loss": 3.6956, "step": 1772 }, { - "epoch": 0.53, - "grad_norm": 16.29502296447754, - "learning_rate": 1.644783000902075e-05, - "loss": 2.9974, + "epoch": 0.22, + "grad_norm": 24.239315032958984, + "learning_rate": 1.851734091954985e-05, + "loss": 1.1153, "step": 1773 }, { - "epoch": 0.53, - "grad_norm": 15.848845481872559, - "learning_rate": 1.644582539841636e-05, - "loss": 3.1271, + "epoch": 0.22, + "grad_norm": 15.974748611450195, + "learning_rate": 1.8516504204493163e-05, + "loss": 3.7795, "step": 1774 }, { - "epoch": 0.53, - "grad_norm": 34.28373336791992, - "learning_rate": 1.644382078781197e-05, - "loss": 3.1274, + "epoch": 0.22, + "grad_norm": 12.274819374084473, + "learning_rate": 1.8515667489436473e-05, + "loss": 2.016, "step": 1775 }, { - "epoch": 0.53, - "grad_norm": 18.812297821044922, - "learning_rate": 1.644181617720758e-05, - "loss": 2.0213, + "epoch": 0.22, + "grad_norm": 8.790360450744629, + "learning_rate": 1.8514830774379787e-05, + "loss": 2.2788, "step": 1776 }, { - "epoch": 0.53, - "grad_norm": 23.698867797851562, - "learning_rate": 1.6439811566603186e-05, - "loss": 3.1921, + "epoch": 0.22, + "grad_norm": 12.69150447845459, + "learning_rate": 1.85139940593231e-05, + "loss": 1.2259, "step": 1777 }, { - "epoch": 0.53, - "grad_norm": 8.073270797729492, - "learning_rate": 1.64378069559988e-05, - "loss": 2.1989, + "epoch": 0.22, + "grad_norm": 12.652657508850098, + "learning_rate": 1.8513157344266414e-05, + "loss": 2.6977, "step": 1778 }, { - "epoch": 0.53, - "grad_norm": 18.90115737915039, - "learning_rate": 1.643580234539441e-05, - "loss": 3.2046, + "epoch": 0.22, + "grad_norm": 182.5238037109375, + "learning_rate": 1.8512320629209724e-05, + "loss": 2.9445, "step": 1779 }, { - "epoch": 0.54, - "grad_norm": 12.170257568359375, - "learning_rate": 1.6433797734790016e-05, - "loss": 3.0221, + "epoch": 0.22, + "grad_norm": 14.875434875488281, + "learning_rate": 1.8511483914153038e-05, + "loss": 1.0557, "step": 1780 }, { - "epoch": 0.54, - "grad_norm": 19.89399528503418, - "learning_rate": 1.643179312418563e-05, - "loss": 3.4413, + "epoch": 0.22, + "grad_norm": 15.446181297302246, + "learning_rate": 1.851064719909635e-05, + "loss": 2.3366, "step": 1781 }, { - "epoch": 0.54, - "grad_norm": 14.685297012329102, - "learning_rate": 1.6429788513581237e-05, - "loss": 2.2774, + "epoch": 0.22, + "grad_norm": 10.390913963317871, + "learning_rate": 1.8509810484039662e-05, + "loss": 1.2231, "step": 1782 }, { - "epoch": 0.54, - "grad_norm": 29.041440963745117, - "learning_rate": 1.6427783902976847e-05, - "loss": 2.4153, + "epoch": 0.22, + "grad_norm": 12.480498313903809, + "learning_rate": 1.8508973768982976e-05, + "loss": 1.8289, "step": 1783 }, { - "epoch": 0.54, - "grad_norm": 13.65660285949707, - "learning_rate": 1.642577929237246e-05, - "loss": 2.6716, + "epoch": 0.22, + "grad_norm": 8.062909126281738, + "learning_rate": 1.8508137053926286e-05, + "loss": 2.944, "step": 1784 }, { - "epoch": 0.54, - "grad_norm": 11.705273628234863, - "learning_rate": 1.6423774681768067e-05, - "loss": 1.8001, + "epoch": 0.22, + "grad_norm": 27.470788955688477, + "learning_rate": 1.85073003388696e-05, + "loss": 3.3835, "step": 1785 }, { - "epoch": 0.54, - "grad_norm": 25.235740661621094, - "learning_rate": 1.6421770071163677e-05, - "loss": 3.1693, + "epoch": 0.22, + "grad_norm": 15.176369667053223, + "learning_rate": 1.850646362381291e-05, + "loss": 2.6695, "step": 1786 }, { - "epoch": 0.54, - "grad_norm": 15.884099960327148, - "learning_rate": 1.6419765460559287e-05, - "loss": 2.4285, + "epoch": 0.22, + "grad_norm": 16.90056037902832, + "learning_rate": 1.8505626908756223e-05, + "loss": 2.991, "step": 1787 }, { - "epoch": 0.54, - "grad_norm": 52.04803466796875, - "learning_rate": 1.6417760849954897e-05, - "loss": 4.0988, + "epoch": 0.22, + "grad_norm": 13.010893821716309, + "learning_rate": 1.8504790193699537e-05, + "loss": 3.5261, "step": 1788 }, { - "epoch": 0.54, - "grad_norm": 42.220130920410156, - "learning_rate": 1.6415756239350507e-05, - "loss": 2.3157, + "epoch": 0.22, + "grad_norm": 31.022720336914062, + "learning_rate": 1.8503953478642847e-05, + "loss": 3.7701, "step": 1789 }, { - "epoch": 0.54, - "grad_norm": 19.528057098388672, - "learning_rate": 1.6413751628746117e-05, - "loss": 2.153, + "epoch": 0.22, + "grad_norm": 8.040053367614746, + "learning_rate": 1.850311676358616e-05, + "loss": 0.5569, "step": 1790 }, { - "epoch": 0.54, - "grad_norm": 10.022256851196289, - "learning_rate": 1.6411747018141727e-05, - "loss": 2.6832, + "epoch": 0.22, + "grad_norm": 5.776892185211182, + "learning_rate": 1.8502280048529475e-05, + "loss": 0.5414, "step": 1791 }, { - "epoch": 0.54, - "grad_norm": 14.05746841430664, - "learning_rate": 1.6409742407537337e-05, - "loss": 2.6188, + "epoch": 0.22, + "grad_norm": 10.185456275939941, + "learning_rate": 1.850144333347279e-05, + "loss": 2.0107, "step": 1792 }, { - "epoch": 0.54, - "grad_norm": 17.109386444091797, - "learning_rate": 1.6407737796932947e-05, - "loss": 2.4536, + "epoch": 0.23, + "grad_norm": 12.830659866333008, + "learning_rate": 1.85006066184161e-05, + "loss": 2.1403, "step": 1793 }, { - "epoch": 0.54, - "grad_norm": 14.443997383117676, - "learning_rate": 1.6405733186328558e-05, - "loss": 2.4826, + "epoch": 0.23, + "grad_norm": 26.65049934387207, + "learning_rate": 1.8499769903359412e-05, + "loss": 2.5158, "step": 1794 }, { - "epoch": 0.54, - "grad_norm": 29.712779998779297, - "learning_rate": 1.6403728575724168e-05, - "loss": 3.0167, + "epoch": 0.23, + "grad_norm": 8.251920700073242, + "learning_rate": 1.8498933188302726e-05, + "loss": 2.5884, "step": 1795 }, { - "epoch": 0.54, - "grad_norm": 19.298267364501953, - "learning_rate": 1.6401723965119778e-05, - "loss": 2.9807, + "epoch": 0.23, + "grad_norm": 14.79460620880127, + "learning_rate": 1.8498096473246036e-05, + "loss": 2.9623, "step": 1796 }, { - "epoch": 0.54, - "grad_norm": 15.033485412597656, - "learning_rate": 1.6399719354515388e-05, - "loss": 2.6656, + "epoch": 0.23, + "grad_norm": 11.604616165161133, + "learning_rate": 1.849725975818935e-05, + "loss": 1.7841, "step": 1797 }, { - "epoch": 0.54, - "grad_norm": 16.12212371826172, - "learning_rate": 1.6397714743910998e-05, - "loss": 2.3691, + "epoch": 0.23, + "grad_norm": 13.62449836730957, + "learning_rate": 1.8496423043132663e-05, + "loss": 2.9062, "step": 1798 }, { - "epoch": 0.54, - "grad_norm": 26.953134536743164, - "learning_rate": 1.6395710133306605e-05, - "loss": 3.2797, + "epoch": 0.23, + "grad_norm": 21.44681167602539, + "learning_rate": 1.8495586328075977e-05, + "loss": 3.5239, "step": 1799 }, { - "epoch": 0.54, - "grad_norm": 9.630372047424316, - "learning_rate": 1.6393705522702218e-05, - "loss": 1.7721, - "step": 1800 - }, - { - "epoch": 0.54, - "eval_loss": 0.3416885435581207, - "eval_runtime": 43.4913, - "eval_samples_per_second": 34.007, - "eval_steps_per_second": 34.007, + "epoch": 0.23, + "grad_norm": 10.320944786071777, + "learning_rate": 1.8494749613019287e-05, + "loss": 2.2619, "step": 1800 }, { - "epoch": 0.54, - "grad_norm": 18.797924041748047, - "learning_rate": 1.6391700912097825e-05, - "loss": 1.5968, + "epoch": 0.23, + "grad_norm": 9.892986297607422, + "learning_rate": 1.84939128979626e-05, + "loss": 1.4009, "step": 1801 }, { - "epoch": 0.54, - "grad_norm": 31.31778907775879, - "learning_rate": 1.6389696301493435e-05, - "loss": 2.9072, + "epoch": 0.23, + "grad_norm": 7.621852397918701, + "learning_rate": 1.8493076182905915e-05, + "loss": 0.2668, "step": 1802 }, { - "epoch": 0.54, - "grad_norm": 14.117547035217285, - "learning_rate": 1.6387691690889048e-05, - "loss": 2.4949, + "epoch": 0.23, + "grad_norm": 10.474602699279785, + "learning_rate": 1.8492239467849225e-05, + "loss": 3.0311, "step": 1803 }, { - "epoch": 0.54, - "grad_norm": 12.579538345336914, - "learning_rate": 1.6385687080284655e-05, - "loss": 1.8951, + "epoch": 0.23, + "grad_norm": 13.508172988891602, + "learning_rate": 1.849140275279254e-05, + "loss": 3.6226, "step": 1804 }, { - "epoch": 0.54, - "grad_norm": 23.75373077392578, - "learning_rate": 1.6383682469680265e-05, - "loss": 1.9489, + "epoch": 0.23, + "grad_norm": 19.923870086669922, + "learning_rate": 1.8490566037735852e-05, + "loss": 2.9188, "step": 1805 }, { - "epoch": 0.54, - "grad_norm": 13.660398483276367, - "learning_rate": 1.6381677859075875e-05, - "loss": 1.6119, + "epoch": 0.23, + "grad_norm": 7.190248489379883, + "learning_rate": 1.8489729322679162e-05, + "loss": 1.7364, "step": 1806 }, { - "epoch": 0.54, - "grad_norm": 60.78506851196289, - "learning_rate": 1.6379673248471485e-05, - "loss": 2.7651, + "epoch": 0.23, + "grad_norm": 13.450784683227539, + "learning_rate": 1.8488892607622476e-05, + "loss": 1.7814, "step": 1807 }, { - "epoch": 0.54, - "grad_norm": 14.931727409362793, - "learning_rate": 1.6377668637867095e-05, - "loss": 2.2028, + "epoch": 0.23, + "grad_norm": 12.926718711853027, + "learning_rate": 1.848805589256579e-05, + "loss": 2.7234, "step": 1808 }, { - "epoch": 0.54, - "grad_norm": 10.574440956115723, - "learning_rate": 1.6375664027262705e-05, - "loss": 2.6472, + "epoch": 0.23, + "grad_norm": 6.936464309692383, + "learning_rate": 1.84872191775091e-05, + "loss": 0.7802, "step": 1809 }, { - "epoch": 0.54, - "grad_norm": 14.71934986114502, - "learning_rate": 1.6373659416658315e-05, - "loss": 2.3538, + "epoch": 0.23, + "grad_norm": 27.081253051757812, + "learning_rate": 1.8486382462452414e-05, + "loss": 3.1689, "step": 1810 }, { - "epoch": 0.54, - "grad_norm": 16.00834846496582, - "learning_rate": 1.6371654806053926e-05, - "loss": 2.8576, + "epoch": 0.23, + "grad_norm": 14.529179573059082, + "learning_rate": 1.8485545747395724e-05, + "loss": 3.096, "step": 1811 }, { - "epoch": 0.54, - "grad_norm": 15.810260772705078, - "learning_rate": 1.6369650195449536e-05, - "loss": 2.9982, + "epoch": 0.23, + "grad_norm": 11.196646690368652, + "learning_rate": 1.8484709032339038e-05, + "loss": 2.3944, "step": 1812 }, { - "epoch": 0.55, - "grad_norm": 29.83978843688965, - "learning_rate": 1.6367645584845146e-05, - "loss": 3.0221, + "epoch": 0.23, + "grad_norm": 20.353017807006836, + "learning_rate": 1.848387231728235e-05, + "loss": 2.2399, "step": 1813 }, { - "epoch": 0.55, - "grad_norm": 16.944915771484375, - "learning_rate": 1.6365640974240756e-05, - "loss": 2.8306, + "epoch": 0.23, + "grad_norm": 11.980401039123535, + "learning_rate": 1.848303560222566e-05, + "loss": 1.1062, "step": 1814 }, { - "epoch": 0.55, - "grad_norm": 22.454910278320312, - "learning_rate": 1.6363636363636366e-05, - "loss": 2.3444, + "epoch": 0.23, + "grad_norm": 8.730816841125488, + "learning_rate": 1.8482198887168975e-05, + "loss": 1.8447, "step": 1815 }, { - "epoch": 0.55, - "grad_norm": 16.41376495361328, - "learning_rate": 1.6361631753031976e-05, - "loss": 1.8047, + "epoch": 0.23, + "grad_norm": 23.536231994628906, + "learning_rate": 1.848136217211229e-05, + "loss": 4.1924, "step": 1816 }, { - "epoch": 0.55, - "grad_norm": 4576.21875, - "learning_rate": 1.6359627142427586e-05, - "loss": 2.2836, + "epoch": 0.23, + "grad_norm": 22.22958755493164, + "learning_rate": 1.84805254570556e-05, + "loss": 1.1769, "step": 1817 }, { - "epoch": 0.55, - "grad_norm": 15.997746467590332, - "learning_rate": 1.6357622531823193e-05, - "loss": 2.5703, + "epoch": 0.23, + "grad_norm": 12.718234062194824, + "learning_rate": 1.8479688741998913e-05, + "loss": 2.6205, "step": 1818 }, { - "epoch": 0.55, - "grad_norm": 15.037249565124512, - "learning_rate": 1.6355617921218806e-05, - "loss": 3.2597, + "epoch": 0.23, + "grad_norm": 8.55314826965332, + "learning_rate": 1.8478852026942226e-05, + "loss": 0.3194, "step": 1819 }, { - "epoch": 0.55, - "grad_norm": 13.758872032165527, - "learning_rate": 1.6353613310614413e-05, - "loss": 3.4616, + "epoch": 0.23, + "grad_norm": 20.39598846435547, + "learning_rate": 1.847801531188554e-05, + "loss": 1.7185, "step": 1820 }, { - "epoch": 0.55, - "grad_norm": 10.099397659301758, - "learning_rate": 1.6351608700010023e-05, - "loss": 1.96, + "epoch": 0.23, + "grad_norm": 13.92740535736084, + "learning_rate": 1.847717859682885e-05, + "loss": 1.4368, "step": 1821 }, { - "epoch": 0.55, - "grad_norm": 32.01512908935547, - "learning_rate": 1.6349604089405636e-05, - "loss": 4.0462, + "epoch": 0.23, + "grad_norm": 10.905973434448242, + "learning_rate": 1.8476341881772164e-05, + "loss": 1.3227, "step": 1822 }, { - "epoch": 0.55, - "grad_norm": 17.568777084350586, - "learning_rate": 1.6347599478801243e-05, - "loss": 2.5459, + "epoch": 0.23, + "grad_norm": 17.807950973510742, + "learning_rate": 1.8475505166715478e-05, + "loss": 2.9048, "step": 1823 }, { - "epoch": 0.55, - "grad_norm": 20.525028228759766, - "learning_rate": 1.6345594868196853e-05, - "loss": 2.6286, + "epoch": 0.23, + "grad_norm": 18.465328216552734, + "learning_rate": 1.8474668451658788e-05, + "loss": 1.9646, "step": 1824 }, { - "epoch": 0.55, - "grad_norm": 18.882827758789062, - "learning_rate": 1.6343590257592463e-05, - "loss": 2.5098, + "epoch": 0.23, + "grad_norm": 13.115753173828125, + "learning_rate": 1.84738317366021e-05, + "loss": 3.1957, "step": 1825 }, { - "epoch": 0.55, - "grad_norm": 10.823426246643066, - "learning_rate": 1.6341585646988073e-05, - "loss": 1.6721, + "epoch": 0.23, + "grad_norm": 11.76266860961914, + "learning_rate": 1.8472995021545415e-05, + "loss": 1.2549, "step": 1826 }, { - "epoch": 0.55, - "grad_norm": 14.602689743041992, - "learning_rate": 1.6339581036383684e-05, - "loss": 2.6786, + "epoch": 0.23, + "grad_norm": 19.048133850097656, + "learning_rate": 1.847215830648873e-05, + "loss": 2.3595, "step": 1827 }, { - "epoch": 0.55, - "grad_norm": 17.44394302368164, - "learning_rate": 1.6337576425779294e-05, - "loss": 2.5762, + "epoch": 0.23, + "grad_norm": 11.120207786560059, + "learning_rate": 1.847132159143204e-05, + "loss": 2.949, "step": 1828 }, { - "epoch": 0.55, - "grad_norm": 19.712026596069336, - "learning_rate": 1.6335571815174904e-05, - "loss": 2.6143, + "epoch": 0.23, + "grad_norm": 7.369383335113525, + "learning_rate": 1.8470484876375353e-05, + "loss": 2.196, "step": 1829 }, { - "epoch": 0.55, - "grad_norm": 25.916528701782227, - "learning_rate": 1.6333567204570514e-05, - "loss": 2.5159, + "epoch": 0.23, + "grad_norm": 9.80823040008545, + "learning_rate": 1.8469648161318666e-05, + "loss": 3.9954, "step": 1830 }, { - "epoch": 0.55, - "grad_norm": 17.429548263549805, - "learning_rate": 1.6331562593966124e-05, - "loss": 3.5722, + "epoch": 0.23, + "grad_norm": 9.584776878356934, + "learning_rate": 1.8468811446261977e-05, + "loss": 1.239, "step": 1831 }, { - "epoch": 0.55, - "grad_norm": 14.735082626342773, - "learning_rate": 1.6329557983361734e-05, - "loss": 2.0687, + "epoch": 0.23, + "grad_norm": 13.136709213256836, + "learning_rate": 1.846797473120529e-05, + "loss": 2.052, "step": 1832 }, { - "epoch": 0.55, - "grad_norm": 19.132709503173828, - "learning_rate": 1.6327553372757344e-05, - "loss": 2.9178, + "epoch": 0.23, + "grad_norm": 11.339838981628418, + "learning_rate": 1.8467138016148604e-05, + "loss": 1.2994, "step": 1833 }, { - "epoch": 0.55, - "grad_norm": 20.842416763305664, - "learning_rate": 1.6325548762152954e-05, - "loss": 3.3049, + "epoch": 0.23, + "grad_norm": 27.841861724853516, + "learning_rate": 1.8466301301091914e-05, + "loss": 3.0134, "step": 1834 }, { - "epoch": 0.55, - "grad_norm": 14.198040008544922, - "learning_rate": 1.6323544151548564e-05, - "loss": 2.8631, + "epoch": 0.23, + "grad_norm": 13.050997734069824, + "learning_rate": 1.8465464586035228e-05, + "loss": 3.0181, "step": 1835 }, { - "epoch": 0.55, - "grad_norm": 12.003619194030762, - "learning_rate": 1.6321539540944174e-05, - "loss": 2.0466, + "epoch": 0.23, + "grad_norm": 75.58671569824219, + "learning_rate": 1.846462787097854e-05, + "loss": 3.5269, "step": 1836 }, { - "epoch": 0.55, - "grad_norm": 14.629396438598633, - "learning_rate": 1.631953493033978e-05, - "loss": 2.967, + "epoch": 0.23, + "grad_norm": 10.211413383483887, + "learning_rate": 1.8463791155921852e-05, + "loss": 2.2705, "step": 1837 }, { - "epoch": 0.55, - "grad_norm": 15.829573631286621, - "learning_rate": 1.6317530319735394e-05, - "loss": 2.1286, + "epoch": 0.23, + "grad_norm": 14.928301811218262, + "learning_rate": 1.8462954440865165e-05, + "loss": 2.5123, "step": 1838 }, { - "epoch": 0.55, - "grad_norm": 14.251692771911621, - "learning_rate": 1.6315525709131e-05, - "loss": 2.4073, + "epoch": 0.23, + "grad_norm": 33.32582473754883, + "learning_rate": 1.8462117725808476e-05, + "loss": 3.597, "step": 1839 }, { - "epoch": 0.55, - "grad_norm": 25.430139541625977, - "learning_rate": 1.631352109852661e-05, - "loss": 2.9292, + "epoch": 0.23, + "grad_norm": 33.28379440307617, + "learning_rate": 1.846128101075179e-05, + "loss": 4.2034, "step": 1840 }, { - "epoch": 0.55, - "grad_norm": 27.490047454833984, - "learning_rate": 1.6311516487922225e-05, - "loss": 3.4921, + "epoch": 0.23, + "grad_norm": 8.760516166687012, + "learning_rate": 1.8460444295695103e-05, + "loss": 2.4341, "step": 1841 }, { - "epoch": 0.55, - "grad_norm": 9.101457595825195, - "learning_rate": 1.630951187731783e-05, - "loss": 2.1141, + "epoch": 0.23, + "grad_norm": 11.161774635314941, + "learning_rate": 1.8459607580638413e-05, + "loss": 2.317, "step": 1842 }, { - "epoch": 0.55, - "grad_norm": 12.929884910583496, - "learning_rate": 1.630750726671344e-05, - "loss": 1.934, + "epoch": 0.23, + "grad_norm": 9.371776580810547, + "learning_rate": 1.8458770865581727e-05, + "loss": 2.9491, "step": 1843 }, { - "epoch": 0.55, - "grad_norm": 11.764657020568848, - "learning_rate": 1.630550265610905e-05, - "loss": 2.0135, + "epoch": 0.23, + "grad_norm": 23.16387176513672, + "learning_rate": 1.845793415052504e-05, + "loss": 2.3188, "step": 1844 }, { - "epoch": 0.55, - "grad_norm": 15.838483810424805, - "learning_rate": 1.630349804550466e-05, - "loss": 2.7645, + "epoch": 0.23, + "grad_norm": 11.673296928405762, + "learning_rate": 1.845709743546835e-05, + "loss": 2.4927, "step": 1845 }, { - "epoch": 0.56, - "grad_norm": 19.013673782348633, - "learning_rate": 1.6301493434900272e-05, - "loss": 2.2205, + "epoch": 0.23, + "grad_norm": 14.898637771606445, + "learning_rate": 1.8456260720411664e-05, + "loss": 2.679, "step": 1846 }, { - "epoch": 0.56, - "grad_norm": 12.440937042236328, - "learning_rate": 1.6299488824295882e-05, - "loss": 3.1805, + "epoch": 0.23, + "grad_norm": 10.247369766235352, + "learning_rate": 1.8455424005354978e-05, + "loss": 2.9585, "step": 1847 }, { - "epoch": 0.56, - "grad_norm": 18.47015380859375, - "learning_rate": 1.6297484213691492e-05, - "loss": 2.9228, + "epoch": 0.23, + "grad_norm": 14.360986709594727, + "learning_rate": 1.8454587290298292e-05, + "loss": 2.2375, "step": 1848 }, { - "epoch": 0.56, - "grad_norm": 26.46478843688965, - "learning_rate": 1.6295479603087102e-05, - "loss": 3.1306, + "epoch": 0.23, + "grad_norm": 12.506513595581055, + "learning_rate": 1.8453750575241602e-05, + "loss": 2.4649, "step": 1849 }, { - "epoch": 0.56, - "grad_norm": 20.930150985717773, - "learning_rate": 1.6293474992482712e-05, - "loss": 2.2428, + "epoch": 0.23, + "grad_norm": 15.462091445922852, + "learning_rate": 1.8452913860184916e-05, + "loss": 2.2894, "step": 1850 }, { - "epoch": 0.56, - "grad_norm": 13.267045021057129, - "learning_rate": 1.629147038187832e-05, - "loss": 2.6697, + "epoch": 0.23, + "grad_norm": 13.013998031616211, + "learning_rate": 1.845207714512823e-05, + "loss": 3.056, "step": 1851 }, { - "epoch": 0.56, - "grad_norm": 11.723921775817871, - "learning_rate": 1.6289465771273932e-05, - "loss": 2.8602, + "epoch": 0.23, + "grad_norm": 10.314661979675293, + "learning_rate": 1.845124043007154e-05, + "loss": 2.3853, "step": 1852 }, { - "epoch": 0.56, - "grad_norm": 11.42064094543457, - "learning_rate": 1.6287461160669542e-05, - "loss": 2.9313, + "epoch": 0.23, + "grad_norm": 8.996312141418457, + "learning_rate": 1.8450403715014853e-05, + "loss": 1.611, "step": 1853 }, { - "epoch": 0.56, - "grad_norm": 31.033870697021484, - "learning_rate": 1.628545655006515e-05, - "loss": 3.2394, + "epoch": 0.23, + "grad_norm": 19.560791015625, + "learning_rate": 1.8449566999958167e-05, + "loss": 5.2749, "step": 1854 }, { - "epoch": 0.56, - "grad_norm": 17.858312606811523, - "learning_rate": 1.6283451939460762e-05, - "loss": 2.1223, + "epoch": 0.23, + "grad_norm": 14.763134956359863, + "learning_rate": 1.844873028490148e-05, + "loss": 1.3857, "step": 1855 }, { - "epoch": 0.56, - "grad_norm": 13.080681800842285, - "learning_rate": 1.628144732885637e-05, - "loss": 2.4824, + "epoch": 0.23, + "grad_norm": 7.593055725097656, + "learning_rate": 1.844789356984479e-05, + "loss": 2.0708, "step": 1856 }, { - "epoch": 0.56, - "grad_norm": 9.161316871643066, - "learning_rate": 1.6279442718251983e-05, - "loss": 1.5993, + "epoch": 0.23, + "grad_norm": 14.49431037902832, + "learning_rate": 1.8447056854788104e-05, + "loss": 2.6208, "step": 1857 }, { - "epoch": 0.56, - "grad_norm": 15.82925796508789, - "learning_rate": 1.6277438107647593e-05, - "loss": 2.2007, + "epoch": 0.23, + "grad_norm": 6.890941619873047, + "learning_rate": 1.8446220139731418e-05, + "loss": 0.7461, "step": 1858 }, { - "epoch": 0.56, - "grad_norm": 29.120468139648438, - "learning_rate": 1.62754334970432e-05, - "loss": 3.2814, + "epoch": 0.23, + "grad_norm": 13.095731735229492, + "learning_rate": 1.8445383424674728e-05, + "loss": 3.8139, "step": 1859 }, { - "epoch": 0.56, - "grad_norm": 20.672487258911133, - "learning_rate": 1.6273428886438813e-05, - "loss": 2.6652, + "epoch": 0.23, + "grad_norm": 8.560629844665527, + "learning_rate": 1.8444546709618042e-05, + "loss": 3.0648, "step": 1860 }, { - "epoch": 0.56, - "grad_norm": 18.959318161010742, - "learning_rate": 1.627142427583442e-05, - "loss": 2.5797, + "epoch": 0.23, + "grad_norm": 27.346872329711914, + "learning_rate": 1.8443709994561356e-05, + "loss": 1.476, "step": 1861 }, { - "epoch": 0.56, - "grad_norm": 10.144336700439453, - "learning_rate": 1.626941966523003e-05, - "loss": 2.1145, + "epoch": 0.23, + "grad_norm": 16.805004119873047, + "learning_rate": 1.8442873279504666e-05, + "loss": 1.9073, "step": 1862 }, { - "epoch": 0.56, - "grad_norm": 19.466264724731445, - "learning_rate": 1.626741505462564e-05, - "loss": 2.4139, + "epoch": 0.23, + "grad_norm": 10.577855110168457, + "learning_rate": 1.844203656444798e-05, + "loss": 2.9577, "step": 1863 }, { - "epoch": 0.56, - "grad_norm": 13.246243476867676, - "learning_rate": 1.626541044402125e-05, - "loss": 2.2984, + "epoch": 0.23, + "grad_norm": 17.957332611083984, + "learning_rate": 1.844119984939129e-05, + "loss": 2.2535, "step": 1864 }, { - "epoch": 0.56, - "grad_norm": 15.933266639709473, - "learning_rate": 1.626340583341686e-05, - "loss": 2.6167, + "epoch": 0.23, + "grad_norm": 23.276569366455078, + "learning_rate": 1.8440363134334603e-05, + "loss": 3.8149, "step": 1865 }, { - "epoch": 0.56, - "grad_norm": 10.605395317077637, - "learning_rate": 1.626140122281247e-05, - "loss": 2.094, + "epoch": 0.23, + "grad_norm": 8.072980880737305, + "learning_rate": 1.8439526419277917e-05, + "loss": 1.0882, "step": 1866 }, { - "epoch": 0.56, - "grad_norm": 14.293509483337402, - "learning_rate": 1.625939661220808e-05, - "loss": 2.4193, + "epoch": 0.23, + "grad_norm": 15.412882804870605, + "learning_rate": 1.8438689704221227e-05, + "loss": 4.7679, "step": 1867 }, { - "epoch": 0.56, - "grad_norm": 14.355191230773926, - "learning_rate": 1.625739200160369e-05, - "loss": 3.3217, + "epoch": 0.23, + "grad_norm": 24.126787185668945, + "learning_rate": 1.843785298916454e-05, + "loss": 2.9979, "step": 1868 }, { - "epoch": 0.56, - "grad_norm": 11.297273635864258, - "learning_rate": 1.62553873909993e-05, - "loss": 1.9198, + "epoch": 0.23, + "grad_norm": 11.626553535461426, + "learning_rate": 1.8437016274107855e-05, + "loss": 2.255, "step": 1869 }, { - "epoch": 0.56, - "grad_norm": 19.611162185668945, - "learning_rate": 1.625338278039491e-05, - "loss": 2.3439, + "epoch": 0.23, + "grad_norm": 12.555859565734863, + "learning_rate": 1.8436179559051165e-05, + "loss": 3.1869, "step": 1870 }, { - "epoch": 0.56, - "grad_norm": 9.530548095703125, - "learning_rate": 1.625137816979052e-05, - "loss": 1.1765, + "epoch": 0.23, + "grad_norm": 13.366609573364258, + "learning_rate": 1.843534284399448e-05, + "loss": 2.9125, "step": 1871 }, { - "epoch": 0.56, - "grad_norm": 20.621294021606445, - "learning_rate": 1.624937355918613e-05, - "loss": 2.2362, + "epoch": 0.23, + "grad_norm": 16.570537567138672, + "learning_rate": 1.8434506128937792e-05, + "loss": 3.2158, "step": 1872 }, { - "epoch": 0.56, - "grad_norm": 20.514188766479492, - "learning_rate": 1.6247368948581737e-05, - "loss": 3.2915, + "epoch": 0.24, + "grad_norm": 20.994041442871094, + "learning_rate": 1.8433669413881102e-05, + "loss": 1.722, "step": 1873 }, { - "epoch": 0.56, - "grad_norm": 11.529969215393066, - "learning_rate": 1.624536433797735e-05, - "loss": 1.7275, + "epoch": 0.24, + "grad_norm": 11.375537872314453, + "learning_rate": 1.8432832698824416e-05, + "loss": 1.4647, "step": 1874 }, { - "epoch": 0.56, - "grad_norm": 15.720181465148926, - "learning_rate": 1.6243359727372957e-05, - "loss": 2.0002, + "epoch": 0.24, + "grad_norm": 27.82518196105957, + "learning_rate": 1.843199598376773e-05, + "loss": 3.7956, "step": 1875 }, { - "epoch": 0.56, - "grad_norm": 20.808303833007812, - "learning_rate": 1.6241355116768567e-05, - "loss": 3.8721, + "epoch": 0.24, + "grad_norm": 15.789790153503418, + "learning_rate": 1.8431159268711043e-05, + "loss": 0.1957, "step": 1876 }, { - "epoch": 0.56, - "grad_norm": 11.557236671447754, - "learning_rate": 1.623935050616418e-05, - "loss": 1.6353, + "epoch": 0.24, + "grad_norm": 12.570982933044434, + "learning_rate": 1.8430322553654354e-05, + "loss": 2.4985, "step": 1877 }, { - "epoch": 0.56, - "grad_norm": 13.909945487976074, - "learning_rate": 1.6237345895559788e-05, - "loss": 2.2762, + "epoch": 0.24, + "grad_norm": 13.817831993103027, + "learning_rate": 1.8429485838597667e-05, + "loss": 1.6574, "step": 1878 }, { - "epoch": 0.56, - "grad_norm": 36.823299407958984, - "learning_rate": 1.6235341284955398e-05, - "loss": 2.6792, + "epoch": 0.24, + "grad_norm": 18.990236282348633, + "learning_rate": 1.842864912354098e-05, + "loss": 4.569, "step": 1879 }, { - "epoch": 0.57, - "grad_norm": 73.27782440185547, - "learning_rate": 1.6233336674351008e-05, - "loss": 3.6506, + "epoch": 0.24, + "grad_norm": 9.704483032226562, + "learning_rate": 1.842781240848429e-05, + "loss": 1.0726, "step": 1880 }, { - "epoch": 0.57, - "grad_norm": 15.239794731140137, - "learning_rate": 1.6231332063746618e-05, - "loss": 2.5237, + "epoch": 0.24, + "grad_norm": 16.10625648498535, + "learning_rate": 1.8426975693427605e-05, + "loss": 5.0485, "step": 1881 }, { - "epoch": 0.57, - "grad_norm": 28.48020362854004, - "learning_rate": 1.6229327453142228e-05, - "loss": 2.8108, + "epoch": 0.24, + "grad_norm": 10.295207977294922, + "learning_rate": 1.842613897837092e-05, + "loss": 2.6966, "step": 1882 }, { - "epoch": 0.57, - "grad_norm": 9.497574806213379, - "learning_rate": 1.6227322842537838e-05, - "loss": 1.4183, + "epoch": 0.24, + "grad_norm": 11.28648567199707, + "learning_rate": 1.8425302263314232e-05, + "loss": 3.5715, "step": 1883 }, { - "epoch": 0.57, - "grad_norm": 12.64692211151123, - "learning_rate": 1.6225318231933448e-05, - "loss": 2.5578, + "epoch": 0.24, + "grad_norm": 6.4751482009887695, + "learning_rate": 1.8424465548257542e-05, + "loss": 1.2767, "step": 1884 }, { - "epoch": 0.57, - "grad_norm": 23.391645431518555, - "learning_rate": 1.6223313621329058e-05, - "loss": 2.03, + "epoch": 0.24, + "grad_norm": 11.318130493164062, + "learning_rate": 1.8423628833200856e-05, + "loss": 2.1673, "step": 1885 }, { - "epoch": 0.57, - "grad_norm": 25.81462860107422, - "learning_rate": 1.622130901072467e-05, - "loss": 2.3658, + "epoch": 0.24, + "grad_norm": 18.204483032226562, + "learning_rate": 1.842279211814417e-05, + "loss": 1.1198, "step": 1886 }, { - "epoch": 0.57, - "grad_norm": 22.7410831451416, - "learning_rate": 1.621930440012028e-05, - "loss": 2.291, + "epoch": 0.24, + "grad_norm": 4.5781073570251465, + "learning_rate": 1.842195540308748e-05, + "loss": 0.1754, "step": 1887 }, { - "epoch": 0.57, - "grad_norm": 14.532958030700684, - "learning_rate": 1.621729978951589e-05, - "loss": 2.3839, + "epoch": 0.24, + "grad_norm": 9.175607681274414, + "learning_rate": 1.8421118688030794e-05, + "loss": 0.4124, "step": 1888 }, { - "epoch": 0.57, - "grad_norm": 9.742128372192383, - "learning_rate": 1.62152951789115e-05, - "loss": 1.9635, + "epoch": 0.24, + "grad_norm": 22.146230697631836, + "learning_rate": 1.8420281972974107e-05, + "loss": 2.7644, "step": 1889 }, { - "epoch": 0.57, - "grad_norm": 11.445870399475098, - "learning_rate": 1.621329056830711e-05, - "loss": 2.448, + "epoch": 0.24, + "grad_norm": 9.474859237670898, + "learning_rate": 1.8419445257917418e-05, + "loss": 3.5436, "step": 1890 }, { - "epoch": 0.57, - "grad_norm": 17.39885902404785, - "learning_rate": 1.621128595770272e-05, - "loss": 1.9545, + "epoch": 0.24, + "grad_norm": 18.500133514404297, + "learning_rate": 1.841860854286073e-05, + "loss": 1.9568, "step": 1891 }, { - "epoch": 0.57, - "grad_norm": 11.735391616821289, - "learning_rate": 1.6209281347098325e-05, - "loss": 2.49, + "epoch": 0.24, + "grad_norm": 15.487665176391602, + "learning_rate": 1.841777182780404e-05, + "loss": 2.2543, "step": 1892 }, { - "epoch": 0.57, - "grad_norm": 31.197790145874023, - "learning_rate": 1.620727673649394e-05, - "loss": 2.4989, + "epoch": 0.24, + "grad_norm": 35.86439514160156, + "learning_rate": 1.8416935112747355e-05, + "loss": 3.3658, "step": 1893 }, { - "epoch": 0.57, - "grad_norm": 28.4355525970459, - "learning_rate": 1.6205272125889546e-05, - "loss": 2.9879, + "epoch": 0.24, + "grad_norm": 11.434447288513184, + "learning_rate": 1.841609839769067e-05, + "loss": 1.0026, "step": 1894 }, { - "epoch": 0.57, - "grad_norm": 21.622291564941406, - "learning_rate": 1.6203267515285156e-05, - "loss": 3.1001, + "epoch": 0.24, + "grad_norm": 19.575517654418945, + "learning_rate": 1.841526168263398e-05, + "loss": 1.4819, "step": 1895 }, { - "epoch": 0.57, - "grad_norm": 15.58934497833252, - "learning_rate": 1.620126290468077e-05, - "loss": 2.0499, + "epoch": 0.24, + "grad_norm": 17.874048233032227, + "learning_rate": 1.8414424967577293e-05, + "loss": 1.6523, "step": 1896 }, { - "epoch": 0.57, - "grad_norm": 16.792016983032227, - "learning_rate": 1.6199258294076376e-05, - "loss": 2.5507, + "epoch": 0.24, + "grad_norm": 14.109743118286133, + "learning_rate": 1.8413588252520606e-05, + "loss": 3.1316, "step": 1897 }, { - "epoch": 0.57, - "grad_norm": 15.837790489196777, - "learning_rate": 1.6197253683471986e-05, - "loss": 2.5776, + "epoch": 0.24, + "grad_norm": 14.72187614440918, + "learning_rate": 1.8412751537463917e-05, + "loss": 3.0954, "step": 1898 }, { - "epoch": 0.57, - "grad_norm": 13.746726989746094, - "learning_rate": 1.6195249072867596e-05, - "loss": 1.7229, + "epoch": 0.24, + "grad_norm": 49.131553649902344, + "learning_rate": 1.841191482240723e-05, + "loss": 2.0844, "step": 1899 }, { - "epoch": 0.57, - "grad_norm": 20.217098236083984, - "learning_rate": 1.6193244462263206e-05, - "loss": 2.7257, + "epoch": 0.24, + "grad_norm": 13.758391380310059, + "learning_rate": 1.8411078107350544e-05, + "loss": 4.0891, "step": 1900 }, { - "epoch": 0.57, - "grad_norm": 14.193788528442383, - "learning_rate": 1.6191239851658816e-05, - "loss": 2.4532, + "epoch": 0.24, + "grad_norm": 11.789108276367188, + "learning_rate": 1.8410241392293854e-05, + "loss": 1.2636, "step": 1901 }, { - "epoch": 0.57, - "grad_norm": 12.840964317321777, - "learning_rate": 1.6189235241054426e-05, - "loss": 1.3137, + "epoch": 0.24, + "grad_norm": 25.986669540405273, + "learning_rate": 1.8409404677237168e-05, + "loss": 2.2081, "step": 1902 }, { - "epoch": 0.57, - "grad_norm": 10.857292175292969, - "learning_rate": 1.6187230630450036e-05, - "loss": 2.3681, + "epoch": 0.24, + "grad_norm": 15.064836502075195, + "learning_rate": 1.840856796218048e-05, + "loss": 2.0525, "step": 1903 }, { - "epoch": 0.57, - "grad_norm": 11.725610733032227, - "learning_rate": 1.6185226019845646e-05, - "loss": 2.0677, + "epoch": 0.24, + "grad_norm": 7.967344760894775, + "learning_rate": 1.8407731247123795e-05, + "loss": 1.1078, "step": 1904 }, { - "epoch": 0.57, - "grad_norm": 16.389385223388672, - "learning_rate": 1.6183221409241257e-05, - "loss": 2.2652, + "epoch": 0.24, + "grad_norm": 30.750852584838867, + "learning_rate": 1.8406894532067105e-05, + "loss": 1.8963, "step": 1905 }, { - "epoch": 0.57, - "grad_norm": 20.219581604003906, - "learning_rate": 1.6181216798636867e-05, - "loss": 2.3222, + "epoch": 0.24, + "grad_norm": 7.841359615325928, + "learning_rate": 1.840605781701042e-05, + "loss": 2.8736, "step": 1906 }, { - "epoch": 0.57, - "grad_norm": 15.775784492492676, - "learning_rate": 1.6179212188032477e-05, - "loss": 2.0435, + "epoch": 0.24, + "grad_norm": 14.268943786621094, + "learning_rate": 1.8405221101953733e-05, + "loss": 2.2186, "step": 1907 }, { - "epoch": 0.57, - "grad_norm": 11.360732078552246, - "learning_rate": 1.6177207577428087e-05, - "loss": 2.0024, + "epoch": 0.24, + "grad_norm": 22.337039947509766, + "learning_rate": 1.8404384386897043e-05, + "loss": 2.3814, "step": 1908 }, { - "epoch": 0.57, - "grad_norm": 26.330806732177734, - "learning_rate": 1.6175202966823697e-05, - "loss": 3.1264, + "epoch": 0.24, + "grad_norm": 15.489692687988281, + "learning_rate": 1.8403547671840357e-05, + "loss": 2.0205, "step": 1909 }, { - "epoch": 0.57, - "grad_norm": 16.440622329711914, - "learning_rate": 1.6173198356219307e-05, - "loss": 2.6761, + "epoch": 0.24, + "grad_norm": 14.391831398010254, + "learning_rate": 1.840271095678367e-05, + "loss": 3.1901, "step": 1910 }, { - "epoch": 0.57, - "grad_norm": 14.55276107788086, - "learning_rate": 1.6171193745614914e-05, - "loss": 2.2345, + "epoch": 0.24, + "grad_norm": 11.057231903076172, + "learning_rate": 1.8401874241726984e-05, + "loss": 3.2508, "step": 1911 }, { - "epoch": 0.57, - "grad_norm": 15.58680534362793, - "learning_rate": 1.6169189135010527e-05, - "loss": 2.8794, + "epoch": 0.24, + "grad_norm": 18.586143493652344, + "learning_rate": 1.8401037526670294e-05, + "loss": 2.3013, "step": 1912 }, { - "epoch": 0.58, - "grad_norm": 27.979393005371094, - "learning_rate": 1.6167184524406137e-05, - "loss": 2.5893, + "epoch": 0.24, + "grad_norm": 8.885278701782227, + "learning_rate": 1.8400200811613608e-05, + "loss": 1.8043, "step": 1913 }, { - "epoch": 0.58, - "grad_norm": 14.484015464782715, - "learning_rate": 1.6165179913801744e-05, - "loss": 2.4981, + "epoch": 0.24, + "grad_norm": 21.466678619384766, + "learning_rate": 1.839936409655692e-05, + "loss": 2.3803, "step": 1914 }, { - "epoch": 0.58, - "grad_norm": 9.881906509399414, - "learning_rate": 1.6163175303197357e-05, - "loss": 2.1874, + "epoch": 0.24, + "grad_norm": 26.071767807006836, + "learning_rate": 1.839852738150023e-05, + "loss": 3.6033, "step": 1915 }, { - "epoch": 0.58, - "grad_norm": 17.09141731262207, - "learning_rate": 1.6161170692592964e-05, - "loss": 2.9159, + "epoch": 0.24, + "grad_norm": 19.87377166748047, + "learning_rate": 1.8397690666443545e-05, + "loss": 2.1179, "step": 1916 }, { - "epoch": 0.58, - "grad_norm": 18.972490310668945, - "learning_rate": 1.6159166081988574e-05, - "loss": 1.6856, + "epoch": 0.24, + "grad_norm": 15.754097938537598, + "learning_rate": 1.8396853951386856e-05, + "loss": 4.6128, "step": 1917 }, { - "epoch": 0.58, - "grad_norm": 10.02529525756836, - "learning_rate": 1.6157161471384184e-05, - "loss": 1.7176, + "epoch": 0.24, + "grad_norm": 17.967514038085938, + "learning_rate": 1.839601723633017e-05, + "loss": 2.6943, "step": 1918 }, { - "epoch": 0.58, - "grad_norm": 21.166505813598633, - "learning_rate": 1.6155156860779794e-05, - "loss": 2.5582, + "epoch": 0.24, + "grad_norm": 14.497222900390625, + "learning_rate": 1.8395180521273483e-05, + "loss": 3.1712, "step": 1919 }, { - "epoch": 0.58, - "grad_norm": 34.15064239501953, - "learning_rate": 1.6153152250175404e-05, - "loss": 4.2157, - "step": 1920 - }, - { - "epoch": 0.58, - "eval_loss": 0.339510053396225, - "eval_runtime": 43.4353, - "eval_samples_per_second": 34.051, - "eval_steps_per_second": 34.051, + "epoch": 0.24, + "grad_norm": 17.02864646911621, + "learning_rate": 1.8394343806216793e-05, + "loss": 2.9962, "step": 1920 }, { - "epoch": 0.58, - "grad_norm": 18.826351165771484, - "learning_rate": 1.6151147639571015e-05, - "loss": 2.8172, + "epoch": 0.24, + "grad_norm": 20.203519821166992, + "learning_rate": 1.8393507091160107e-05, + "loss": 2.0085, "step": 1921 }, { - "epoch": 0.58, - "grad_norm": 28.641447067260742, - "learning_rate": 1.6149143028966625e-05, - "loss": 3.8235, + "epoch": 0.24, + "grad_norm": 11.49713134765625, + "learning_rate": 1.8392670376103417e-05, + "loss": 3.9497, "step": 1922 }, { - "epoch": 0.58, - "grad_norm": 18.224559783935547, - "learning_rate": 1.6147138418362235e-05, - "loss": 1.6789, + "epoch": 0.24, + "grad_norm": 32.90269088745117, + "learning_rate": 1.839183366104673e-05, + "loss": 2.3974, "step": 1923 }, { - "epoch": 0.58, - "grad_norm": 20.033323287963867, - "learning_rate": 1.6145133807757845e-05, - "loss": 2.2715, + "epoch": 0.24, + "grad_norm": 18.34015464782715, + "learning_rate": 1.8390996945990044e-05, + "loss": 3.679, "step": 1924 }, { - "epoch": 0.58, - "grad_norm": 18.451032638549805, - "learning_rate": 1.6143129197153455e-05, - "loss": 3.6021, + "epoch": 0.24, + "grad_norm": 13.834124565124512, + "learning_rate": 1.8390160230933358e-05, + "loss": 1.8375, "step": 1925 }, { - "epoch": 0.58, - "grad_norm": 13.090971946716309, - "learning_rate": 1.6141124586549065e-05, - "loss": 2.8423, + "epoch": 0.24, + "grad_norm": 10.36095905303955, + "learning_rate": 1.8389323515876668e-05, + "loss": 1.666, "step": 1926 }, { - "epoch": 0.58, - "grad_norm": 31.57698631286621, - "learning_rate": 1.6139119975944675e-05, - "loss": 2.1646, + "epoch": 0.24, + "grad_norm": 11.952203750610352, + "learning_rate": 1.8388486800819982e-05, + "loss": 3.0777, "step": 1927 }, { - "epoch": 0.58, - "grad_norm": 19.8615779876709, - "learning_rate": 1.6137115365340285e-05, - "loss": 2.4651, + "epoch": 0.24, + "grad_norm": 14.914665222167969, + "learning_rate": 1.8387650085763296e-05, + "loss": 3.1809, "step": 1928 }, { - "epoch": 0.58, - "grad_norm": 17.440027236938477, - "learning_rate": 1.6135110754735895e-05, - "loss": 2.2323, + "epoch": 0.24, + "grad_norm": 26.21071434020996, + "learning_rate": 1.8386813370706606e-05, + "loss": 3.7649, "step": 1929 }, { - "epoch": 0.58, - "grad_norm": 12.359676361083984, - "learning_rate": 1.6133106144131502e-05, - "loss": 3.0975, + "epoch": 0.24, + "grad_norm": 12.51376724243164, + "learning_rate": 1.838597665564992e-05, + "loss": 1.9043, "step": 1930 }, { - "epoch": 0.58, - "grad_norm": 7.871943473815918, - "learning_rate": 1.6131101533527115e-05, - "loss": 1.8966, + "epoch": 0.24, + "grad_norm": 11.830827713012695, + "learning_rate": 1.8385139940593233e-05, + "loss": 1.3338, "step": 1931 }, { - "epoch": 0.58, - "grad_norm": 13.588000297546387, - "learning_rate": 1.6129096922922725e-05, - "loss": 2.7859, + "epoch": 0.24, + "grad_norm": 24.52362632751465, + "learning_rate": 1.8384303225536547e-05, + "loss": 1.8047, "step": 1932 }, { - "epoch": 0.58, - "grad_norm": 16.730295181274414, - "learning_rate": 1.6127092312318332e-05, - "loss": 2.4625, + "epoch": 0.24, + "grad_norm": 12.227885246276855, + "learning_rate": 1.8383466510479857e-05, + "loss": 3.0994, "step": 1933 }, { - "epoch": 0.58, - "grad_norm": 13.648134231567383, - "learning_rate": 1.6125087701713946e-05, - "loss": 2.5004, + "epoch": 0.24, + "grad_norm": 12.330459594726562, + "learning_rate": 1.838262979542317e-05, + "loss": 2.7654, "step": 1934 }, { - "epoch": 0.58, - "grad_norm": 19.520750045776367, - "learning_rate": 1.6123083091109552e-05, - "loss": 3.5212, + "epoch": 0.24, + "grad_norm": 18.657487869262695, + "learning_rate": 1.8381793080366484e-05, + "loss": 3.6686, "step": 1935 }, { - "epoch": 0.58, - "grad_norm": 19.73345184326172, - "learning_rate": 1.6121078480505162e-05, - "loss": 2.2163, + "epoch": 0.24, + "grad_norm": 14.701178550720215, + "learning_rate": 1.8380956365309795e-05, + "loss": 2.7045, "step": 1936 }, { - "epoch": 0.58, - "grad_norm": 11.382865905761719, - "learning_rate": 1.6119073869900772e-05, - "loss": 1.5394, + "epoch": 0.24, + "grad_norm": 21.401046752929688, + "learning_rate": 1.8380119650253108e-05, + "loss": 2.1688, "step": 1937 }, { - "epoch": 0.58, - "grad_norm": 11.32697868347168, - "learning_rate": 1.6117069259296383e-05, - "loss": 1.7365, + "epoch": 0.24, + "grad_norm": 11.859993934631348, + "learning_rate": 1.8379282935196422e-05, + "loss": 2.4173, "step": 1938 }, { - "epoch": 0.58, - "grad_norm": 17.160417556762695, - "learning_rate": 1.6115064648691993e-05, - "loss": 3.0884, + "epoch": 0.24, + "grad_norm": 12.437385559082031, + "learning_rate": 1.8378446220139735e-05, + "loss": 3.579, "step": 1939 }, { - "epoch": 0.58, - "grad_norm": 10.711620330810547, - "learning_rate": 1.6113060038087603e-05, - "loss": 2.0598, + "epoch": 0.24, + "grad_norm": 22.76668357849121, + "learning_rate": 1.8377609505083046e-05, + "loss": 4.4727, "step": 1940 }, { - "epoch": 0.58, - "grad_norm": 15.657376289367676, - "learning_rate": 1.6111055427483213e-05, - "loss": 2.5557, + "epoch": 0.24, + "grad_norm": 23.67467498779297, + "learning_rate": 1.837677279002636e-05, + "loss": 3.4649, "step": 1941 }, { - "epoch": 0.58, - "grad_norm": 17.509824752807617, - "learning_rate": 1.6109050816878823e-05, - "loss": 2.2766, + "epoch": 0.24, + "grad_norm": 15.801477432250977, + "learning_rate": 1.8375936074969673e-05, + "loss": 4.4846, "step": 1942 }, { - "epoch": 0.58, - "grad_norm": 10.339923858642578, - "learning_rate": 1.6107046206274433e-05, - "loss": 1.8058, + "epoch": 0.24, + "grad_norm": 15.054116249084473, + "learning_rate": 1.8375099359912983e-05, + "loss": 2.2226, "step": 1943 }, { - "epoch": 0.58, - "grad_norm": 16.32088851928711, - "learning_rate": 1.6105041595670043e-05, - "loss": 2.2214, + "epoch": 0.24, + "grad_norm": 11.429304122924805, + "learning_rate": 1.8374262644856297e-05, + "loss": 1.7485, "step": 1944 }, { - "epoch": 0.58, - "grad_norm": 34.30913543701172, - "learning_rate": 1.6103036985065653e-05, - "loss": 3.2443, + "epoch": 0.24, + "grad_norm": 18.58489227294922, + "learning_rate": 1.8373425929799607e-05, + "loss": 4.9844, "step": 1945 }, { - "epoch": 0.59, - "grad_norm": 13.773527145385742, - "learning_rate": 1.6101032374461263e-05, - "loss": 3.0382, + "epoch": 0.24, + "grad_norm": 12.300481796264648, + "learning_rate": 1.837258921474292e-05, + "loss": 2.1001, "step": 1946 }, { - "epoch": 0.59, - "grad_norm": 23.005949020385742, - "learning_rate": 1.609902776385687e-05, - "loss": 2.0551, + "epoch": 0.24, + "grad_norm": 12.821094512939453, + "learning_rate": 1.8371752499686235e-05, + "loss": 2.51, "step": 1947 }, { - "epoch": 0.59, - "grad_norm": 32.260467529296875, - "learning_rate": 1.6097023153252483e-05, - "loss": 2.4883, + "epoch": 0.24, + "grad_norm": 10.40767765045166, + "learning_rate": 1.8370915784629545e-05, + "loss": 3.1537, "step": 1948 }, { - "epoch": 0.59, - "grad_norm": 13.194154739379883, - "learning_rate": 1.609501854264809e-05, - "loss": 2.7221, + "epoch": 0.24, + "grad_norm": 35.25419616699219, + "learning_rate": 1.837007906957286e-05, + "loss": 3.2483, "step": 1949 }, { - "epoch": 0.59, - "grad_norm": 29.038406372070312, - "learning_rate": 1.60930139320437e-05, - "loss": 2.7329, + "epoch": 0.24, + "grad_norm": 24.408248901367188, + "learning_rate": 1.836924235451617e-05, + "loss": 2.5151, "step": 1950 }, { - "epoch": 0.59, - "grad_norm": 15.281466484069824, - "learning_rate": 1.6091009321439314e-05, - "loss": 2.5336, + "epoch": 0.24, + "grad_norm": 15.579033851623535, + "learning_rate": 1.8368405639459482e-05, + "loss": 3.2173, "step": 1951 }, { - "epoch": 0.59, - "grad_norm": 14.665143013000488, - "learning_rate": 1.608900471083492e-05, - "loss": 1.9784, + "epoch": 0.24, + "grad_norm": 9.459357261657715, + "learning_rate": 1.8367568924402796e-05, + "loss": 3.3478, "step": 1952 }, { - "epoch": 0.59, - "grad_norm": 17.575735092163086, - "learning_rate": 1.608700010023053e-05, - "loss": 1.5022, + "epoch": 0.25, + "grad_norm": 13.950507164001465, + "learning_rate": 1.836673220934611e-05, + "loss": 2.4268, "step": 1953 }, { - "epoch": 0.59, - "grad_norm": 16.851743698120117, - "learning_rate": 1.608499548962614e-05, - "loss": 2.1047, + "epoch": 0.25, + "grad_norm": 16.881439208984375, + "learning_rate": 1.836589549428942e-05, + "loss": 1.0458, "step": 1954 }, { - "epoch": 0.59, - "grad_norm": 17.620439529418945, - "learning_rate": 1.608299087902175e-05, - "loss": 2.4975, + "epoch": 0.25, + "grad_norm": 23.890779495239258, + "learning_rate": 1.8365058779232734e-05, + "loss": 3.3657, "step": 1955 }, { - "epoch": 0.59, - "grad_norm": 18.211750030517578, - "learning_rate": 1.608098626841736e-05, - "loss": 2.441, + "epoch": 0.25, + "grad_norm": 11.523681640625, + "learning_rate": 1.8364222064176047e-05, + "loss": 1.7303, "step": 1956 }, { - "epoch": 0.59, - "grad_norm": 38.04513168334961, - "learning_rate": 1.607898165781297e-05, - "loss": 3.2777, + "epoch": 0.25, + "grad_norm": 16.36073875427246, + "learning_rate": 1.8363385349119357e-05, + "loss": 1.4092, "step": 1957 }, { - "epoch": 0.59, - "grad_norm": 18.044525146484375, - "learning_rate": 1.607697704720858e-05, - "loss": 3.4203, + "epoch": 0.25, + "grad_norm": 14.88504409790039, + "learning_rate": 1.836254863406267e-05, + "loss": 1.4235, "step": 1958 }, { - "epoch": 0.59, - "grad_norm": 14.710494995117188, - "learning_rate": 1.607497243660419e-05, - "loss": 2.0862, + "epoch": 0.25, + "grad_norm": 30.882617950439453, + "learning_rate": 1.8361711919005985e-05, + "loss": 1.8152, "step": 1959 }, { - "epoch": 0.59, - "grad_norm": 10.158746719360352, - "learning_rate": 1.60729678259998e-05, - "loss": 1.8249, + "epoch": 0.25, + "grad_norm": 8.716890335083008, + "learning_rate": 1.83608752039493e-05, + "loss": 1.9457, "step": 1960 }, { - "epoch": 0.59, - "grad_norm": 14.311506271362305, - "learning_rate": 1.607096321539541e-05, - "loss": 2.0706, + "epoch": 0.25, + "grad_norm": 37.496131896972656, + "learning_rate": 1.836003848889261e-05, + "loss": 4.3405, "step": 1961 }, { - "epoch": 0.59, - "grad_norm": 9.432246208190918, - "learning_rate": 1.606895860479102e-05, - "loss": 2.0623, + "epoch": 0.25, + "grad_norm": 7.394501686096191, + "learning_rate": 1.8359201773835922e-05, + "loss": 1.7728, "step": 1962 }, { - "epoch": 0.59, - "grad_norm": 17.04642105102539, - "learning_rate": 1.606695399418663e-05, - "loss": 2.918, + "epoch": 0.25, + "grad_norm": 15.787137031555176, + "learning_rate": 1.8358365058779236e-05, + "loss": 3.8866, "step": 1963 }, { - "epoch": 0.59, - "grad_norm": 11.562948226928711, - "learning_rate": 1.606494938358224e-05, - "loss": 2.5815, + "epoch": 0.25, + "grad_norm": 13.577347755432129, + "learning_rate": 1.8357528343722546e-05, + "loss": 2.8812, "step": 1964 }, { - "epoch": 0.59, - "grad_norm": 18.638172149658203, - "learning_rate": 1.606294477297785e-05, - "loss": 2.995, + "epoch": 0.25, + "grad_norm": 14.8187837600708, + "learning_rate": 1.835669162866586e-05, + "loss": 3.5142, "step": 1965 }, { - "epoch": 0.59, - "grad_norm": 14.144832611083984, - "learning_rate": 1.6060940162373458e-05, - "loss": 2.1113, + "epoch": 0.25, + "grad_norm": 9.960477828979492, + "learning_rate": 1.8355854913609174e-05, + "loss": 3.0019, "step": 1966 }, { - "epoch": 0.59, - "grad_norm": 16.268152236938477, - "learning_rate": 1.605893555176907e-05, - "loss": 2.2524, + "epoch": 0.25, + "grad_norm": 10.168802261352539, + "learning_rate": 1.8355018198552484e-05, + "loss": 2.6793, "step": 1967 }, { - "epoch": 0.59, - "grad_norm": 16.423564910888672, - "learning_rate": 1.6056930941164678e-05, - "loss": 2.2185, + "epoch": 0.25, + "grad_norm": 12.786600112915039, + "learning_rate": 1.8354181483495797e-05, + "loss": 0.4634, "step": 1968 }, { - "epoch": 0.59, - "grad_norm": 28.028411865234375, - "learning_rate": 1.605492633056029e-05, - "loss": 2.556, + "epoch": 0.25, + "grad_norm": 10.668661117553711, + "learning_rate": 1.835334476843911e-05, + "loss": 2.2684, "step": 1969 }, { - "epoch": 0.59, - "grad_norm": 21.24933624267578, - "learning_rate": 1.6052921719955902e-05, - "loss": 2.3257, + "epoch": 0.25, + "grad_norm": 20.51132583618164, + "learning_rate": 1.835250805338242e-05, + "loss": 2.2883, "step": 1970 }, { - "epoch": 0.59, - "grad_norm": 15.96957778930664, - "learning_rate": 1.605091710935151e-05, - "loss": 2.617, + "epoch": 0.25, + "grad_norm": 13.205888748168945, + "learning_rate": 1.8351671338325735e-05, + "loss": 2.5053, "step": 1971 }, { - "epoch": 0.59, - "grad_norm": 10.580945014953613, - "learning_rate": 1.604891249874712e-05, - "loss": 1.4834, + "epoch": 0.25, + "grad_norm": 17.55050277709961, + "learning_rate": 1.835083462326905e-05, + "loss": 2.3348, "step": 1972 }, { - "epoch": 0.59, - "grad_norm": 12.444055557250977, - "learning_rate": 1.604690788814273e-05, - "loss": 2.222, + "epoch": 0.25, + "grad_norm": 16.795305252075195, + "learning_rate": 1.834999790821236e-05, + "loss": 2.2641, "step": 1973 }, { - "epoch": 0.59, - "grad_norm": 12.718807220458984, - "learning_rate": 1.604490327753834e-05, - "loss": 1.986, + "epoch": 0.25, + "grad_norm": 9.307861328125, + "learning_rate": 1.8349161193155673e-05, + "loss": 1.1856, "step": 1974 }, { - "epoch": 0.59, - "grad_norm": 11.073088645935059, - "learning_rate": 1.604289866693395e-05, - "loss": 2.3385, + "epoch": 0.25, + "grad_norm": 18.906524658203125, + "learning_rate": 1.8348324478098983e-05, + "loss": 2.9946, "step": 1975 }, { - "epoch": 0.59, - "grad_norm": 22.884178161621094, - "learning_rate": 1.604089405632956e-05, - "loss": 3.075, + "epoch": 0.25, + "grad_norm": 12.760396957397461, + "learning_rate": 1.8347487763042296e-05, + "loss": 0.9203, "step": 1976 }, { - "epoch": 0.59, - "grad_norm": 13.408631324768066, - "learning_rate": 1.603888944572517e-05, - "loss": 1.867, + "epoch": 0.25, + "grad_norm": 8.600692749023438, + "learning_rate": 1.834665104798561e-05, + "loss": 1.4238, "step": 1977 }, { - "epoch": 0.59, - "grad_norm": 20.047677993774414, - "learning_rate": 1.603688483512078e-05, - "loss": 2.3888, + "epoch": 0.25, + "grad_norm": 13.111052513122559, + "learning_rate": 1.834581433292892e-05, + "loss": 1.2993, "step": 1978 }, { - "epoch": 0.6, - "grad_norm": 56.602203369140625, - "learning_rate": 1.603488022451639e-05, - "loss": 2.9184, + "epoch": 0.25, + "grad_norm": 10.117164611816406, + "learning_rate": 1.8344977617872234e-05, + "loss": 2.9947, "step": 1979 }, { - "epoch": 0.6, - "grad_norm": 23.749616622924805, - "learning_rate": 1.6032875613912e-05, - "loss": 2.7749, + "epoch": 0.25, + "grad_norm": 18.097362518310547, + "learning_rate": 1.8344140902815548e-05, + "loss": 2.3721, "step": 1980 }, { - "epoch": 0.6, - "grad_norm": 26.525043487548828, - "learning_rate": 1.603087100330761e-05, - "loss": 2.1839, + "epoch": 0.25, + "grad_norm": 26.985071182250977, + "learning_rate": 1.8343304187758858e-05, + "loss": 2.5174, "step": 1981 }, { - "epoch": 0.6, - "grad_norm": 32.17291259765625, - "learning_rate": 1.602886639270322e-05, - "loss": 2.7472, + "epoch": 0.25, + "grad_norm": 13.80362606048584, + "learning_rate": 1.834246747270217e-05, + "loss": 2.1506, "step": 1982 }, { - "epoch": 0.6, - "grad_norm": 13.829222679138184, - "learning_rate": 1.602686178209883e-05, - "loss": 2.3262, + "epoch": 0.25, + "grad_norm": 17.8221492767334, + "learning_rate": 1.8341630757645485e-05, + "loss": 2.271, "step": 1983 }, { - "epoch": 0.6, - "grad_norm": 10.74743366241455, - "learning_rate": 1.602485717149444e-05, - "loss": 3.0373, + "epoch": 0.25, + "grad_norm": 13.265314102172852, + "learning_rate": 1.83407940425888e-05, + "loss": 2.7834, "step": 1984 }, { - "epoch": 0.6, - "grad_norm": 19.987829208374023, - "learning_rate": 1.6022852560890046e-05, - "loss": 2.2646, + "epoch": 0.25, + "grad_norm": 24.445533752441406, + "learning_rate": 1.833995732753211e-05, + "loss": 1.5695, "step": 1985 }, { - "epoch": 0.6, - "grad_norm": 15.228018760681152, - "learning_rate": 1.602084795028566e-05, - "loss": 2.8864, + "epoch": 0.25, + "grad_norm": 9.981922149658203, + "learning_rate": 1.8339120612475423e-05, + "loss": 1.6212, "step": 1986 }, { - "epoch": 0.6, - "grad_norm": 39.494258880615234, - "learning_rate": 1.601884333968127e-05, - "loss": 3.7293, + "epoch": 0.25, + "grad_norm": 12.802506446838379, + "learning_rate": 1.8338283897418736e-05, + "loss": 1.9037, "step": 1987 }, { - "epoch": 0.6, - "grad_norm": 21.58100700378418, - "learning_rate": 1.6016838729076877e-05, - "loss": 3.238, + "epoch": 0.25, + "grad_norm": 15.808159828186035, + "learning_rate": 1.8337447182362047e-05, + "loss": 1.8119, "step": 1988 }, { - "epoch": 0.6, - "grad_norm": 26.791006088256836, - "learning_rate": 1.601483411847249e-05, - "loss": 2.7061, + "epoch": 0.25, + "grad_norm": 15.071918487548828, + "learning_rate": 1.833661046730536e-05, + "loss": 1.9275, "step": 1989 }, { - "epoch": 0.6, - "grad_norm": 26.071443557739258, - "learning_rate": 1.6012829507868097e-05, - "loss": 2.4725, + "epoch": 0.25, + "grad_norm": 14.01952838897705, + "learning_rate": 1.8335773752248674e-05, + "loss": 2.6755, "step": 1990 }, { - "epoch": 0.6, - "grad_norm": 14.587154388427734, - "learning_rate": 1.6010824897263707e-05, - "loss": 2.7513, + "epoch": 0.25, + "grad_norm": 10.879013061523438, + "learning_rate": 1.8334937037191988e-05, + "loss": 2.146, "step": 1991 }, { - "epoch": 0.6, - "grad_norm": 19.951908111572266, - "learning_rate": 1.6008820286659317e-05, - "loss": 2.3058, + "epoch": 0.25, + "grad_norm": 14.858782768249512, + "learning_rate": 1.8334100322135298e-05, + "loss": 1.8593, "step": 1992 }, { - "epoch": 0.6, - "grad_norm": 18.910688400268555, - "learning_rate": 1.6006815676054927e-05, - "loss": 2.4444, + "epoch": 0.25, + "grad_norm": 9.486717224121094, + "learning_rate": 1.833326360707861e-05, + "loss": 1.4072, "step": 1993 }, { - "epoch": 0.6, - "grad_norm": 12.591381072998047, - "learning_rate": 1.6004811065450537e-05, - "loss": 3.0018, + "epoch": 0.25, + "grad_norm": 15.695525169372559, + "learning_rate": 1.8332426892021925e-05, + "loss": 1.6865, "step": 1994 }, { - "epoch": 0.6, - "grad_norm": 25.672998428344727, - "learning_rate": 1.6002806454846147e-05, - "loss": 2.6811, + "epoch": 0.25, + "grad_norm": 10.566913604736328, + "learning_rate": 1.8331590176965235e-05, + "loss": 4.4794, "step": 1995 }, { - "epoch": 0.6, - "grad_norm": 24.982601165771484, - "learning_rate": 1.6000801844241757e-05, - "loss": 3.1955, + "epoch": 0.25, + "grad_norm": 11.069722175598145, + "learning_rate": 1.833075346190855e-05, + "loss": 2.5513, "step": 1996 }, { - "epoch": 0.6, - "grad_norm": 12.664204597473145, - "learning_rate": 1.5998797233637367e-05, - "loss": 2.2998, + "epoch": 0.25, + "grad_norm": 7.837258338928223, + "learning_rate": 1.8329916746851863e-05, + "loss": 2.3469, "step": 1997 }, { - "epoch": 0.6, - "grad_norm": 10.164942741394043, - "learning_rate": 1.5996792623032977e-05, - "loss": 3.4713, + "epoch": 0.25, + "grad_norm": 11.02144718170166, + "learning_rate": 1.8329080031795173e-05, + "loss": 2.1457, "step": 1998 }, { - "epoch": 0.6, - "grad_norm": 11.533797264099121, - "learning_rate": 1.5994788012428588e-05, - "loss": 2.7492, + "epoch": 0.25, + "grad_norm": 12.016775131225586, + "learning_rate": 1.8328243316738487e-05, + "loss": 2.4249, "step": 1999 }, { - "epoch": 0.6, - "grad_norm": 31.470319747924805, - "learning_rate": 1.5992783401824198e-05, - "loss": 3.0072, + "epoch": 0.25, + "grad_norm": 13.773508071899414, + "learning_rate": 1.83274066016818e-05, + "loss": 3.2491, "step": 2000 }, { - "epoch": 0.6, - "grad_norm": 17.791658401489258, - "learning_rate": 1.5990778791219808e-05, - "loss": 1.9317, + "epoch": 0.25, + "eval_loss": 0.17568634450435638, + "eval_runtime": 93.8646, + "eval_samples_per_second": 37.735, + "eval_steps_per_second": 37.735, + "step": 2000 + }, + { + "epoch": 0.25, + "grad_norm": 28.11022186279297, + "learning_rate": 1.832656988662511e-05, + "loss": 3.2889, "step": 2001 }, { - "epoch": 0.6, - "grad_norm": 31.150108337402344, - "learning_rate": 1.5988774180615418e-05, - "loss": 2.8161, + "epoch": 0.25, + "grad_norm": 11.31784725189209, + "learning_rate": 1.8325733171568424e-05, + "loss": 2.3865, "step": 2002 }, { - "epoch": 0.6, - "grad_norm": 13.658658027648926, - "learning_rate": 1.5986769570011028e-05, - "loss": 1.9486, + "epoch": 0.25, + "grad_norm": 5.304311752319336, + "learning_rate": 1.8324896456511734e-05, + "loss": 0.5511, "step": 2003 }, { - "epoch": 0.6, - "grad_norm": 13.074492454528809, - "learning_rate": 1.5984764959406635e-05, - "loss": 1.8306, + "epoch": 0.25, + "grad_norm": 19.37568473815918, + "learning_rate": 1.8324059741455048e-05, + "loss": 2.0012, "step": 2004 }, { - "epoch": 0.6, - "grad_norm": 13.934167861938477, - "learning_rate": 1.5982760348802248e-05, - "loss": 2.0469, + "epoch": 0.25, + "grad_norm": 8.72088623046875, + "learning_rate": 1.8323223026398362e-05, + "loss": 1.6433, "step": 2005 }, { - "epoch": 0.6, - "grad_norm": 19.25830078125, - "learning_rate": 1.5980755738197858e-05, - "loss": 3.0445, + "epoch": 0.25, + "grad_norm": 21.67353630065918, + "learning_rate": 1.8322386311341672e-05, + "loss": 2.4568, "step": 2006 }, { - "epoch": 0.6, - "grad_norm": 12.957606315612793, - "learning_rate": 1.5978751127593465e-05, - "loss": 2.2492, + "epoch": 0.25, + "grad_norm": 16.720203399658203, + "learning_rate": 1.8321549596284986e-05, + "loss": 2.1404, "step": 2007 }, { - "epoch": 0.6, - "grad_norm": 18.43289566040039, - "learning_rate": 1.5976746516989078e-05, - "loss": 3.2304, + "epoch": 0.25, + "grad_norm": 13.917689323425293, + "learning_rate": 1.83207128812283e-05, + "loss": 3.6468, "step": 2008 }, { - "epoch": 0.6, - "grad_norm": 15.042205810546875, - "learning_rate": 1.5974741906384685e-05, - "loss": 2.5782, + "epoch": 0.25, + "grad_norm": 13.364958763122559, + "learning_rate": 1.831987616617161e-05, + "loss": 2.2146, "step": 2009 }, { - "epoch": 0.6, - "grad_norm": 32.639183044433594, - "learning_rate": 1.5972737295780295e-05, - "loss": 2.2884, + "epoch": 0.25, + "grad_norm": 16.78025245666504, + "learning_rate": 1.8319039451114923e-05, + "loss": 3.4028, "step": 2010 }, { - "epoch": 0.6, - "grad_norm": 14.541574478149414, - "learning_rate": 1.5970732685175905e-05, - "loss": 2.1057, + "epoch": 0.25, + "grad_norm": 14.87304401397705, + "learning_rate": 1.8318202736058237e-05, + "loss": 2.1005, "step": 2011 }, { - "epoch": 0.6, - "grad_norm": 33.099979400634766, - "learning_rate": 1.5968728074571515e-05, - "loss": 2.8465, + "epoch": 0.25, + "grad_norm": 21.702777862548828, + "learning_rate": 1.831736602100155e-05, + "loss": 2.0948, "step": 2012 }, { - "epoch": 0.61, - "grad_norm": 29.527706146240234, - "learning_rate": 1.5966723463967125e-05, - "loss": 1.8221, + "epoch": 0.25, + "grad_norm": 17.03411293029785, + "learning_rate": 1.831652930594486e-05, + "loss": 3.4917, "step": 2013 }, { - "epoch": 0.61, - "grad_norm": 12.222841262817383, - "learning_rate": 1.5964718853362735e-05, - "loss": 1.9562, + "epoch": 0.25, + "grad_norm": 122.54450225830078, + "learning_rate": 1.8315692590888174e-05, + "loss": 1.5136, "step": 2014 }, { - "epoch": 0.61, - "grad_norm": 11.430624961853027, - "learning_rate": 1.5962714242758345e-05, - "loss": 1.8148, + "epoch": 0.25, + "grad_norm": 8.881357192993164, + "learning_rate": 1.8314855875831488e-05, + "loss": 1.9177, "step": 2015 }, { - "epoch": 0.61, - "grad_norm": 12.992792129516602, - "learning_rate": 1.5960709632153956e-05, - "loss": 2.0858, + "epoch": 0.25, + "grad_norm": 12.461234092712402, + "learning_rate": 1.83140191607748e-05, + "loss": 2.3942, "step": 2016 }, { - "epoch": 0.61, - "grad_norm": 25.652822494506836, - "learning_rate": 1.5958705021549566e-05, - "loss": 3.0605, + "epoch": 0.25, + "grad_norm": 12.239439964294434, + "learning_rate": 1.8313182445718112e-05, + "loss": 3.2475, "step": 2017 }, { - "epoch": 0.61, - "grad_norm": 9.427448272705078, - "learning_rate": 1.5956700410945176e-05, - "loss": 2.0717, + "epoch": 0.25, + "grad_norm": 15.35310173034668, + "learning_rate": 1.8312345730661426e-05, + "loss": 2.0139, "step": 2018 }, { - "epoch": 0.61, - "grad_norm": 18.489501953125, - "learning_rate": 1.5954695800340786e-05, - "loss": 2.3319, + "epoch": 0.25, + "grad_norm": 11.228468894958496, + "learning_rate": 1.831150901560474e-05, + "loss": 2.1745, "step": 2019 }, { - "epoch": 0.61, - "grad_norm": 14.960678100585938, - "learning_rate": 1.5952691189736396e-05, - "loss": 2.3096, + "epoch": 0.25, + "grad_norm": 9.156081199645996, + "learning_rate": 1.831067230054805e-05, + "loss": 1.7031, "step": 2020 }, { - "epoch": 0.61, - "grad_norm": 17.850452423095703, - "learning_rate": 1.5950686579132003e-05, - "loss": 2.5948, + "epoch": 0.25, + "grad_norm": 16.42210578918457, + "learning_rate": 1.8309835585491363e-05, + "loss": 3.2483, "step": 2021 }, { - "epoch": 0.61, - "grad_norm": 13.320516586303711, - "learning_rate": 1.5948681968527616e-05, - "loss": 2.0685, + "epoch": 0.25, + "grad_norm": 12.818464279174805, + "learning_rate": 1.8308998870434677e-05, + "loss": 1.0547, "step": 2022 }, { - "epoch": 0.61, - "grad_norm": 12.267958641052246, - "learning_rate": 1.5946677357923223e-05, - "loss": 2.1863, + "epoch": 0.25, + "grad_norm": 9.632564544677734, + "learning_rate": 1.8308162155377987e-05, + "loss": 1.0357, "step": 2023 }, { - "epoch": 0.61, - "grad_norm": 17.197010040283203, - "learning_rate": 1.5944672747318833e-05, - "loss": 2.5286, + "epoch": 0.25, + "grad_norm": 14.38925838470459, + "learning_rate": 1.83073254403213e-05, + "loss": 3.4575, "step": 2024 }, { - "epoch": 0.61, - "grad_norm": 16.6329288482666, - "learning_rate": 1.5942668136714446e-05, - "loss": 2.0742, + "epoch": 0.25, + "grad_norm": 12.298242568969727, + "learning_rate": 1.8306488725264614e-05, + "loss": 1.9957, "step": 2025 }, { - "epoch": 0.61, - "grad_norm": 36.75046157836914, - "learning_rate": 1.5940663526110053e-05, - "loss": 2.7239, + "epoch": 0.25, + "grad_norm": 15.588695526123047, + "learning_rate": 1.8305652010207925e-05, + "loss": 2.809, "step": 2026 }, { - "epoch": 0.61, - "grad_norm": 14.18947982788086, - "learning_rate": 1.5938658915505663e-05, - "loss": 2.6615, + "epoch": 0.25, + "grad_norm": 12.689305305480957, + "learning_rate": 1.830481529515124e-05, + "loss": 1.6972, "step": 2027 }, { - "epoch": 0.61, - "grad_norm": 13.752023696899414, - "learning_rate": 1.5936654304901273e-05, - "loss": 1.8863, + "epoch": 0.25, + "grad_norm": 16.787702560424805, + "learning_rate": 1.830397858009455e-05, + "loss": 3.0474, "step": 2028 }, { - "epoch": 0.61, - "grad_norm": 17.094932556152344, - "learning_rate": 1.5934649694296883e-05, - "loss": 3.2093, + "epoch": 0.25, + "grad_norm": 10.33752155303955, + "learning_rate": 1.8303141865037862e-05, + "loss": 2.0853, "step": 2029 }, { - "epoch": 0.61, - "grad_norm": 16.756057739257812, - "learning_rate": 1.5932645083692497e-05, - "loss": 2.6765, + "epoch": 0.25, + "grad_norm": 12.861720085144043, + "learning_rate": 1.8302305149981176e-05, + "loss": 2.6356, "step": 2030 }, { - "epoch": 0.61, - "grad_norm": 21.48285484313965, - "learning_rate": 1.5930640473088103e-05, - "loss": 2.0404, + "epoch": 0.25, + "grad_norm": 6.6646809577941895, + "learning_rate": 1.8301468434924486e-05, + "loss": 3.4687, "step": 2031 }, { - "epoch": 0.61, - "grad_norm": 31.83589744567871, - "learning_rate": 1.5928635862483714e-05, - "loss": 2.708, + "epoch": 0.26, + "grad_norm": 14.956864356994629, + "learning_rate": 1.83006317198678e-05, + "loss": 2.253, "step": 2032 }, { - "epoch": 0.61, - "grad_norm": 50.02532958984375, - "learning_rate": 1.5926631251879324e-05, - "loss": 2.5092, + "epoch": 0.26, + "grad_norm": 14.440831184387207, + "learning_rate": 1.8299795004811113e-05, + "loss": 2.7597, "step": 2033 }, { - "epoch": 0.61, - "grad_norm": 11.736242294311523, - "learning_rate": 1.5924626641274934e-05, - "loss": 1.6545, + "epoch": 0.26, + "grad_norm": 23.195627212524414, + "learning_rate": 1.8298958289754424e-05, + "loss": 3.7566, "step": 2034 }, { - "epoch": 0.61, - "grad_norm": 22.306257247924805, - "learning_rate": 1.5922622030670544e-05, - "loss": 2.438, + "epoch": 0.26, + "grad_norm": 18.944862365722656, + "learning_rate": 1.8298121574697737e-05, + "loss": 1.6732, "step": 2035 }, { - "epoch": 0.61, - "grad_norm": 19.088542938232422, - "learning_rate": 1.5920617420066154e-05, - "loss": 2.5573, + "epoch": 0.26, + "grad_norm": 14.188807487487793, + "learning_rate": 1.829728485964105e-05, + "loss": 0.6534, "step": 2036 }, { - "epoch": 0.61, - "grad_norm": 43.53116989135742, - "learning_rate": 1.5918612809461764e-05, - "loss": 2.4721, + "epoch": 0.26, + "grad_norm": 14.124086380004883, + "learning_rate": 1.829644814458436e-05, + "loss": 4.062, "step": 2037 }, { - "epoch": 0.61, - "grad_norm": 16.74660873413086, - "learning_rate": 1.5916608198857374e-05, - "loss": 2.1669, + "epoch": 0.26, + "grad_norm": 11.936144828796387, + "learning_rate": 1.8295611429527675e-05, + "loss": 2.4579, "step": 2038 }, { - "epoch": 0.61, - "grad_norm": 18.83763885498047, - "learning_rate": 1.5914603588252984e-05, - "loss": 2.0639, + "epoch": 0.26, + "grad_norm": 17.59867286682129, + "learning_rate": 1.829477471447099e-05, + "loss": 4.4768, "step": 2039 }, { - "epoch": 0.61, - "grad_norm": 12.473159790039062, - "learning_rate": 1.591259897764859e-05, - "loss": 2.0599, - "step": 2040 - }, - { - "epoch": 0.61, - "eval_loss": 0.35515907406806946, - "eval_runtime": 43.2723, - "eval_samples_per_second": 34.179, - "eval_steps_per_second": 34.179, + "epoch": 0.26, + "grad_norm": 14.712457656860352, + "learning_rate": 1.8293937999414302e-05, + "loss": 3.6839, "step": 2040 }, { - "epoch": 0.61, - "grad_norm": 34.54806900024414, - "learning_rate": 1.5910594367044204e-05, - "loss": 2.8225, + "epoch": 0.26, + "grad_norm": 12.759182929992676, + "learning_rate": 1.8293101284357612e-05, + "loss": 1.928, "step": 2041 }, { - "epoch": 0.61, - "grad_norm": 11.377670288085938, - "learning_rate": 1.5908589756439814e-05, - "loss": 1.3387, + "epoch": 0.26, + "grad_norm": 10.04848861694336, + "learning_rate": 1.8292264569300926e-05, + "loss": 0.9844, "step": 2042 }, { - "epoch": 0.61, - "grad_norm": 21.465269088745117, - "learning_rate": 1.590658514583542e-05, - "loss": 3.1448, + "epoch": 0.26, + "grad_norm": 11.40523910522461, + "learning_rate": 1.829142785424424e-05, + "loss": 1.5692, "step": 2043 }, { - "epoch": 0.61, - "grad_norm": 13.400733947753906, - "learning_rate": 1.5904580535231035e-05, - "loss": 1.9627, + "epoch": 0.26, + "grad_norm": 13.77685832977295, + "learning_rate": 1.829059113918755e-05, + "loss": 2.7652, "step": 2044 }, { - "epoch": 0.61, - "grad_norm": 8.886704444885254, - "learning_rate": 1.590257592462664e-05, - "loss": 1.8101, + "epoch": 0.26, + "grad_norm": 7.332242012023926, + "learning_rate": 1.8289754424130864e-05, + "loss": 1.2132, "step": 2045 }, { - "epoch": 0.62, - "grad_norm": 17.08548355102539, - "learning_rate": 1.590057131402225e-05, - "loss": 2.1456, + "epoch": 0.26, + "grad_norm": 8.43189525604248, + "learning_rate": 1.8288917709074177e-05, + "loss": 0.9907, "step": 2046 }, { - "epoch": 0.62, - "grad_norm": 16.1516170501709, - "learning_rate": 1.589856670341786e-05, - "loss": 2.642, + "epoch": 0.26, + "grad_norm": 17.869178771972656, + "learning_rate": 1.828808099401749e-05, + "loss": 2.8893, "step": 2047 }, { - "epoch": 0.62, - "grad_norm": 16.168611526489258, - "learning_rate": 1.589656209281347e-05, - "loss": 2.2961, + "epoch": 0.26, + "grad_norm": 12.180258750915527, + "learning_rate": 1.82872442789608e-05, + "loss": 3.3625, "step": 2048 }, { - "epoch": 0.62, - "grad_norm": 17.722412109375, - "learning_rate": 1.589455748220908e-05, - "loss": 2.2299, + "epoch": 0.26, + "grad_norm": 16.094064712524414, + "learning_rate": 1.8286407563904115e-05, + "loss": 1.7809, "step": 2049 }, { - "epoch": 0.62, - "grad_norm": 17.987937927246094, - "learning_rate": 1.589255287160469e-05, - "loss": 2.1003, + "epoch": 0.26, + "grad_norm": 13.385570526123047, + "learning_rate": 1.828557084884743e-05, + "loss": 3.2022, "step": 2050 }, { - "epoch": 0.62, - "grad_norm": 13.64321517944336, - "learning_rate": 1.5890548261000302e-05, - "loss": 2.5651, + "epoch": 0.26, + "grad_norm": 11.876225471496582, + "learning_rate": 1.828473413379074e-05, + "loss": 1.3704, "step": 2051 }, { - "epoch": 0.62, - "grad_norm": 10.08339786529541, - "learning_rate": 1.5888543650395912e-05, - "loss": 2.679, + "epoch": 0.26, + "grad_norm": 15.90719985961914, + "learning_rate": 1.8283897418734052e-05, + "loss": 1.4698, "step": 2052 }, { - "epoch": 0.62, - "grad_norm": 19.540874481201172, - "learning_rate": 1.5886539039791522e-05, - "loss": 1.9754, + "epoch": 0.26, + "grad_norm": 22.79240608215332, + "learning_rate": 1.8283060703677366e-05, + "loss": 2.6591, "step": 2053 }, { - "epoch": 0.62, - "grad_norm": 11.823172569274902, - "learning_rate": 1.5884534429187132e-05, - "loss": 1.5079, + "epoch": 0.26, + "grad_norm": 13.034547805786133, + "learning_rate": 1.8282223988620676e-05, + "loss": 1.9659, "step": 2054 }, { - "epoch": 0.62, - "grad_norm": 14.114450454711914, - "learning_rate": 1.5882529818582742e-05, - "loss": 2.3742, + "epoch": 0.26, + "grad_norm": 17.19205093383789, + "learning_rate": 1.828138727356399e-05, + "loss": 3.2907, "step": 2055 }, { - "epoch": 0.62, - "grad_norm": 9.858566284179688, - "learning_rate": 1.5880525207978352e-05, - "loss": 3.4989, + "epoch": 0.26, + "grad_norm": 13.555797576904297, + "learning_rate": 1.82805505585073e-05, + "loss": 2.1088, "step": 2056 }, { - "epoch": 0.62, - "grad_norm": 14.25406551361084, - "learning_rate": 1.5878520597373962e-05, - "loss": 3.3511, + "epoch": 0.26, + "grad_norm": 13.456886291503906, + "learning_rate": 1.8279713843450614e-05, + "loss": 2.3089, "step": 2057 }, { - "epoch": 0.62, - "grad_norm": 16.247241973876953, - "learning_rate": 1.5876515986769572e-05, - "loss": 2.1215, + "epoch": 0.26, + "grad_norm": 15.142951965332031, + "learning_rate": 1.8278877128393928e-05, + "loss": 2.9177, "step": 2058 }, { - "epoch": 0.62, - "grad_norm": 13.218579292297363, - "learning_rate": 1.587451137616518e-05, - "loss": 2.4852, + "epoch": 0.26, + "grad_norm": 12.207382202148438, + "learning_rate": 1.8278040413337238e-05, + "loss": 3.182, "step": 2059 }, { - "epoch": 0.62, - "grad_norm": 13.075812339782715, - "learning_rate": 1.5872506765560793e-05, - "loss": 2.3496, + "epoch": 0.26, + "grad_norm": 5.149965286254883, + "learning_rate": 1.827720369828055e-05, + "loss": 0.2904, "step": 2060 }, { - "epoch": 0.62, - "grad_norm": 26.715604782104492, - "learning_rate": 1.5870502154956403e-05, - "loss": 2.1378, + "epoch": 0.26, + "grad_norm": 19.729907989501953, + "learning_rate": 1.8276366983223865e-05, + "loss": 3.0673, "step": 2061 }, { - "epoch": 0.62, - "grad_norm": 20.732059478759766, - "learning_rate": 1.586849754435201e-05, - "loss": 2.008, + "epoch": 0.26, + "grad_norm": 10.116591453552246, + "learning_rate": 1.8275530268167175e-05, + "loss": 1.1507, "step": 2062 }, { - "epoch": 0.62, - "grad_norm": 18.93284797668457, - "learning_rate": 1.5866492933747623e-05, - "loss": 3.4783, + "epoch": 0.26, + "grad_norm": 10.737142562866211, + "learning_rate": 1.827469355311049e-05, + "loss": 2.4385, "step": 2063 }, { - "epoch": 0.62, - "grad_norm": 9.962442398071289, - "learning_rate": 1.586448832314323e-05, - "loss": 2.3671, + "epoch": 0.26, + "grad_norm": 9.6014986038208, + "learning_rate": 1.8273856838053803e-05, + "loss": 2.2556, "step": 2064 }, { - "epoch": 0.62, - "grad_norm": 9.932574272155762, - "learning_rate": 1.586248371253884e-05, - "loss": 1.7524, + "epoch": 0.26, + "grad_norm": 16.13591957092285, + "learning_rate": 1.8273020122997113e-05, + "loss": 2.2409, "step": 2065 }, { - "epoch": 0.62, - "grad_norm": 16.17888069152832, - "learning_rate": 1.586047910193445e-05, - "loss": 1.6364, + "epoch": 0.26, + "grad_norm": 17.769683837890625, + "learning_rate": 1.8272183407940427e-05, + "loss": 3.6554, "step": 2066 }, { - "epoch": 0.62, - "grad_norm": 24.62932014465332, - "learning_rate": 1.585847449133006e-05, - "loss": 1.8643, + "epoch": 0.26, + "grad_norm": 21.505428314208984, + "learning_rate": 1.827134669288374e-05, + "loss": 3.0136, "step": 2067 }, { - "epoch": 0.62, - "grad_norm": 17.033721923828125, - "learning_rate": 1.585646988072567e-05, - "loss": 2.2953, + "epoch": 0.26, + "grad_norm": 10.269015312194824, + "learning_rate": 1.8270509977827054e-05, + "loss": 1.3763, "step": 2068 }, { - "epoch": 0.62, - "grad_norm": 24.557415008544922, - "learning_rate": 1.585446527012128e-05, - "loss": 3.0784, + "epoch": 0.26, + "grad_norm": 8.491415977478027, + "learning_rate": 1.8269673262770364e-05, + "loss": 1.7759, "step": 2069 }, { - "epoch": 0.62, - "grad_norm": 27.415164947509766, - "learning_rate": 1.585246065951689e-05, - "loss": 2.3325, + "epoch": 0.26, + "grad_norm": 21.678611755371094, + "learning_rate": 1.8268836547713678e-05, + "loss": 3.0247, "step": 2070 }, { - "epoch": 0.62, - "grad_norm": 18.90687370300293, - "learning_rate": 1.58504560489125e-05, - "loss": 2.4899, + "epoch": 0.26, + "grad_norm": 9.085183143615723, + "learning_rate": 1.826799983265699e-05, + "loss": 1.5463, "step": 2071 }, { - "epoch": 0.62, - "grad_norm": 12.064643859863281, - "learning_rate": 1.584845143830811e-05, - "loss": 2.6493, + "epoch": 0.26, + "grad_norm": 5.598382472991943, + "learning_rate": 1.8267163117600302e-05, + "loss": 0.2033, "step": 2072 }, { - "epoch": 0.62, - "grad_norm": 15.442275047302246, - "learning_rate": 1.584644682770372e-05, - "loss": 1.9658, + "epoch": 0.26, + "grad_norm": 12.32597827911377, + "learning_rate": 1.8266326402543615e-05, + "loss": 2.0717, "step": 2073 }, { - "epoch": 0.62, - "grad_norm": 14.710504531860352, - "learning_rate": 1.584444221709933e-05, - "loss": 2.2267, + "epoch": 0.26, + "grad_norm": 64.5411148071289, + "learning_rate": 1.826548968748693e-05, + "loss": 2.2075, "step": 2074 }, { - "epoch": 0.62, - "grad_norm": 9.406432151794434, - "learning_rate": 1.584243760649494e-05, - "loss": 1.4927, + "epoch": 0.26, + "grad_norm": 15.841133117675781, + "learning_rate": 1.8264652972430243e-05, + "loss": 1.7682, "step": 2075 }, { - "epoch": 0.62, - "grad_norm": 14.804086685180664, - "learning_rate": 1.584043299589055e-05, - "loss": 1.8073, + "epoch": 0.26, + "grad_norm": 13.668527603149414, + "learning_rate": 1.8263816257373553e-05, + "loss": 1.8377, "step": 2076 }, { - "epoch": 0.62, - "grad_norm": 15.176630973815918, - "learning_rate": 1.583842838528616e-05, - "loss": 1.8248, + "epoch": 0.26, + "grad_norm": 22.06974220275879, + "learning_rate": 1.8262979542316867e-05, + "loss": 3.6966, "step": 2077 }, { - "epoch": 0.62, - "grad_norm": 20.446643829345703, - "learning_rate": 1.5836423774681767e-05, - "loss": 2.3386, + "epoch": 0.26, + "grad_norm": 11.24099063873291, + "learning_rate": 1.826214282726018e-05, + "loss": 3.6092, "step": 2078 }, { - "epoch": 0.63, - "grad_norm": 20.13260269165039, - "learning_rate": 1.583441916407738e-05, - "loss": 2.7811, + "epoch": 0.26, + "grad_norm": 25.48215675354004, + "learning_rate": 1.826130611220349e-05, + "loss": 3.2326, "step": 2079 }, { - "epoch": 0.63, - "grad_norm": 18.64963150024414, - "learning_rate": 1.583241455347299e-05, - "loss": 2.9017, + "epoch": 0.26, + "grad_norm": 16.542388916015625, + "learning_rate": 1.8260469397146804e-05, + "loss": 1.4559, "step": 2080 }, { - "epoch": 0.63, - "grad_norm": 17.931795120239258, - "learning_rate": 1.5830409942868597e-05, - "loss": 2.3632, + "epoch": 0.26, + "grad_norm": 10.460676193237305, + "learning_rate": 1.8259632682090114e-05, + "loss": 2.46, "step": 2081 }, { - "epoch": 0.63, - "grad_norm": 19.928274154663086, - "learning_rate": 1.582840533226421e-05, - "loss": 1.7388, + "epoch": 0.26, + "grad_norm": 51.06991958618164, + "learning_rate": 1.8258795967033428e-05, + "loss": 3.0062, "step": 2082 }, { - "epoch": 0.63, - "grad_norm": 11.930707931518555, - "learning_rate": 1.5826400721659818e-05, - "loss": 1.5894, + "epoch": 0.26, + "grad_norm": 14.114103317260742, + "learning_rate": 1.825795925197674e-05, + "loss": 1.4743, "step": 2083 }, { - "epoch": 0.63, - "grad_norm": 24.95637321472168, - "learning_rate": 1.5824396111055428e-05, - "loss": 3.1926, + "epoch": 0.26, + "grad_norm": 31.35147476196289, + "learning_rate": 1.8257122536920052e-05, + "loss": 4.4442, "step": 2084 }, { - "epoch": 0.63, - "grad_norm": 14.697010040283203, - "learning_rate": 1.582239150045104e-05, - "loss": 3.0097, + "epoch": 0.26, + "grad_norm": 9.869153022766113, + "learning_rate": 1.8256285821863366e-05, + "loss": 2.6016, "step": 2085 }, { - "epoch": 0.63, - "grad_norm": 9.35490894317627, - "learning_rate": 1.5820386889846648e-05, - "loss": 1.5664, + "epoch": 0.26, + "grad_norm": 17.398767471313477, + "learning_rate": 1.8255449106806676e-05, + "loss": 1.9688, "step": 2086 }, { - "epoch": 0.63, - "grad_norm": 21.47760581970215, - "learning_rate": 1.5818382279242258e-05, - "loss": 2.8386, + "epoch": 0.26, + "grad_norm": 8.782936096191406, + "learning_rate": 1.825461239174999e-05, + "loss": 0.8068, "step": 2087 }, { - "epoch": 0.63, - "grad_norm": 16.634397506713867, - "learning_rate": 1.5816377668637868e-05, - "loss": 2.8702, + "epoch": 0.26, + "grad_norm": 10.660070419311523, + "learning_rate": 1.8253775676693303e-05, + "loss": 0.635, "step": 2088 }, { - "epoch": 0.63, - "grad_norm": 12.756913185119629, - "learning_rate": 1.5814373058033478e-05, - "loss": 2.456, + "epoch": 0.26, + "grad_norm": 16.110692977905273, + "learning_rate": 1.8252938961636617e-05, + "loss": 2.1616, "step": 2089 }, { - "epoch": 0.63, - "grad_norm": 13.137290000915527, - "learning_rate": 1.5812368447429088e-05, - "loss": 1.8993, + "epoch": 0.26, + "grad_norm": 9.807194709777832, + "learning_rate": 1.8252102246579927e-05, + "loss": 1.3718, "step": 2090 }, { - "epoch": 0.63, - "grad_norm": 23.007200241088867, - "learning_rate": 1.58103638368247e-05, - "loss": 2.7142, + "epoch": 0.26, + "grad_norm": 11.816697120666504, + "learning_rate": 1.825126553152324e-05, + "loss": 3.6432, "step": 2091 }, { - "epoch": 0.63, - "grad_norm": 10.685704231262207, - "learning_rate": 1.580835922622031e-05, - "loss": 1.872, + "epoch": 0.26, + "grad_norm": 14.993602752685547, + "learning_rate": 1.8250428816466554e-05, + "loss": 3.2196, "step": 2092 }, { - "epoch": 0.63, - "grad_norm": 34.11006164550781, - "learning_rate": 1.580635461561592e-05, - "loss": 2.8411, + "epoch": 0.26, + "grad_norm": 19.83965492248535, + "learning_rate": 1.8249592101409865e-05, + "loss": 1.3436, "step": 2093 }, { - "epoch": 0.63, - "grad_norm": 15.17668628692627, - "learning_rate": 1.580435000501153e-05, - "loss": 2.318, + "epoch": 0.26, + "grad_norm": 14.342920303344727, + "learning_rate": 1.8248755386353178e-05, + "loss": 1.0609, "step": 2094 }, { - "epoch": 0.63, - "grad_norm": 10.943110466003418, - "learning_rate": 1.580234539440714e-05, - "loss": 3.1694, + "epoch": 0.26, + "grad_norm": 43.393165588378906, + "learning_rate": 1.8247918671296492e-05, + "loss": 1.8596, "step": 2095 }, { - "epoch": 0.63, - "grad_norm": 17.957090377807617, - "learning_rate": 1.580034078380275e-05, - "loss": 2.8144, + "epoch": 0.26, + "grad_norm": 16.4021053314209, + "learning_rate": 1.8247081956239806e-05, + "loss": 4.3693, "step": 2096 }, { - "epoch": 0.63, - "grad_norm": 21.06376075744629, - "learning_rate": 1.5798336173198355e-05, - "loss": 2.4499, + "epoch": 0.26, + "grad_norm": 10.059845924377441, + "learning_rate": 1.8246245241183116e-05, + "loss": 1.4066, "step": 2097 }, { - "epoch": 0.63, - "grad_norm": 17.400442123413086, - "learning_rate": 1.579633156259397e-05, - "loss": 1.1497, + "epoch": 0.26, + "grad_norm": 16.443490982055664, + "learning_rate": 1.824540852612643e-05, + "loss": 1.9437, "step": 2098 }, { - "epoch": 0.63, - "grad_norm": 14.588909149169922, - "learning_rate": 1.579432695198958e-05, - "loss": 3.2946, + "epoch": 0.26, + "grad_norm": 10.47969913482666, + "learning_rate": 1.8244571811069743e-05, + "loss": 2.004, "step": 2099 }, { - "epoch": 0.63, - "grad_norm": 13.294282913208008, - "learning_rate": 1.5792322341385186e-05, - "loss": 1.6623, + "epoch": 0.26, + "grad_norm": 11.944048881530762, + "learning_rate": 1.8243735096013053e-05, + "loss": 3.0902, "step": 2100 }, { - "epoch": 0.63, - "grad_norm": 23.731582641601562, - "learning_rate": 1.57903177307808e-05, - "loss": 3.3506, + "epoch": 0.26, + "grad_norm": 12.932525634765625, + "learning_rate": 1.8242898380956367e-05, + "loss": 2.3718, "step": 2101 }, { - "epoch": 0.63, - "grad_norm": 15.64577579498291, - "learning_rate": 1.5788313120176406e-05, - "loss": 2.6303, + "epoch": 0.26, + "grad_norm": 11.039018630981445, + "learning_rate": 1.824206166589968e-05, + "loss": 4.8056, "step": 2102 }, { - "epoch": 0.63, - "grad_norm": 19.747045516967773, - "learning_rate": 1.5786308509572016e-05, - "loss": 2.5166, + "epoch": 0.26, + "grad_norm": 24.806896209716797, + "learning_rate": 1.8241224950842994e-05, + "loss": 2.4179, "step": 2103 }, { - "epoch": 0.63, - "grad_norm": 42.6046142578125, - "learning_rate": 1.578430389896763e-05, - "loss": 2.8453, + "epoch": 0.26, + "grad_norm": 13.5587158203125, + "learning_rate": 1.8240388235786305e-05, + "loss": 3.4312, "step": 2104 }, { - "epoch": 0.63, - "grad_norm": 8.715738296508789, - "learning_rate": 1.5782299288363236e-05, - "loss": 1.9525, + "epoch": 0.26, + "grad_norm": 9.747724533081055, + "learning_rate": 1.8239551520729618e-05, + "loss": 1.2014, "step": 2105 }, { - "epoch": 0.63, - "grad_norm": 31.00543975830078, - "learning_rate": 1.5780294677758846e-05, - "loss": 3.0852, + "epoch": 0.26, + "grad_norm": 8.556553840637207, + "learning_rate": 1.823871480567293e-05, + "loss": 2.2537, "step": 2106 }, { - "epoch": 0.63, - "grad_norm": 10.329021453857422, - "learning_rate": 1.5778290067154456e-05, - "loss": 2.1254, + "epoch": 0.26, + "grad_norm": 13.39060115814209, + "learning_rate": 1.8237878090616242e-05, + "loss": 2.0256, "step": 2107 }, { - "epoch": 0.63, - "grad_norm": 12.255505561828613, - "learning_rate": 1.5776285456550066e-05, - "loss": 2.1254, + "epoch": 0.26, + "grad_norm": 17.98282241821289, + "learning_rate": 1.8237041375559556e-05, + "loss": 3.5958, "step": 2108 }, { - "epoch": 0.63, - "grad_norm": 13.093355178833008, - "learning_rate": 1.5774280845945676e-05, - "loss": 2.1318, + "epoch": 0.26, + "grad_norm": 15.861956596374512, + "learning_rate": 1.8236204660502866e-05, + "loss": 3.77, "step": 2109 }, { - "epoch": 0.63, - "grad_norm": 34.728302001953125, - "learning_rate": 1.5772276235341287e-05, - "loss": 3.1983, + "epoch": 0.26, + "grad_norm": 19.806100845336914, + "learning_rate": 1.823536794544618e-05, + "loss": 1.8967, "step": 2110 }, { - "epoch": 0.63, - "grad_norm": 13.47143840789795, - "learning_rate": 1.5770271624736897e-05, - "loss": 2.0851, + "epoch": 0.26, + "grad_norm": 13.889492988586426, + "learning_rate": 1.8234531230389493e-05, + "loss": 1.456, "step": 2111 }, { - "epoch": 0.63, - "grad_norm": 13.41025161743164, - "learning_rate": 1.5768267014132507e-05, - "loss": 2.8133, + "epoch": 0.27, + "grad_norm": 19.10417366027832, + "learning_rate": 1.8233694515332804e-05, + "loss": 3.4118, "step": 2112 }, { - "epoch": 0.64, - "grad_norm": 15.229413986206055, - "learning_rate": 1.5766262403528117e-05, - "loss": 2.8769, + "epoch": 0.27, + "grad_norm": 9.383771896362305, + "learning_rate": 1.8232857800276117e-05, + "loss": 1.2884, "step": 2113 }, { - "epoch": 0.64, - "grad_norm": 29.792980194091797, - "learning_rate": 1.5764257792923724e-05, - "loss": 2.6786, + "epoch": 0.27, + "grad_norm": 12.032513618469238, + "learning_rate": 1.8232021085219428e-05, + "loss": 2.0497, "step": 2114 }, { - "epoch": 0.64, - "grad_norm": 12.121004104614258, - "learning_rate": 1.5762253182319337e-05, - "loss": 2.3366, + "epoch": 0.27, + "grad_norm": 15.174636840820312, + "learning_rate": 1.823118437016274e-05, + "loss": 3.8727, "step": 2115 }, { - "epoch": 0.64, - "grad_norm": 17.383033752441406, - "learning_rate": 1.5760248571714947e-05, - "loss": 2.2895, + "epoch": 0.27, + "grad_norm": 47.917633056640625, + "learning_rate": 1.8230347655106055e-05, + "loss": 2.8027, "step": 2116 }, { - "epoch": 0.64, - "grad_norm": 28.382801055908203, - "learning_rate": 1.5758243961110554e-05, - "loss": 2.5891, + "epoch": 0.27, + "grad_norm": 6.924861431121826, + "learning_rate": 1.822951094004937e-05, + "loss": 1.0783, "step": 2117 }, { - "epoch": 0.64, - "grad_norm": 14.136621475219727, - "learning_rate": 1.5756239350506167e-05, - "loss": 2.4559, + "epoch": 0.27, + "grad_norm": 11.44529914855957, + "learning_rate": 1.822867422499268e-05, + "loss": 2.6785, "step": 2118 }, { - "epoch": 0.64, - "grad_norm": 12.245827674865723, - "learning_rate": 1.5754234739901774e-05, - "loss": 1.3721, + "epoch": 0.27, + "grad_norm": 26.966922760009766, + "learning_rate": 1.8227837509935992e-05, + "loss": 3.2264, "step": 2119 }, { - "epoch": 0.64, - "grad_norm": 10.706925392150879, - "learning_rate": 1.5752230129297384e-05, - "loss": 2.2164, + "epoch": 0.27, + "grad_norm": 16.654945373535156, + "learning_rate": 1.8227000794879306e-05, + "loss": 3.6384, "step": 2120 }, { - "epoch": 0.64, - "grad_norm": 16.694616317749023, - "learning_rate": 1.5750225518692994e-05, - "loss": 3.428, + "epoch": 0.27, + "grad_norm": 12.091151237487793, + "learning_rate": 1.8226164079822616e-05, + "loss": 0.3314, "step": 2121 }, { - "epoch": 0.64, - "grad_norm": 33.90262985229492, - "learning_rate": 1.5748220908088604e-05, - "loss": 3.3125, + "epoch": 0.27, + "grad_norm": 22.83612632751465, + "learning_rate": 1.822532736476593e-05, + "loss": 2.3924, "step": 2122 }, { - "epoch": 0.64, - "grad_norm": 12.597078323364258, - "learning_rate": 1.5746216297484214e-05, - "loss": 1.9381, + "epoch": 0.27, + "grad_norm": 10.339336395263672, + "learning_rate": 1.8224490649709244e-05, + "loss": 1.4609, "step": 2123 }, { - "epoch": 0.64, - "grad_norm": 13.725189208984375, - "learning_rate": 1.5744211686879824e-05, - "loss": 1.9921, + "epoch": 0.27, + "grad_norm": 12.055213928222656, + "learning_rate": 1.8223653934652557e-05, + "loss": 2.463, "step": 2124 }, { - "epoch": 0.64, - "grad_norm": 13.38775634765625, - "learning_rate": 1.5742207076275434e-05, - "loss": 2.3561, + "epoch": 0.27, + "grad_norm": 16.07337188720703, + "learning_rate": 1.8222817219595867e-05, + "loss": 1.9958, "step": 2125 }, { - "epoch": 0.64, - "grad_norm": 15.606164932250977, - "learning_rate": 1.5740202465671045e-05, - "loss": 3.1076, + "epoch": 0.27, + "grad_norm": 8.979070663452148, + "learning_rate": 1.822198050453918e-05, + "loss": 2.3272, "step": 2126 }, { - "epoch": 0.64, - "grad_norm": 17.07231330871582, - "learning_rate": 1.5738197855066655e-05, - "loss": 2.0324, + "epoch": 0.27, + "grad_norm": 8.645263671875, + "learning_rate": 1.8221143789482495e-05, + "loss": 2.1392, "step": 2127 }, { - "epoch": 0.64, - "grad_norm": 18.28791618347168, - "learning_rate": 1.5736193244462265e-05, - "loss": 2.8287, + "epoch": 0.27, + "grad_norm": 16.874176025390625, + "learning_rate": 1.8220307074425805e-05, + "loss": 2.6234, "step": 2128 }, { - "epoch": 0.64, - "grad_norm": 20.407081604003906, - "learning_rate": 1.5734188633857875e-05, - "loss": 2.3472, + "epoch": 0.27, + "grad_norm": 12.335731506347656, + "learning_rate": 1.821947035936912e-05, + "loss": 2.9914, "step": 2129 }, { - "epoch": 0.64, - "grad_norm": 12.980356216430664, - "learning_rate": 1.5732184023253485e-05, - "loss": 3.1184, + "epoch": 0.27, + "grad_norm": 21.420886993408203, + "learning_rate": 1.8218633644312432e-05, + "loss": 2.7485, "step": 2130 }, { - "epoch": 0.64, - "grad_norm": 19.251155853271484, - "learning_rate": 1.5730179412649095e-05, - "loss": 2.2474, + "epoch": 0.27, + "grad_norm": 13.961335182189941, + "learning_rate": 1.8217796929255746e-05, + "loss": 1.9668, "step": 2131 }, { - "epoch": 0.64, - "grad_norm": 8.661418914794922, - "learning_rate": 1.5728174802044705e-05, - "loss": 1.4024, + "epoch": 0.27, + "grad_norm": 13.38197135925293, + "learning_rate": 1.8216960214199056e-05, + "loss": 2.7635, "step": 2132 }, { - "epoch": 0.64, - "grad_norm": 24.52813148498535, - "learning_rate": 1.5726170191440312e-05, - "loss": 2.4654, + "epoch": 0.27, + "grad_norm": 9.04694652557373, + "learning_rate": 1.821612349914237e-05, + "loss": 1.0293, "step": 2133 }, { - "epoch": 0.64, - "grad_norm": 12.824371337890625, - "learning_rate": 1.5724165580835925e-05, - "loss": 1.5831, + "epoch": 0.27, + "grad_norm": 25.329591751098633, + "learning_rate": 1.821528678408568e-05, + "loss": 2.931, "step": 2134 }, { - "epoch": 0.64, - "grad_norm": 11.373884201049805, - "learning_rate": 1.5722160970231535e-05, - "loss": 2.043, + "epoch": 0.27, + "grad_norm": 9.510565757751465, + "learning_rate": 1.8214450069028994e-05, + "loss": 3.2181, "step": 2135 }, { - "epoch": 0.64, - "grad_norm": 24.369014739990234, - "learning_rate": 1.5720156359627142e-05, - "loss": 2.4182, + "epoch": 0.27, + "grad_norm": 17.192440032958984, + "learning_rate": 1.8213613353972307e-05, + "loss": 1.9588, "step": 2136 }, { - "epoch": 0.64, - "grad_norm": 12.161373138427734, - "learning_rate": 1.5718151749022755e-05, - "loss": 1.8717, + "epoch": 0.27, + "grad_norm": 8.89498519897461, + "learning_rate": 1.8212776638915618e-05, + "loss": 0.9073, "step": 2137 }, { - "epoch": 0.64, - "grad_norm": 22.5483341217041, - "learning_rate": 1.5716147138418362e-05, - "loss": 1.8705, + "epoch": 0.27, + "grad_norm": 8.90566349029541, + "learning_rate": 1.821193992385893e-05, + "loss": 2.5312, "step": 2138 }, { - "epoch": 0.64, - "grad_norm": 16.376750946044922, - "learning_rate": 1.5714142527813972e-05, - "loss": 2.359, + "epoch": 0.27, + "grad_norm": 7.776777267456055, + "learning_rate": 1.821110320880224e-05, + "loss": 1.8593, "step": 2139 }, { - "epoch": 0.64, - "grad_norm": 12.507425308227539, - "learning_rate": 1.5712137917209582e-05, - "loss": 1.8627, + "epoch": 0.27, + "grad_norm": 10.225785255432129, + "learning_rate": 1.8210266493745555e-05, + "loss": 3.0292, "step": 2140 }, { - "epoch": 0.64, - "grad_norm": 12.094954490661621, - "learning_rate": 1.5710133306605192e-05, - "loss": 1.5144, + "epoch": 0.27, + "grad_norm": 8.556358337402344, + "learning_rate": 1.820942977868887e-05, + "loss": 2.0255, "step": 2141 }, { - "epoch": 0.64, - "grad_norm": 12.721661567687988, - "learning_rate": 1.5708128696000802e-05, - "loss": 2.4135, + "epoch": 0.27, + "grad_norm": 11.723958969116211, + "learning_rate": 1.820859306363218e-05, + "loss": 3.7294, "step": 2142 }, { - "epoch": 0.64, - "grad_norm": 16.270231246948242, - "learning_rate": 1.5706124085396413e-05, - "loss": 1.5671, + "epoch": 0.27, + "grad_norm": 11.870851516723633, + "learning_rate": 1.8207756348575493e-05, + "loss": 1.7791, "step": 2143 }, { - "epoch": 0.64, - "grad_norm": 22.51713752746582, - "learning_rate": 1.5704119474792023e-05, - "loss": 2.8953, + "epoch": 0.27, + "grad_norm": 9.81495189666748, + "learning_rate": 1.8206919633518806e-05, + "loss": 2.0573, "step": 2144 }, { - "epoch": 0.64, - "grad_norm": 14.330571174621582, - "learning_rate": 1.5702114864187633e-05, - "loss": 1.948, + "epoch": 0.27, + "grad_norm": 10.432618141174316, + "learning_rate": 1.820608291846212e-05, + "loss": 3.0315, "step": 2145 }, { - "epoch": 0.65, - "grad_norm": 16.00999641418457, - "learning_rate": 1.5700110253583243e-05, - "loss": 2.0595, + "epoch": 0.27, + "grad_norm": 20.963991165161133, + "learning_rate": 1.820524620340543e-05, + "loss": 1.2123, "step": 2146 }, { - "epoch": 0.65, - "grad_norm": 23.264543533325195, - "learning_rate": 1.5698105642978853e-05, - "loss": 2.7985, + "epoch": 0.27, + "grad_norm": 10.1554594039917, + "learning_rate": 1.8204409488348744e-05, + "loss": 1.1561, "step": 2147 }, { - "epoch": 0.65, - "grad_norm": 12.527949333190918, - "learning_rate": 1.5696101032374463e-05, - "loss": 2.2662, + "epoch": 0.27, + "grad_norm": 28.186546325683594, + "learning_rate": 1.8203572773292058e-05, + "loss": 2.2849, "step": 2148 }, { - "epoch": 0.65, - "grad_norm": 23.13153076171875, - "learning_rate": 1.5694096421770073e-05, - "loss": 3.2819, + "epoch": 0.27, + "grad_norm": 14.456684112548828, + "learning_rate": 1.8202736058235368e-05, + "loss": 1.4981, "step": 2149 }, { - "epoch": 0.65, - "grad_norm": 11.828460693359375, - "learning_rate": 1.5692091811165683e-05, - "loss": 1.9632, + "epoch": 0.27, + "grad_norm": 14.477010726928711, + "learning_rate": 1.820189934317868e-05, + "loss": 3.8286, "step": 2150 }, { - "epoch": 0.65, - "grad_norm": 30.631380081176758, - "learning_rate": 1.5690087200561293e-05, - "loss": 2.5282, + "epoch": 0.27, + "grad_norm": 12.221043586730957, + "learning_rate": 1.8201062628121995e-05, + "loss": 1.7472, "step": 2151 }, { - "epoch": 0.65, - "grad_norm": 16.613862991333008, - "learning_rate": 1.56880825899569e-05, - "loss": 2.6759, + "epoch": 0.27, + "grad_norm": 11.797484397888184, + "learning_rate": 1.820022591306531e-05, + "loss": 2.4295, "step": 2152 }, { - "epoch": 0.65, - "grad_norm": 36.72877502441406, - "learning_rate": 1.5686077979352513e-05, - "loss": 2.0691, + "epoch": 0.27, + "grad_norm": 12.780076026916504, + "learning_rate": 1.819938919800862e-05, + "loss": 1.7441, "step": 2153 }, { - "epoch": 0.65, - "grad_norm": 11.451091766357422, - "learning_rate": 1.5684073368748123e-05, - "loss": 1.2804, + "epoch": 0.27, + "grad_norm": 35.99589157104492, + "learning_rate": 1.8198552482951933e-05, + "loss": 2.7663, "step": 2154 }, { - "epoch": 0.65, - "grad_norm": 13.62600326538086, - "learning_rate": 1.568206875814373e-05, - "loss": 1.9118, + "epoch": 0.27, + "grad_norm": 18.572803497314453, + "learning_rate": 1.8197715767895246e-05, + "loss": 2.4358, "step": 2155 }, { - "epoch": 0.65, - "grad_norm": 21.245529174804688, - "learning_rate": 1.5680064147539344e-05, - "loss": 2.7814, + "epoch": 0.27, + "grad_norm": 21.449501037597656, + "learning_rate": 1.8196879052838557e-05, + "loss": 2.8693, "step": 2156 }, { - "epoch": 0.65, - "grad_norm": 22.127309799194336, - "learning_rate": 1.567805953693495e-05, - "loss": 1.8782, + "epoch": 0.27, + "grad_norm": 11.441527366638184, + "learning_rate": 1.819604233778187e-05, + "loss": 0.954, "step": 2157 }, { - "epoch": 0.65, - "grad_norm": 11.301019668579102, - "learning_rate": 1.567605492633056e-05, - "loss": 3.1518, + "epoch": 0.27, + "grad_norm": 12.363370895385742, + "learning_rate": 1.8195205622725184e-05, + "loss": 1.6574, "step": 2158 }, { - "epoch": 0.65, - "grad_norm": 11.62056827545166, - "learning_rate": 1.5674050315726174e-05, - "loss": 2.3903, + "epoch": 0.27, + "grad_norm": 6.905948162078857, + "learning_rate": 1.8194368907668494e-05, + "loss": 0.8388, "step": 2159 }, { - "epoch": 0.65, - "grad_norm": 15.686822891235352, - "learning_rate": 1.567204570512178e-05, - "loss": 2.4537, - "step": 2160 - }, - { - "epoch": 0.65, - "eval_loss": 0.3099038302898407, - "eval_runtime": 43.3953, - "eval_samples_per_second": 34.082, - "eval_steps_per_second": 34.082, + "epoch": 0.27, + "grad_norm": 13.503583908081055, + "learning_rate": 1.8193532192611808e-05, + "loss": 3.3977, "step": 2160 }, { - "epoch": 0.65, - "grad_norm": 14.51310920715332, - "learning_rate": 1.567004109451739e-05, - "loss": 2.5808, + "epoch": 0.27, + "grad_norm": 9.192841529846191, + "learning_rate": 1.819269547755512e-05, + "loss": 1.6161, "step": 2161 }, { - "epoch": 0.65, - "grad_norm": 15.057507514953613, - "learning_rate": 1.5668036483913e-05, - "loss": 2.8596, + "epoch": 0.27, + "grad_norm": 24.90398406982422, + "learning_rate": 1.8191858762498432e-05, + "loss": 2.0326, "step": 2162 }, { - "epoch": 0.65, - "grad_norm": 12.982969284057617, - "learning_rate": 1.566603187330861e-05, - "loss": 2.5042, + "epoch": 0.27, + "grad_norm": 10.266580581665039, + "learning_rate": 1.8191022047441745e-05, + "loss": 2.0516, "step": 2163 }, { - "epoch": 0.65, - "grad_norm": 11.829024314880371, - "learning_rate": 1.566402726270422e-05, - "loss": 1.9732, + "epoch": 0.27, + "grad_norm": 16.922138214111328, + "learning_rate": 1.819018533238506e-05, + "loss": 2.1421, "step": 2164 }, { - "epoch": 0.65, - "grad_norm": 14.070128440856934, - "learning_rate": 1.566202265209983e-05, - "loss": 2.6257, + "epoch": 0.27, + "grad_norm": 20.214248657226562, + "learning_rate": 1.818934861732837e-05, + "loss": 2.6699, "step": 2165 }, { - "epoch": 0.65, - "grad_norm": 10.464433670043945, - "learning_rate": 1.566001804149544e-05, - "loss": 2.178, + "epoch": 0.27, + "grad_norm": 10.546091079711914, + "learning_rate": 1.8188511902271683e-05, + "loss": 3.4302, "step": 2166 }, { - "epoch": 0.65, - "grad_norm": 11.11734390258789, - "learning_rate": 1.565801343089105e-05, - "loss": 1.5285, + "epoch": 0.27, + "grad_norm": 14.921259880065918, + "learning_rate": 1.8187675187214993e-05, + "loss": 2.9676, "step": 2167 }, { - "epoch": 0.65, - "grad_norm": 14.957688331604004, - "learning_rate": 1.565600882028666e-05, - "loss": 2.1816, + "epoch": 0.27, + "grad_norm": 18.569194793701172, + "learning_rate": 1.8186838472158307e-05, + "loss": 2.3316, "step": 2168 }, { - "epoch": 0.65, - "grad_norm": 16.353477478027344, - "learning_rate": 1.565400420968227e-05, - "loss": 2.2476, + "epoch": 0.27, + "grad_norm": 21.49285888671875, + "learning_rate": 1.818600175710162e-05, + "loss": 1.8356, "step": 2169 }, { - "epoch": 0.65, - "grad_norm": 9.818117141723633, - "learning_rate": 1.565199959907788e-05, - "loss": 2.1336, + "epoch": 0.27, + "grad_norm": 10.457573890686035, + "learning_rate": 1.818516504204493e-05, + "loss": 1.3546, "step": 2170 }, { - "epoch": 0.65, - "grad_norm": 20.093345642089844, - "learning_rate": 1.564999498847349e-05, - "loss": 2.6549, + "epoch": 0.27, + "grad_norm": 25.030502319335938, + "learning_rate": 1.8184328326988245e-05, + "loss": 2.2491, "step": 2171 }, { - "epoch": 0.65, - "grad_norm": 18.352497100830078, - "learning_rate": 1.56479903778691e-05, - "loss": 2.6063, + "epoch": 0.27, + "grad_norm": 12.664494514465332, + "learning_rate": 1.8183491611931558e-05, + "loss": 2.7344, "step": 2172 }, { - "epoch": 0.65, - "grad_norm": 26.143571853637695, - "learning_rate": 1.5645985767264712e-05, - "loss": 2.8258, + "epoch": 0.27, + "grad_norm": 11.065912246704102, + "learning_rate": 1.8182654896874872e-05, + "loss": 1.6745, "step": 2173 }, { - "epoch": 0.65, - "grad_norm": 18.577131271362305, - "learning_rate": 1.564398115666032e-05, - "loss": 1.8137, + "epoch": 0.27, + "grad_norm": 30.694799423217773, + "learning_rate": 1.8181818181818182e-05, + "loss": 3.3537, "step": 2174 }, { - "epoch": 0.65, - "grad_norm": 11.283987045288086, - "learning_rate": 1.5641976546055932e-05, - "loss": 1.9124, + "epoch": 0.27, + "grad_norm": 11.940272331237793, + "learning_rate": 1.8180981466761496e-05, + "loss": 2.2207, "step": 2175 }, { - "epoch": 0.65, - "grad_norm": 19.12086296081543, - "learning_rate": 1.563997193545154e-05, - "loss": 2.3072, + "epoch": 0.27, + "grad_norm": 20.852827072143555, + "learning_rate": 1.818014475170481e-05, + "loss": 4.1757, "step": 2176 }, { - "epoch": 0.65, - "grad_norm": 11.322100639343262, - "learning_rate": 1.563796732484715e-05, - "loss": 1.8458, + "epoch": 0.27, + "grad_norm": 15.129714012145996, + "learning_rate": 1.817930803664812e-05, + "loss": 2.8836, "step": 2177 }, { - "epoch": 0.65, - "grad_norm": 16.309368133544922, - "learning_rate": 1.5635962714242762e-05, - "loss": 2.3407, + "epoch": 0.27, + "grad_norm": 20.83080291748047, + "learning_rate": 1.8178471321591433e-05, + "loss": 2.491, "step": 2178 }, { - "epoch": 0.66, - "grad_norm": 42.83634567260742, - "learning_rate": 1.563395810363837e-05, - "loss": 2.9269, + "epoch": 0.27, + "grad_norm": 10.54280948638916, + "learning_rate": 1.8177634606534747e-05, + "loss": 2.227, "step": 2179 }, { - "epoch": 0.66, - "grad_norm": 16.00682830810547, - "learning_rate": 1.563195349303398e-05, - "loss": 2.2228, + "epoch": 0.27, + "grad_norm": 13.582934379577637, + "learning_rate": 1.817679789147806e-05, + "loss": 4.0549, "step": 2180 }, { - "epoch": 0.66, - "grad_norm": 10.3877592086792, - "learning_rate": 1.562994888242959e-05, - "loss": 2.0959, + "epoch": 0.27, + "grad_norm": 11.53133773803711, + "learning_rate": 1.817596117642137e-05, + "loss": 3.4779, "step": 2181 }, { - "epoch": 0.66, - "grad_norm": 23.170366287231445, - "learning_rate": 1.56279442718252e-05, - "loss": 2.8749, + "epoch": 0.27, + "grad_norm": 14.917023658752441, + "learning_rate": 1.8175124461364684e-05, + "loss": 1.7925, "step": 2182 }, { - "epoch": 0.66, - "grad_norm": 10.100824356079102, - "learning_rate": 1.562593966122081e-05, - "loss": 2.0848, + "epoch": 0.27, + "grad_norm": 23.791954040527344, + "learning_rate": 1.8174287746307998e-05, + "loss": 2.4726, "step": 2183 }, { - "epoch": 0.66, - "grad_norm": 17.805927276611328, - "learning_rate": 1.562393505061642e-05, - "loss": 2.6061, + "epoch": 0.27, + "grad_norm": 6.664028644561768, + "learning_rate": 1.817345103125131e-05, + "loss": 0.2051, "step": 2184 }, { - "epoch": 0.66, - "grad_norm": 18.987218856811523, - "learning_rate": 1.562193044001203e-05, - "loss": 2.7442, + "epoch": 0.27, + "grad_norm": 14.408364295959473, + "learning_rate": 1.8172614316194622e-05, + "loss": 3.4575, "step": 2185 }, { - "epoch": 0.66, - "grad_norm": 44.760196685791016, - "learning_rate": 1.561992582940764e-05, - "loss": 2.3028, + "epoch": 0.27, + "grad_norm": 13.059243202209473, + "learning_rate": 1.8171777601137936e-05, + "loss": 3.5806, "step": 2186 }, { - "epoch": 0.66, - "grad_norm": 17.44770622253418, - "learning_rate": 1.561792121880325e-05, - "loss": 3.0411, + "epoch": 0.27, + "grad_norm": 12.887029647827148, + "learning_rate": 1.8170940886081246e-05, + "loss": 1.003, "step": 2187 }, { - "epoch": 0.66, - "grad_norm": 15.366558074951172, - "learning_rate": 1.5615916608198856e-05, - "loss": 2.4066, + "epoch": 0.27, + "grad_norm": 14.370407104492188, + "learning_rate": 1.817010417102456e-05, + "loss": 4.1245, "step": 2188 }, { - "epoch": 0.66, - "grad_norm": 14.17463493347168, - "learning_rate": 1.561391199759447e-05, - "loss": 1.7436, + "epoch": 0.27, + "grad_norm": 21.14535903930664, + "learning_rate": 1.8169267455967873e-05, + "loss": 1.0258, "step": 2189 }, { - "epoch": 0.66, - "grad_norm": 11.214127540588379, - "learning_rate": 1.561190738699008e-05, - "loss": 1.9083, + "epoch": 0.27, + "grad_norm": 16.043947219848633, + "learning_rate": 1.8168430740911184e-05, + "loss": 2.7536, "step": 2190 }, { - "epoch": 0.66, - "grad_norm": 14.602449417114258, - "learning_rate": 1.5609902776385686e-05, - "loss": 2.8559, + "epoch": 0.27, + "grad_norm": 21.75202178955078, + "learning_rate": 1.8167594025854497e-05, + "loss": 2.3325, "step": 2191 }, { - "epoch": 0.66, - "grad_norm": 16.023332595825195, - "learning_rate": 1.56078981657813e-05, - "loss": 1.8949, + "epoch": 0.28, + "grad_norm": 18.115833282470703, + "learning_rate": 1.8166757310797807e-05, + "loss": 2.7141, "step": 2192 }, { - "epoch": 0.66, - "grad_norm": 20.160932540893555, - "learning_rate": 1.5605893555176907e-05, - "loss": 2.0295, + "epoch": 0.28, + "grad_norm": 12.496354103088379, + "learning_rate": 1.816592059574112e-05, + "loss": 1.4812, "step": 2193 }, { - "epoch": 0.66, - "grad_norm": 12.389837265014648, - "learning_rate": 1.5603888944572517e-05, - "loss": 2.2384, + "epoch": 0.28, + "grad_norm": 17.79195785522461, + "learning_rate": 1.8165083880684435e-05, + "loss": 2.2022, "step": 2194 }, { - "epoch": 0.66, - "grad_norm": 18.265491485595703, - "learning_rate": 1.5601884333968127e-05, - "loss": 2.4571, + "epoch": 0.28, + "grad_norm": 16.170005798339844, + "learning_rate": 1.8164247165627745e-05, + "loss": 3.9978, "step": 2195 }, { - "epoch": 0.66, - "grad_norm": 14.70784854888916, - "learning_rate": 1.5599879723363737e-05, - "loss": 2.1679, + "epoch": 0.28, + "grad_norm": 7.060325622558594, + "learning_rate": 1.816341045057106e-05, + "loss": 0.5294, "step": 2196 }, { - "epoch": 0.66, - "grad_norm": 16.466394424438477, - "learning_rate": 1.5597875112759347e-05, - "loss": 1.9488, + "epoch": 0.28, + "grad_norm": 25.24850082397461, + "learning_rate": 1.8162573735514372e-05, + "loss": 2.0798, "step": 2197 }, { - "epoch": 0.66, - "grad_norm": 21.658117294311523, - "learning_rate": 1.5595870502154957e-05, - "loss": 2.7079, + "epoch": 0.28, + "grad_norm": 10.852115631103516, + "learning_rate": 1.8161737020457683e-05, + "loss": 2.0862, "step": 2198 }, { - "epoch": 0.66, - "grad_norm": 17.353038787841797, - "learning_rate": 1.5593865891550567e-05, - "loss": 2.9999, + "epoch": 0.28, + "grad_norm": 17.65023422241211, + "learning_rate": 1.8160900305400996e-05, + "loss": 2.637, "step": 2199 }, { - "epoch": 0.66, - "grad_norm": 15.819283485412598, - "learning_rate": 1.5591861280946177e-05, - "loss": 2.1287, + "epoch": 0.28, + "grad_norm": 10.618857383728027, + "learning_rate": 1.816006359034431e-05, + "loss": 1.9751, "step": 2200 }, { - "epoch": 0.66, - "grad_norm": 13.62539291381836, - "learning_rate": 1.5589856670341787e-05, - "loss": 1.9569, + "epoch": 0.28, + "grad_norm": 13.295710563659668, + "learning_rate": 1.8159226875287623e-05, + "loss": 2.2849, "step": 2201 }, { - "epoch": 0.66, - "grad_norm": 18.662357330322266, - "learning_rate": 1.5587852059737397e-05, - "loss": 2.51, + "epoch": 0.28, + "grad_norm": 29.24386215209961, + "learning_rate": 1.8158390160230934e-05, + "loss": 4.5359, "step": 2202 }, { - "epoch": 0.66, - "grad_norm": 26.726213455200195, - "learning_rate": 1.5585847449133007e-05, - "loss": 1.9874, + "epoch": 0.28, + "grad_norm": 18.27402114868164, + "learning_rate": 1.8157553445174247e-05, + "loss": 3.1177, "step": 2203 }, { - "epoch": 0.66, - "grad_norm": 16.964014053344727, - "learning_rate": 1.5583842838528618e-05, - "loss": 3.0422, + "epoch": 0.28, + "grad_norm": 21.276901245117188, + "learning_rate": 1.815671673011756e-05, + "loss": 1.0634, "step": 2204 }, { - "epoch": 0.66, - "grad_norm": 24.55567169189453, - "learning_rate": 1.5581838227924228e-05, - "loss": 2.4687, + "epoch": 0.28, + "grad_norm": 12.932526588439941, + "learning_rate": 1.815588001506087e-05, + "loss": 1.5843, "step": 2205 }, { - "epoch": 0.66, - "grad_norm": 12.898763656616211, - "learning_rate": 1.5579833617319838e-05, - "loss": 1.8264, + "epoch": 0.28, + "grad_norm": 13.635990142822266, + "learning_rate": 1.8155043300004185e-05, + "loss": 2.964, "step": 2206 }, { - "epoch": 0.66, - "grad_norm": 14.676507949829102, - "learning_rate": 1.5577829006715444e-05, - "loss": 1.942, + "epoch": 0.28, + "grad_norm": 8.697279930114746, + "learning_rate": 1.81542065849475e-05, + "loss": 3.8756, "step": 2207 }, { - "epoch": 0.66, - "grad_norm": 21.591907501220703, - "learning_rate": 1.5575824396111058e-05, - "loss": 2.193, + "epoch": 0.28, + "grad_norm": 12.8943452835083, + "learning_rate": 1.8153369869890812e-05, + "loss": 2.171, "step": 2208 }, { - "epoch": 0.66, - "grad_norm": 22.116395950317383, - "learning_rate": 1.5573819785506668e-05, - "loss": 1.8098, + "epoch": 0.28, + "grad_norm": 14.581306457519531, + "learning_rate": 1.8152533154834123e-05, + "loss": 2.451, "step": 2209 }, { - "epoch": 0.66, - "grad_norm": 17.048828125, - "learning_rate": 1.5571815174902275e-05, - "loss": 2.4309, + "epoch": 0.28, + "grad_norm": 14.167716979980469, + "learning_rate": 1.8151696439777436e-05, + "loss": 4.8333, "step": 2210 }, { - "epoch": 0.66, - "grad_norm": 17.50075912475586, - "learning_rate": 1.5569810564297888e-05, - "loss": 3.5984, + "epoch": 0.28, + "grad_norm": 40.78544998168945, + "learning_rate": 1.815085972472075e-05, + "loss": 3.447, "step": 2211 }, { - "epoch": 0.67, - "grad_norm": 42.22193145751953, - "learning_rate": 1.5567805953693495e-05, - "loss": 2.6408, + "epoch": 0.28, + "grad_norm": 13.381462097167969, + "learning_rate": 1.815002300966406e-05, + "loss": 1.2005, "step": 2212 }, { - "epoch": 0.67, - "grad_norm": 16.618879318237305, - "learning_rate": 1.5565801343089105e-05, - "loss": 1.9168, + "epoch": 0.28, + "grad_norm": 12.654776573181152, + "learning_rate": 1.8149186294607374e-05, + "loss": 3.1519, "step": 2213 }, { - "epoch": 0.67, - "grad_norm": 20.115299224853516, - "learning_rate": 1.5563796732484715e-05, - "loss": 2.5345, + "epoch": 0.28, + "grad_norm": 15.005374908447266, + "learning_rate": 1.8148349579550687e-05, + "loss": 3.3541, "step": 2214 }, { - "epoch": 0.67, - "grad_norm": 9.575844764709473, - "learning_rate": 1.5561792121880325e-05, - "loss": 2.5817, + "epoch": 0.28, + "grad_norm": 14.365894317626953, + "learning_rate": 1.8147512864493998e-05, + "loss": 2.2676, "step": 2215 }, { - "epoch": 0.67, - "grad_norm": 16.084333419799805, - "learning_rate": 1.5559787511275935e-05, - "loss": 2.8486, + "epoch": 0.28, + "grad_norm": 28.20368766784668, + "learning_rate": 1.814667614943731e-05, + "loss": 2.9791, "step": 2216 }, { - "epoch": 0.67, - "grad_norm": 9.678840637207031, - "learning_rate": 1.5557782900671545e-05, - "loss": 2.0502, + "epoch": 0.28, + "grad_norm": 9.189824104309082, + "learning_rate": 1.814583943438062e-05, + "loss": 0.9645, "step": 2217 }, { - "epoch": 0.67, - "grad_norm": 25.424049377441406, - "learning_rate": 1.5555778290067155e-05, - "loss": 2.3241, + "epoch": 0.28, + "grad_norm": 10.372204780578613, + "learning_rate": 1.8145002719323935e-05, + "loss": 1.2545, "step": 2218 }, { - "epoch": 0.67, - "grad_norm": 14.452130317687988, - "learning_rate": 1.5553773679462765e-05, - "loss": 2.2107, + "epoch": 0.28, + "grad_norm": 11.494043350219727, + "learning_rate": 1.814416600426725e-05, + "loss": 1.083, "step": 2219 }, { - "epoch": 0.67, - "grad_norm": 34.95116424560547, - "learning_rate": 1.5551769068858375e-05, - "loss": 2.9797, + "epoch": 0.28, + "grad_norm": 14.215128898620605, + "learning_rate": 1.814332928921056e-05, + "loss": 1.2213, "step": 2220 }, { - "epoch": 0.67, - "grad_norm": 10.273086547851562, - "learning_rate": 1.5549764458253986e-05, - "loss": 1.676, + "epoch": 0.28, + "grad_norm": 13.489676475524902, + "learning_rate": 1.8142492574153873e-05, + "loss": 3.0496, "step": 2221 }, { - "epoch": 0.67, - "grad_norm": 16.252601623535156, - "learning_rate": 1.5547759847649596e-05, - "loss": 2.6206, + "epoch": 0.28, + "grad_norm": 15.215855598449707, + "learning_rate": 1.8141655859097186e-05, + "loss": 2.7327, "step": 2222 }, { - "epoch": 0.67, - "grad_norm": 10.310708045959473, - "learning_rate": 1.5545755237045206e-05, - "loss": 2.1657, + "epoch": 0.28, + "grad_norm": 9.644189834594727, + "learning_rate": 1.8140819144040497e-05, + "loss": 0.9311, "step": 2223 }, { - "epoch": 0.67, - "grad_norm": 11.262001037597656, - "learning_rate": 1.5543750626440816e-05, - "loss": 1.9435, + "epoch": 0.28, + "grad_norm": 7.358780860900879, + "learning_rate": 1.813998242898381e-05, + "loss": 1.696, "step": 2224 }, { - "epoch": 0.67, - "grad_norm": 20.996740341186523, - "learning_rate": 1.5541746015836426e-05, - "loss": 2.7842, + "epoch": 0.28, + "grad_norm": 22.613245010375977, + "learning_rate": 1.8139145713927124e-05, + "loss": 2.0394, "step": 2225 }, { - "epoch": 0.67, - "grad_norm": 25.680212020874023, - "learning_rate": 1.5539741405232033e-05, - "loss": 2.4817, + "epoch": 0.28, + "grad_norm": 11.782620429992676, + "learning_rate": 1.8138308998870434e-05, + "loss": 1.9239, "step": 2226 }, { - "epoch": 0.67, - "grad_norm": 11.317098617553711, - "learning_rate": 1.5537736794627646e-05, - "loss": 1.2717, + "epoch": 0.28, + "grad_norm": 25.47810935974121, + "learning_rate": 1.8137472283813748e-05, + "loss": 3.1678, "step": 2227 }, { - "epoch": 0.67, - "grad_norm": 14.966314315795898, - "learning_rate": 1.5535732184023256e-05, - "loss": 2.5357, + "epoch": 0.28, + "grad_norm": 12.637117385864258, + "learning_rate": 1.813663556875706e-05, + "loss": 2.666, "step": 2228 }, { - "epoch": 0.67, - "grad_norm": 22.922651290893555, - "learning_rate": 1.5533727573418863e-05, - "loss": 3.0417, + "epoch": 0.28, + "grad_norm": 18.55261993408203, + "learning_rate": 1.8135798853700375e-05, + "loss": 2.0364, "step": 2229 }, { - "epoch": 0.67, - "grad_norm": 21.57832908630371, - "learning_rate": 1.5531722962814476e-05, - "loss": 2.1712, + "epoch": 0.28, + "grad_norm": 16.093608856201172, + "learning_rate": 1.8134962138643685e-05, + "loss": 4.8497, "step": 2230 }, { - "epoch": 0.67, - "grad_norm": 22.31909942626953, - "learning_rate": 1.5529718352210083e-05, - "loss": 2.7337, + "epoch": 0.28, + "grad_norm": 14.051105499267578, + "learning_rate": 1.8134125423587e-05, + "loss": 2.0579, "step": 2231 }, { - "epoch": 0.67, - "grad_norm": 10.842183113098145, - "learning_rate": 1.5527713741605693e-05, - "loss": 1.886, + "epoch": 0.28, + "grad_norm": 17.554685592651367, + "learning_rate": 1.8133288708530313e-05, + "loss": 2.6494, "step": 2232 }, { - "epoch": 0.67, - "grad_norm": 17.99361228942871, - "learning_rate": 1.5525709131001307e-05, - "loss": 2.3479, + "epoch": 0.28, + "grad_norm": 22.024080276489258, + "learning_rate": 1.8132451993473623e-05, + "loss": 3.6693, "step": 2233 }, { - "epoch": 0.67, - "grad_norm": 36.20960998535156, - "learning_rate": 1.5523704520396913e-05, - "loss": 2.8728, + "epoch": 0.28, + "grad_norm": 19.70965003967285, + "learning_rate": 1.8131615278416937e-05, + "loss": 2.4351, "step": 2234 }, { - "epoch": 0.67, - "grad_norm": 14.748534202575684, - "learning_rate": 1.5521699909792523e-05, - "loss": 2.0896, + "epoch": 0.28, + "grad_norm": 12.084982872009277, + "learning_rate": 1.813077856336025e-05, + "loss": 1.5988, "step": 2235 }, { - "epoch": 0.67, - "grad_norm": 10.2194242477417, - "learning_rate": 1.5519695299188133e-05, - "loss": 2.0784, + "epoch": 0.28, + "grad_norm": 9.847620010375977, + "learning_rate": 1.8129941848303564e-05, + "loss": 1.1611, "step": 2236 }, { - "epoch": 0.67, - "grad_norm": 19.986635208129883, - "learning_rate": 1.5517690688583744e-05, - "loss": 2.4334, + "epoch": 0.28, + "grad_norm": 18.409879684448242, + "learning_rate": 1.8129105133246874e-05, + "loss": 2.0993, "step": 2237 }, { - "epoch": 0.67, - "grad_norm": 19.66067886352539, - "learning_rate": 1.5515686077979354e-05, - "loss": 2.684, + "epoch": 0.28, + "grad_norm": 8.946453094482422, + "learning_rate": 1.8128268418190188e-05, + "loss": 1.5504, "step": 2238 }, { - "epoch": 0.67, - "grad_norm": 70.83124542236328, - "learning_rate": 1.5513681467374964e-05, - "loss": 2.5154, + "epoch": 0.28, + "grad_norm": 18.910524368286133, + "learning_rate": 1.81274317031335e-05, + "loss": 1.4382, "step": 2239 }, { - "epoch": 0.67, - "grad_norm": 23.612783432006836, - "learning_rate": 1.5511676856770574e-05, - "loss": 2.886, + "epoch": 0.28, + "grad_norm": 10.438148498535156, + "learning_rate": 1.8126594988076812e-05, + "loss": 1.585, "step": 2240 }, { - "epoch": 0.67, - "grad_norm": 15.822755813598633, - "learning_rate": 1.5509672246166184e-05, - "loss": 2.0681, + "epoch": 0.28, + "grad_norm": 22.281301498413086, + "learning_rate": 1.8125758273020125e-05, + "loss": 1.9504, "step": 2241 }, { - "epoch": 0.67, - "grad_norm": 25.98380470275879, - "learning_rate": 1.5507667635561794e-05, - "loss": 2.8038, + "epoch": 0.28, + "grad_norm": 9.4960355758667, + "learning_rate": 1.812492155796344e-05, + "loss": 1.915, "step": 2242 }, { - "epoch": 0.67, - "grad_norm": 27.250049591064453, - "learning_rate": 1.5505663024957404e-05, - "loss": 2.5481, + "epoch": 0.28, + "grad_norm": 18.760650634765625, + "learning_rate": 1.812408484290675e-05, + "loss": 1.9631, "step": 2243 }, { - "epoch": 0.67, - "grad_norm": 36.15570831298828, - "learning_rate": 1.5503658414353014e-05, - "loss": 2.1858, + "epoch": 0.28, + "grad_norm": 50.82057189941406, + "learning_rate": 1.8123248127850063e-05, + "loss": 2.4465, "step": 2244 }, { - "epoch": 0.67, - "grad_norm": 10.935135841369629, - "learning_rate": 1.5501653803748624e-05, - "loss": 1.1376, + "epoch": 0.28, + "grad_norm": 14.840272903442383, + "learning_rate": 1.8122411412793373e-05, + "loss": 4.0736, "step": 2245 }, { - "epoch": 0.68, - "grad_norm": 21.463272094726562, - "learning_rate": 1.5499649193144234e-05, - "loss": 2.5708, + "epoch": 0.28, + "grad_norm": 27.440397262573242, + "learning_rate": 1.8121574697736687e-05, + "loss": 3.074, "step": 2246 }, { - "epoch": 0.68, - "grad_norm": 12.168373107910156, - "learning_rate": 1.5497644582539844e-05, - "loss": 2.3351, + "epoch": 0.28, + "grad_norm": 11.00921630859375, + "learning_rate": 1.812073798268e-05, + "loss": 3.1562, "step": 2247 }, { - "epoch": 0.68, - "grad_norm": 20.284778594970703, - "learning_rate": 1.549563997193545e-05, - "loss": 2.2741, + "epoch": 0.28, + "grad_norm": 9.403813362121582, + "learning_rate": 1.811990126762331e-05, + "loss": 2.7584, "step": 2248 }, { - "epoch": 0.68, - "grad_norm": 21.697620391845703, - "learning_rate": 1.5493635361331065e-05, - "loss": 3.3955, + "epoch": 0.28, + "grad_norm": 9.529999732971191, + "learning_rate": 1.8119064552566624e-05, + "loss": 2.7479, "step": 2249 }, { - "epoch": 0.68, - "grad_norm": 15.422367095947266, - "learning_rate": 1.549163075072667e-05, - "loss": 2.879, + "epoch": 0.28, + "grad_norm": 9.866724967956543, + "learning_rate": 1.8118227837509938e-05, + "loss": 2.7984, "step": 2250 }, { - "epoch": 0.68, - "grad_norm": 17.34280776977539, - "learning_rate": 1.548962614012228e-05, - "loss": 2.5866, + "epoch": 0.28, + "grad_norm": 23.09242057800293, + "learning_rate": 1.811739112245325e-05, + "loss": 3.4462, "step": 2251 }, { - "epoch": 0.68, - "grad_norm": 17.975322723388672, - "learning_rate": 1.5487621529517895e-05, - "loss": 2.3053, + "epoch": 0.28, + "grad_norm": 10.374611854553223, + "learning_rate": 1.8116554407396562e-05, + "loss": 1.1058, "step": 2252 }, { - "epoch": 0.68, - "grad_norm": 30.330408096313477, - "learning_rate": 1.54856169189135e-05, - "loss": 3.002, + "epoch": 0.28, + "grad_norm": 22.321758270263672, + "learning_rate": 1.8115717692339876e-05, + "loss": 1.7572, "step": 2253 }, { - "epoch": 0.68, - "grad_norm": 17.363624572753906, - "learning_rate": 1.548361230830911e-05, - "loss": 2.323, + "epoch": 0.28, + "grad_norm": 21.010087966918945, + "learning_rate": 1.8114880977283186e-05, + "loss": 3.2659, "step": 2254 }, { - "epoch": 0.68, - "grad_norm": 14.340827941894531, - "learning_rate": 1.548160769770472e-05, - "loss": 1.8183, + "epoch": 0.28, + "grad_norm": 11.888099670410156, + "learning_rate": 1.81140442622265e-05, + "loss": 1.7821, "step": 2255 }, { - "epoch": 0.68, - "grad_norm": 11.480280876159668, - "learning_rate": 1.5479603087100332e-05, - "loss": 1.9501, + "epoch": 0.28, + "grad_norm": 14.751751899719238, + "learning_rate": 1.8113207547169813e-05, + "loss": 3.6896, "step": 2256 }, { - "epoch": 0.68, - "grad_norm": 10.279202461242676, - "learning_rate": 1.5477598476495942e-05, - "loss": 2.1804, + "epoch": 0.28, + "grad_norm": 19.622156143188477, + "learning_rate": 1.8112370832113127e-05, + "loss": 1.9646, "step": 2257 }, { - "epoch": 0.68, - "grad_norm": 7.849579334259033, - "learning_rate": 1.5475593865891552e-05, - "loss": 2.1096, + "epoch": 0.28, + "grad_norm": 24.655155181884766, + "learning_rate": 1.8111534117056437e-05, + "loss": 3.0499, "step": 2258 }, { - "epoch": 0.68, - "grad_norm": 15.246369361877441, - "learning_rate": 1.5473589255287162e-05, - "loss": 2.5827, + "epoch": 0.28, + "grad_norm": 22.66373062133789, + "learning_rate": 1.811069740199975e-05, + "loss": 2.765, "step": 2259 }, { - "epoch": 0.68, - "grad_norm": 11.893373489379883, - "learning_rate": 1.5471584644682772e-05, - "loss": 1.4826, + "epoch": 0.28, + "grad_norm": 14.57512378692627, + "learning_rate": 1.8109860686943064e-05, + "loss": 3.0029, "step": 2260 }, { - "epoch": 0.68, - "grad_norm": 22.876691818237305, - "learning_rate": 1.5469580034078382e-05, - "loss": 2.7584, + "epoch": 0.28, + "grad_norm": 7.522922039031982, + "learning_rate": 1.8109023971886375e-05, + "loss": 0.8426, "step": 2261 }, { - "epoch": 0.68, - "grad_norm": 22.17548179626465, - "learning_rate": 1.546757542347399e-05, - "loss": 2.8086, + "epoch": 0.28, + "grad_norm": 19.920307159423828, + "learning_rate": 1.8108187256829688e-05, + "loss": 3.0463, "step": 2262 }, { - "epoch": 0.68, - "grad_norm": 19.29637908935547, - "learning_rate": 1.5465570812869602e-05, - "loss": 2.4008, + "epoch": 0.28, + "grad_norm": 7.890814304351807, + "learning_rate": 1.8107350541773002e-05, + "loss": 3.4674, "step": 2263 }, { - "epoch": 0.68, - "grad_norm": 18.765453338623047, - "learning_rate": 1.5463566202265212e-05, - "loss": 2.7496, + "epoch": 0.28, + "grad_norm": 16.865583419799805, + "learning_rate": 1.8106513826716316e-05, + "loss": 1.4047, "step": 2264 }, { - "epoch": 0.68, - "grad_norm": 15.087578773498535, - "learning_rate": 1.5461561591660823e-05, - "loss": 2.6039, + "epoch": 0.28, + "grad_norm": 24.833620071411133, + "learning_rate": 1.8105677111659626e-05, + "loss": 3.3807, "step": 2265 }, { - "epoch": 0.68, - "grad_norm": 9.94806957244873, - "learning_rate": 1.5459556981056433e-05, - "loss": 1.0356, + "epoch": 0.28, + "grad_norm": 24.19931411743164, + "learning_rate": 1.810484039660294e-05, + "loss": 1.5681, "step": 2266 }, { - "epoch": 0.68, - "grad_norm": 20.82723617553711, - "learning_rate": 1.545755237045204e-05, - "loss": 2.3768, + "epoch": 0.28, + "grad_norm": 13.846046447753906, + "learning_rate": 1.8104003681546253e-05, + "loss": 3.3777, "step": 2267 }, { - "epoch": 0.68, - "grad_norm": 11.183643341064453, - "learning_rate": 1.5455547759847653e-05, - "loss": 2.0118, + "epoch": 0.28, + "grad_norm": 9.359046936035156, + "learning_rate": 1.8103166966489563e-05, + "loss": 1.6452, "step": 2268 }, { - "epoch": 0.68, - "grad_norm": 11.922686576843262, - "learning_rate": 1.545354314924326e-05, - "loss": 2.3468, + "epoch": 0.28, + "grad_norm": 11.376022338867188, + "learning_rate": 1.8102330251432877e-05, + "loss": 1.0759, "step": 2269 }, { - "epoch": 0.68, - "grad_norm": 10.599013328552246, - "learning_rate": 1.545153853863887e-05, - "loss": 1.6547, + "epoch": 0.28, + "grad_norm": 21.50180435180664, + "learning_rate": 1.8101493536376187e-05, + "loss": 1.9246, "step": 2270 }, { - "epoch": 0.68, - "grad_norm": 9.613250732421875, - "learning_rate": 1.5449533928034483e-05, - "loss": 2.3148, + "epoch": 0.29, + "grad_norm": 12.495444297790527, + "learning_rate": 1.81006568213195e-05, + "loss": 2.3863, "step": 2271 }, { - "epoch": 0.68, - "grad_norm": 20.166654586791992, - "learning_rate": 1.544752931743009e-05, - "loss": 2.3727, + "epoch": 0.29, + "grad_norm": 8.674577713012695, + "learning_rate": 1.8099820106262815e-05, + "loss": 0.7363, "step": 2272 }, { - "epoch": 0.68, - "grad_norm": 14.838998794555664, - "learning_rate": 1.54455247068257e-05, - "loss": 1.8119, + "epoch": 0.29, + "grad_norm": 10.121570587158203, + "learning_rate": 1.8098983391206125e-05, + "loss": 1.8153, "step": 2273 }, { - "epoch": 0.68, - "grad_norm": 15.088861465454102, - "learning_rate": 1.544352009622131e-05, - "loss": 1.2957, + "epoch": 0.29, + "grad_norm": 18.356172561645508, + "learning_rate": 1.809814667614944e-05, + "loss": 1.9094, "step": 2274 }, { - "epoch": 0.68, - "grad_norm": 16.902069091796875, - "learning_rate": 1.544151548561692e-05, - "loss": 2.637, + "epoch": 0.29, + "grad_norm": 10.023639678955078, + "learning_rate": 1.8097309961092752e-05, + "loss": 0.9665, "step": 2275 }, { - "epoch": 0.68, - "grad_norm": 39.02754592895508, - "learning_rate": 1.543951087501253e-05, - "loss": 2.1298, + "epoch": 0.29, + "grad_norm": 27.681039810180664, + "learning_rate": 1.8096473246036062e-05, + "loss": 1.8041, "step": 2276 }, { - "epoch": 0.68, - "grad_norm": 13.487964630126953, - "learning_rate": 1.543750626440814e-05, - "loss": 2.4923, + "epoch": 0.29, + "grad_norm": 17.335979461669922, + "learning_rate": 1.8095636530979376e-05, + "loss": 1.7052, "step": 2277 }, { - "epoch": 0.68, - "grad_norm": 28.00465965270996, - "learning_rate": 1.543550165380375e-05, - "loss": 2.9849, + "epoch": 0.29, + "grad_norm": 8.04034423828125, + "learning_rate": 1.809479981592269e-05, + "loss": 2.1678, "step": 2278 }, { - "epoch": 0.69, - "grad_norm": 12.890748977661133, - "learning_rate": 1.543349704319936e-05, - "loss": 2.6115, + "epoch": 0.29, + "grad_norm": 10.94822883605957, + "learning_rate": 1.8093963100866e-05, + "loss": 2.656, "step": 2279 }, { - "epoch": 0.69, - "grad_norm": 14.795134544372559, - "learning_rate": 1.543149243259497e-05, - "loss": 1.9501, - "step": 2280 - }, - { - "epoch": 0.69, - "eval_loss": 0.27775686979293823, - "eval_runtime": 43.6573, - "eval_samples_per_second": 33.878, - "eval_steps_per_second": 33.878, + "epoch": 0.29, + "grad_norm": 12.786932945251465, + "learning_rate": 1.8093126385809314e-05, + "loss": 1.507, "step": 2280 }, { - "epoch": 0.69, - "grad_norm": 17.335163116455078, - "learning_rate": 1.5429487821990577e-05, - "loss": 2.4545, + "epoch": 0.29, + "grad_norm": 14.343307495117188, + "learning_rate": 1.8092289670752627e-05, + "loss": 2.3808, "step": 2281 }, { - "epoch": 0.69, - "grad_norm": 19.485933303833008, - "learning_rate": 1.542748321138619e-05, - "loss": 1.9536, + "epoch": 0.29, + "grad_norm": 18.072303771972656, + "learning_rate": 1.8091452955695938e-05, + "loss": 1.1889, "step": 2282 }, { - "epoch": 0.69, - "grad_norm": 17.225101470947266, - "learning_rate": 1.54254786007818e-05, - "loss": 1.7893, + "epoch": 0.29, + "grad_norm": 20.963382720947266, + "learning_rate": 1.809061624063925e-05, + "loss": 3.0112, "step": 2283 }, { - "epoch": 0.69, - "grad_norm": 12.618383407592773, - "learning_rate": 1.5423473990177407e-05, - "loss": 2.0978, + "epoch": 0.29, + "grad_norm": 13.710307121276855, + "learning_rate": 1.8089779525582565e-05, + "loss": 2.0044, "step": 2284 }, { - "epoch": 0.69, - "grad_norm": 21.85410499572754, - "learning_rate": 1.542146937957302e-05, - "loss": 2.1725, + "epoch": 0.29, + "grad_norm": 10.80042839050293, + "learning_rate": 1.808894281052588e-05, + "loss": 1.8148, "step": 2285 }, { - "epoch": 0.69, - "grad_norm": 13.22624397277832, - "learning_rate": 1.5419464768968628e-05, - "loss": 1.9183, + "epoch": 0.29, + "grad_norm": 11.106392860412598, + "learning_rate": 1.808810609546919e-05, + "loss": 3.2806, "step": 2286 }, { - "epoch": 0.69, - "grad_norm": 14.312644004821777, - "learning_rate": 1.5417460158364238e-05, - "loss": 1.764, + "epoch": 0.29, + "grad_norm": 12.298428535461426, + "learning_rate": 1.8087269380412502e-05, + "loss": 3.5851, "step": 2287 }, { - "epoch": 0.69, - "grad_norm": 17.018287658691406, - "learning_rate": 1.541545554775985e-05, - "loss": 1.6649, + "epoch": 0.29, + "grad_norm": 16.31171989440918, + "learning_rate": 1.8086432665355816e-05, + "loss": 2.3223, "step": 2288 }, { - "epoch": 0.69, - "grad_norm": 19.239450454711914, - "learning_rate": 1.5413450937155458e-05, - "loss": 2.6163, + "epoch": 0.29, + "grad_norm": 29.622272491455078, + "learning_rate": 1.8085595950299126e-05, + "loss": 1.8154, "step": 2289 }, { - "epoch": 0.69, - "grad_norm": 8.873007774353027, - "learning_rate": 1.5411446326551068e-05, - "loss": 2.4475, + "epoch": 0.29, + "grad_norm": 8.188494682312012, + "learning_rate": 1.808475923524244e-05, + "loss": 0.9744, "step": 2290 }, { - "epoch": 0.69, - "grad_norm": 34.43484878540039, - "learning_rate": 1.5409441715946678e-05, - "loss": 3.171, + "epoch": 0.29, + "grad_norm": 16.53975486755371, + "learning_rate": 1.8083922520185754e-05, + "loss": 3.5978, "step": 2291 }, { - "epoch": 0.69, - "grad_norm": 13.385025024414062, - "learning_rate": 1.5407437105342288e-05, - "loss": 2.4151, + "epoch": 0.29, + "grad_norm": 13.368452072143555, + "learning_rate": 1.8083085805129067e-05, + "loss": 3.1355, "step": 2292 }, { - "epoch": 0.69, - "grad_norm": 23.839799880981445, - "learning_rate": 1.5405432494737898e-05, - "loss": 3.4928, + "epoch": 0.29, + "grad_norm": 15.011205673217773, + "learning_rate": 1.8082249090072378e-05, + "loss": 3.6453, "step": 2293 }, { - "epoch": 0.69, - "grad_norm": 13.791393280029297, - "learning_rate": 1.5403427884133508e-05, - "loss": 1.5945, + "epoch": 0.29, + "grad_norm": 7.424562931060791, + "learning_rate": 1.808141237501569e-05, + "loss": 0.7775, "step": 2294 }, { - "epoch": 0.69, - "grad_norm": 15.341072082519531, - "learning_rate": 1.5401423273529118e-05, - "loss": 1.6761, + "epoch": 0.29, + "grad_norm": 16.634830474853516, + "learning_rate": 1.8080575659959005e-05, + "loss": 1.8949, "step": 2295 }, { - "epoch": 0.69, - "grad_norm": 16.246986389160156, - "learning_rate": 1.539941866292473e-05, - "loss": 1.9111, + "epoch": 0.29, + "grad_norm": 17.14111328125, + "learning_rate": 1.8079738944902315e-05, + "loss": 3.6253, "step": 2296 }, { - "epoch": 0.69, - "grad_norm": 21.339174270629883, - "learning_rate": 1.539741405232034e-05, - "loss": 2.3112, + "epoch": 0.29, + "grad_norm": 16.829402923583984, + "learning_rate": 1.807890222984563e-05, + "loss": 2.47, "step": 2297 }, { - "epoch": 0.69, - "grad_norm": 19.16057777404785, - "learning_rate": 1.539540944171595e-05, - "loss": 1.8671, + "epoch": 0.29, + "grad_norm": 21.543121337890625, + "learning_rate": 1.807806551478894e-05, + "loss": 2.6683, "step": 2298 }, { - "epoch": 0.69, - "grad_norm": 10.452445983886719, - "learning_rate": 1.539340483111156e-05, - "loss": 1.1408, + "epoch": 0.29, + "grad_norm": 19.138608932495117, + "learning_rate": 1.8077228799732253e-05, + "loss": 1.9961, "step": 2299 }, { - "epoch": 0.69, - "grad_norm": 12.354015350341797, - "learning_rate": 1.5391400220507165e-05, - "loss": 2.4143, + "epoch": 0.29, + "grad_norm": 10.469704627990723, + "learning_rate": 1.8076392084675566e-05, + "loss": 1.7902, "step": 2300 }, { - "epoch": 0.69, - "grad_norm": 12.557357788085938, - "learning_rate": 1.538939560990278e-05, - "loss": 1.8995, + "epoch": 0.29, + "grad_norm": 17.763158798217773, + "learning_rate": 1.8075555369618877e-05, + "loss": 2.2397, "step": 2301 }, { - "epoch": 0.69, - "grad_norm": 16.86603546142578, - "learning_rate": 1.538739099929839e-05, - "loss": 2.0175, + "epoch": 0.29, + "grad_norm": 22.89780616760254, + "learning_rate": 1.807471865456219e-05, + "loss": 3.7139, "step": 2302 }, { - "epoch": 0.69, - "grad_norm": 22.319530487060547, - "learning_rate": 1.5385386388693996e-05, - "loss": 3.6267, + "epoch": 0.29, + "grad_norm": 10.7036714553833, + "learning_rate": 1.80738819395055e-05, + "loss": 2.5254, "step": 2303 }, { - "epoch": 0.69, - "grad_norm": 21.675506591796875, - "learning_rate": 1.538338177808961e-05, - "loss": 2.8276, + "epoch": 0.29, + "grad_norm": 13.151268005371094, + "learning_rate": 1.8073045224448814e-05, + "loss": 2.0414, "step": 2304 }, { - "epoch": 0.69, - "grad_norm": 25.780988693237305, - "learning_rate": 1.5381377167485216e-05, - "loss": 2.1939, + "epoch": 0.29, + "grad_norm": 13.832414627075195, + "learning_rate": 1.8072208509392128e-05, + "loss": 1.5838, "step": 2305 }, { - "epoch": 0.69, - "grad_norm": 21.725067138671875, - "learning_rate": 1.5379372556880826e-05, - "loss": 2.4273, + "epoch": 0.29, + "grad_norm": 15.708240509033203, + "learning_rate": 1.807137179433544e-05, + "loss": 1.7294, "step": 2306 }, { - "epoch": 0.69, - "grad_norm": 13.079160690307617, - "learning_rate": 1.537736794627644e-05, - "loss": 2.748, + "epoch": 0.29, + "grad_norm": 13.90760326385498, + "learning_rate": 1.807053507927875e-05, + "loss": 2.8137, "step": 2307 }, { - "epoch": 0.69, - "grad_norm": 70.98606872558594, - "learning_rate": 1.5375363335672046e-05, - "loss": 3.8798, + "epoch": 0.29, + "grad_norm": 9.946630477905273, + "learning_rate": 1.8069698364222065e-05, + "loss": 2.2482, "step": 2308 }, { - "epoch": 0.69, - "grad_norm": 30.048757553100586, - "learning_rate": 1.5373358725067656e-05, - "loss": 1.797, + "epoch": 0.29, + "grad_norm": 10.19784927368164, + "learning_rate": 1.806886164916538e-05, + "loss": 1.0116, "step": 2309 }, { - "epoch": 0.69, - "grad_norm": 19.938982009887695, - "learning_rate": 1.5371354114463266e-05, - "loss": 2.9236, + "epoch": 0.29, + "grad_norm": 8.03913688659668, + "learning_rate": 1.806802493410869e-05, + "loss": 1.6907, "step": 2310 }, { - "epoch": 0.69, - "grad_norm": 19.096269607543945, - "learning_rate": 1.5369349503858876e-05, - "loss": 3.0036, + "epoch": 0.29, + "grad_norm": 10.337860107421875, + "learning_rate": 1.8067188219052003e-05, + "loss": 2.5385, "step": 2311 }, { - "epoch": 0.7, - "grad_norm": 16.5783634185791, - "learning_rate": 1.5367344893254486e-05, - "loss": 2.3479, + "epoch": 0.29, + "grad_norm": 18.989763259887695, + "learning_rate": 1.8066351503995317e-05, + "loss": 3.5781, "step": 2312 }, { - "epoch": 0.7, - "grad_norm": 11.655875205993652, - "learning_rate": 1.5365340282650096e-05, - "loss": 2.0129, + "epoch": 0.29, + "grad_norm": 12.300849914550781, + "learning_rate": 1.806551478893863e-05, + "loss": 3.7757, "step": 2313 }, { - "epoch": 0.7, - "grad_norm": 12.814486503601074, - "learning_rate": 1.5363335672045706e-05, - "loss": 2.283, + "epoch": 0.29, + "grad_norm": 83.98790740966797, + "learning_rate": 1.806467807388194e-05, + "loss": 1.8347, "step": 2314 }, { - "epoch": 0.7, - "grad_norm": 12.23183822631836, - "learning_rate": 1.5361331061441317e-05, - "loss": 2.0969, + "epoch": 0.29, + "grad_norm": 12.340867042541504, + "learning_rate": 1.8063841358825254e-05, + "loss": 2.302, "step": 2315 }, { - "epoch": 0.7, - "grad_norm": 23.845361709594727, - "learning_rate": 1.5359326450836927e-05, - "loss": 2.6363, + "epoch": 0.29, + "grad_norm": 15.20331859588623, + "learning_rate": 1.8063004643768568e-05, + "loss": 1.1444, "step": 2316 }, { - "epoch": 0.7, - "grad_norm": 22.525117874145508, - "learning_rate": 1.5357321840232537e-05, - "loss": 2.7907, + "epoch": 0.29, + "grad_norm": 8.978998184204102, + "learning_rate": 1.8062167928711878e-05, + "loss": 1.4229, "step": 2317 }, { - "epoch": 0.7, - "grad_norm": 35.314666748046875, - "learning_rate": 1.5355317229628147e-05, - "loss": 2.434, + "epoch": 0.29, + "grad_norm": 13.404335021972656, + "learning_rate": 1.806133121365519e-05, + "loss": 2.6539, "step": 2318 }, { - "epoch": 0.7, - "grad_norm": 10.943061828613281, - "learning_rate": 1.5353312619023757e-05, - "loss": 1.8863, + "epoch": 0.29, + "grad_norm": 13.901432991027832, + "learning_rate": 1.8060494498598505e-05, + "loss": 1.7427, "step": 2319 }, { - "epoch": 0.7, - "grad_norm": 16.291006088256836, - "learning_rate": 1.5351308008419367e-05, - "loss": 1.8541, + "epoch": 0.29, + "grad_norm": 17.40442657470703, + "learning_rate": 1.8059657783541816e-05, + "loss": 3.412, "step": 2320 }, { - "epoch": 0.7, - "grad_norm": 12.258851051330566, - "learning_rate": 1.5349303397814977e-05, - "loss": 2.3217, + "epoch": 0.29, + "grad_norm": 13.368928909301758, + "learning_rate": 1.805882106848513e-05, + "loss": 1.7645, "step": 2321 }, { - "epoch": 0.7, - "grad_norm": 12.439240455627441, - "learning_rate": 1.5347298787210584e-05, - "loss": 2.1214, + "epoch": 0.29, + "grad_norm": 31.82679557800293, + "learning_rate": 1.8057984353428443e-05, + "loss": 2.8897, "step": 2322 }, { - "epoch": 0.7, - "grad_norm": 24.879018783569336, - "learning_rate": 1.5345294176606197e-05, - "loss": 2.4778, + "epoch": 0.29, + "grad_norm": 18.478469848632812, + "learning_rate": 1.8057147638371753e-05, + "loss": 2.7191, "step": 2323 }, { - "epoch": 0.7, - "grad_norm": 14.103094100952148, - "learning_rate": 1.5343289566001804e-05, - "loss": 3.2186, + "epoch": 0.29, + "grad_norm": 19.266084671020508, + "learning_rate": 1.8056310923315067e-05, + "loss": 1.6193, "step": 2324 }, { - "epoch": 0.7, - "grad_norm": 16.0571346282959, - "learning_rate": 1.5341284955397414e-05, - "loss": 1.7382, + "epoch": 0.29, + "grad_norm": 8.752995491027832, + "learning_rate": 1.805547420825838e-05, + "loss": 0.6984, "step": 2325 }, { - "epoch": 0.7, - "grad_norm": 14.569293975830078, - "learning_rate": 1.5339280344793027e-05, - "loss": 2.6728, + "epoch": 0.29, + "grad_norm": 6.644970893859863, + "learning_rate": 1.805463749320169e-05, + "loss": 1.3478, "step": 2326 }, { - "epoch": 0.7, - "grad_norm": 19.245534896850586, - "learning_rate": 1.5337275734188634e-05, - "loss": 2.1571, + "epoch": 0.29, + "grad_norm": 9.999451637268066, + "learning_rate": 1.8053800778145004e-05, + "loss": 1.3756, "step": 2327 }, { - "epoch": 0.7, - "grad_norm": 35.83250427246094, - "learning_rate": 1.5335271123584244e-05, - "loss": 3.574, + "epoch": 0.29, + "grad_norm": 9.796958923339844, + "learning_rate": 1.8052964063088318e-05, + "loss": 1.3429, "step": 2328 }, { - "epoch": 0.7, - "grad_norm": 22.268566131591797, - "learning_rate": 1.5333266512979854e-05, - "loss": 2.8414, + "epoch": 0.29, + "grad_norm": 13.106298446655273, + "learning_rate": 1.8052127348031628e-05, + "loss": 1.7764, "step": 2329 }, { - "epoch": 0.7, - "grad_norm": 13.335097312927246, - "learning_rate": 1.5331261902375464e-05, - "loss": 2.2446, + "epoch": 0.29, + "grad_norm": 14.986753463745117, + "learning_rate": 1.8051290632974942e-05, + "loss": 0.9655, "step": 2330 }, { - "epoch": 0.7, - "grad_norm": 21.128978729248047, - "learning_rate": 1.5329257291771075e-05, - "loss": 2.1243, + "epoch": 0.29, + "grad_norm": 15.127579689025879, + "learning_rate": 1.8050453917918252e-05, + "loss": 2.3554, "step": 2331 }, { - "epoch": 0.7, - "grad_norm": 15.031562805175781, - "learning_rate": 1.5327252681166685e-05, - "loss": 2.1856, + "epoch": 0.29, + "grad_norm": 11.969268798828125, + "learning_rate": 1.8049617202861566e-05, + "loss": 2.6626, "step": 2332 }, { - "epoch": 0.7, - "grad_norm": 24.040231704711914, - "learning_rate": 1.5325248070562295e-05, - "loss": 2.7496, + "epoch": 0.29, + "grad_norm": 10.662618637084961, + "learning_rate": 1.804878048780488e-05, + "loss": 2.7009, "step": 2333 }, { - "epoch": 0.7, - "grad_norm": 21.210121154785156, - "learning_rate": 1.5323243459957905e-05, - "loss": 2.1727, + "epoch": 0.29, + "grad_norm": 10.531999588012695, + "learning_rate": 1.804794377274819e-05, + "loss": 1.5971, "step": 2334 }, { - "epoch": 0.7, - "grad_norm": 14.999757766723633, - "learning_rate": 1.5321238849353515e-05, - "loss": 1.8486, + "epoch": 0.29, + "grad_norm": 11.093401908874512, + "learning_rate": 1.8047107057691503e-05, + "loss": 2.4525, "step": 2335 }, { - "epoch": 0.7, - "grad_norm": 21.83721160888672, - "learning_rate": 1.5319234238749125e-05, - "loss": 2.4315, + "epoch": 0.29, + "grad_norm": 15.833708763122559, + "learning_rate": 1.8046270342634817e-05, + "loss": 3.6285, "step": 2336 }, { - "epoch": 0.7, - "grad_norm": 10.136129379272461, - "learning_rate": 1.5317229628144735e-05, - "loss": 1.8972, + "epoch": 0.29, + "grad_norm": 12.989253044128418, + "learning_rate": 1.804543362757813e-05, + "loss": 1.9026, "step": 2337 }, { - "epoch": 0.7, - "grad_norm": 14.037278175354004, - "learning_rate": 1.5315225017540345e-05, - "loss": 1.9107, + "epoch": 0.29, + "grad_norm": 15.443399429321289, + "learning_rate": 1.804459691252144e-05, + "loss": 2.1652, "step": 2338 }, { - "epoch": 0.7, - "grad_norm": 12.793769836425781, - "learning_rate": 1.5313220406935955e-05, - "loss": 1.8938, + "epoch": 0.29, + "grad_norm": 18.758634567260742, + "learning_rate": 1.8043760197464755e-05, + "loss": 3.9224, "step": 2339 }, { - "epoch": 0.7, - "grad_norm": 46.66535568237305, - "learning_rate": 1.5311215796331565e-05, - "loss": 3.1435, + "epoch": 0.29, + "grad_norm": 14.861361503601074, + "learning_rate": 1.8042923482408068e-05, + "loss": 3.4067, "step": 2340 }, { - "epoch": 0.7, - "grad_norm": 24.93342399597168, - "learning_rate": 1.5309211185727172e-05, - "loss": 2.3244, + "epoch": 0.29, + "grad_norm": 50.64338684082031, + "learning_rate": 1.804208676735138e-05, + "loss": 3.8945, "step": 2341 }, { - "epoch": 0.7, - "grad_norm": 17.550573348999023, - "learning_rate": 1.5307206575122785e-05, - "loss": 1.4479, + "epoch": 0.29, + "grad_norm": 10.846446990966797, + "learning_rate": 1.8041250052294692e-05, + "loss": 3.1811, "step": 2342 }, { - "epoch": 0.7, - "grad_norm": 23.929763793945312, - "learning_rate": 1.5305201964518392e-05, - "loss": 2.6339, + "epoch": 0.29, + "grad_norm": 8.02611255645752, + "learning_rate": 1.8040413337238006e-05, + "loss": 0.9488, "step": 2343 }, { - "epoch": 0.7, - "grad_norm": 18.37746238708496, - "learning_rate": 1.5303197353914002e-05, - "loss": 1.916, + "epoch": 0.29, + "grad_norm": 23.65349578857422, + "learning_rate": 1.803957662218132e-05, + "loss": 1.5523, "step": 2344 }, { - "epoch": 0.71, - "grad_norm": 32.94269943237305, - "learning_rate": 1.5301192743309616e-05, - "loss": 2.7435, + "epoch": 0.29, + "grad_norm": 25.175683975219727, + "learning_rate": 1.803873990712463e-05, + "loss": 3.753, "step": 2345 }, { - "epoch": 0.71, - "grad_norm": 14.292465209960938, - "learning_rate": 1.5299188132705222e-05, - "loss": 1.8105, + "epoch": 0.29, + "grad_norm": 14.84681510925293, + "learning_rate": 1.8037903192067943e-05, + "loss": 1.4025, "step": 2346 }, { - "epoch": 0.71, - "grad_norm": 12.711530685424805, - "learning_rate": 1.5297183522100832e-05, - "loss": 1.9106, + "epoch": 0.29, + "grad_norm": 8.496464729309082, + "learning_rate": 1.8037066477011257e-05, + "loss": 1.9409, "step": 2347 }, { - "epoch": 0.71, - "grad_norm": 16.853271484375, - "learning_rate": 1.5295178911496443e-05, - "loss": 1.9825, + "epoch": 0.29, + "grad_norm": 9.734086036682129, + "learning_rate": 1.8036229761954567e-05, + "loss": 1.5214, "step": 2348 }, { - "epoch": 0.71, - "grad_norm": 17.740068435668945, - "learning_rate": 1.5293174300892053e-05, - "loss": 1.5591, + "epoch": 0.29, + "grad_norm": 10.568015098571777, + "learning_rate": 1.803539304689788e-05, + "loss": 1.6711, "step": 2349 }, { - "epoch": 0.71, - "grad_norm": 19.151620864868164, - "learning_rate": 1.5291169690287663e-05, - "loss": 3.9344, + "epoch": 0.29, + "grad_norm": 18.141674041748047, + "learning_rate": 1.8034556331841195e-05, + "loss": 2.8501, "step": 2350 }, { - "epoch": 0.71, - "grad_norm": 14.547835350036621, - "learning_rate": 1.5289165079683273e-05, - "loss": 2.0179, + "epoch": 0.3, + "grad_norm": 10.642548561096191, + "learning_rate": 1.8033719616784505e-05, + "loss": 0.7835, "step": 2351 }, { - "epoch": 0.71, - "grad_norm": 14.701192855834961, - "learning_rate": 1.5287160469078883e-05, - "loss": 2.1844, + "epoch": 0.3, + "grad_norm": 11.74319839477539, + "learning_rate": 1.803288290172782e-05, + "loss": 1.8802, "step": 2352 }, { - "epoch": 0.71, - "grad_norm": 20.547576904296875, - "learning_rate": 1.5285155858474493e-05, - "loss": 2.5008, + "epoch": 0.3, + "grad_norm": 19.375816345214844, + "learning_rate": 1.8032046186671132e-05, + "loss": 1.8404, "step": 2353 }, { - "epoch": 0.71, - "grad_norm": 13.004501342773438, - "learning_rate": 1.5283151247870103e-05, - "loss": 1.8492, + "epoch": 0.3, + "grad_norm": 8.290349006652832, + "learning_rate": 1.8031209471614442e-05, + "loss": 0.4438, "step": 2354 }, { - "epoch": 0.71, - "grad_norm": 18.574153900146484, - "learning_rate": 1.528114663726571e-05, - "loss": 1.7889, + "epoch": 0.3, + "grad_norm": 10.130553245544434, + "learning_rate": 1.8030372756557756e-05, + "loss": 1.3496, "step": 2355 }, { - "epoch": 0.71, - "grad_norm": 21.489116668701172, - "learning_rate": 1.5279142026661323e-05, - "loss": 2.2442, + "epoch": 0.3, + "grad_norm": 13.821660995483398, + "learning_rate": 1.8029536041501066e-05, + "loss": 1.9425, "step": 2356 }, { - "epoch": 0.71, - "grad_norm": 11.905816078186035, - "learning_rate": 1.5277137416056933e-05, - "loss": 1.6658, + "epoch": 0.3, + "grad_norm": 15.139850616455078, + "learning_rate": 1.802869932644438e-05, + "loss": 4.0812, "step": 2357 }, { - "epoch": 0.71, - "grad_norm": 16.373111724853516, - "learning_rate": 1.527513280545254e-05, - "loss": 1.7949, + "epoch": 0.3, + "grad_norm": 22.017791748046875, + "learning_rate": 1.8027862611387694e-05, + "loss": 3.3954, "step": 2358 }, { - "epoch": 0.71, - "grad_norm": 14.20676326751709, - "learning_rate": 1.5273128194848153e-05, - "loss": 2.0334, + "epoch": 0.3, + "grad_norm": 5.533475875854492, + "learning_rate": 1.8027025896331004e-05, + "loss": 0.5178, "step": 2359 }, { - "epoch": 0.71, - "grad_norm": 14.752305030822754, - "learning_rate": 1.527112358424376e-05, - "loss": 2.1005, + "epoch": 0.3, + "grad_norm": 17.990325927734375, + "learning_rate": 1.8026189181274317e-05, + "loss": 2.4957, "step": 2360 }, { - "epoch": 0.71, - "grad_norm": 13.963676452636719, - "learning_rate": 1.526911897363937e-05, - "loss": 2.9856, + "epoch": 0.3, + "grad_norm": 13.587029457092285, + "learning_rate": 1.802535246621763e-05, + "loss": 1.8408, "step": 2361 }, { - "epoch": 0.71, - "grad_norm": 11.170400619506836, - "learning_rate": 1.5267114363034984e-05, - "loss": 1.8121, + "epoch": 0.3, + "grad_norm": 6.42975378036499, + "learning_rate": 1.802451575116094e-05, + "loss": 1.965, "step": 2362 }, { - "epoch": 0.71, - "grad_norm": 17.13726043701172, - "learning_rate": 1.526510975243059e-05, - "loss": 2.1476, + "epoch": 0.3, + "grad_norm": 24.11678123474121, + "learning_rate": 1.8023679036104255e-05, + "loss": 2.8355, "step": 2363 }, { - "epoch": 0.71, - "grad_norm": 79.9980697631836, - "learning_rate": 1.52631051418262e-05, - "loss": 3.3246, + "epoch": 0.3, + "grad_norm": 7.636974334716797, + "learning_rate": 1.802284232104757e-05, + "loss": 0.7662, "step": 2364 }, { - "epoch": 0.71, - "grad_norm": 21.73980712890625, - "learning_rate": 1.526110053122181e-05, - "loss": 2.9358, + "epoch": 0.3, + "grad_norm": 9.104048728942871, + "learning_rate": 1.8022005605990882e-05, + "loss": 1.3221, "step": 2365 }, { - "epoch": 0.71, - "grad_norm": 13.559164047241211, - "learning_rate": 1.525909592061742e-05, - "loss": 2.0114, + "epoch": 0.3, + "grad_norm": 20.817089080810547, + "learning_rate": 1.8021168890934193e-05, + "loss": 2.1917, "step": 2366 }, { - "epoch": 0.71, - "grad_norm": 14.406414031982422, - "learning_rate": 1.525709131001303e-05, - "loss": 1.1554, + "epoch": 0.3, + "grad_norm": 12.99535846710205, + "learning_rate": 1.8020332175877506e-05, + "loss": 1.9057, "step": 2367 }, { - "epoch": 0.71, - "grad_norm": 11.396732330322266, - "learning_rate": 1.5255086699408641e-05, - "loss": 2.1299, + "epoch": 0.3, + "grad_norm": 17.81003761291504, + "learning_rate": 1.801949546082082e-05, + "loss": 1.3834, "step": 2368 }, { - "epoch": 0.71, - "grad_norm": 22.816299438476562, - "learning_rate": 1.5253082088804253e-05, - "loss": 2.8568, + "epoch": 0.3, + "grad_norm": 23.86208152770996, + "learning_rate": 1.801865874576413e-05, + "loss": 3.3293, "step": 2369 }, { - "epoch": 0.71, - "grad_norm": 19.31545066833496, - "learning_rate": 1.5251077478199861e-05, - "loss": 2.9191, + "epoch": 0.3, + "grad_norm": 12.152969360351562, + "learning_rate": 1.8017822030707444e-05, + "loss": 1.6765, "step": 2370 }, { - "epoch": 0.71, - "grad_norm": 20.314498901367188, - "learning_rate": 1.5249072867595471e-05, - "loss": 1.6643, + "epoch": 0.3, + "grad_norm": 38.21599197387695, + "learning_rate": 1.8016985315650757e-05, + "loss": 1.3122, "step": 2371 }, { - "epoch": 0.71, - "grad_norm": 21.570932388305664, - "learning_rate": 1.524706825699108e-05, - "loss": 2.4345, + "epoch": 0.3, + "grad_norm": 19.01609230041504, + "learning_rate": 1.801614860059407e-05, + "loss": 3.2396, "step": 2372 }, { - "epoch": 0.71, - "grad_norm": 12.214997291564941, - "learning_rate": 1.5245063646386691e-05, - "loss": 1.8357, + "epoch": 0.3, + "grad_norm": 30.038503646850586, + "learning_rate": 1.801531188553738e-05, + "loss": 2.8291, "step": 2373 }, { - "epoch": 0.71, - "grad_norm": 15.286229133605957, - "learning_rate": 1.5243059035782301e-05, - "loss": 1.6836, + "epoch": 0.3, + "grad_norm": 16.662433624267578, + "learning_rate": 1.8014475170480695e-05, + "loss": 2.5137, "step": 2374 }, { - "epoch": 0.71, - "grad_norm": 18.07011604309082, - "learning_rate": 1.524105442517791e-05, - "loss": 2.6134, + "epoch": 0.3, + "grad_norm": 13.939742088317871, + "learning_rate": 1.801363845542401e-05, + "loss": 1.557, "step": 2375 }, { - "epoch": 0.71, - "grad_norm": 13.150678634643555, - "learning_rate": 1.5239049814573522e-05, - "loss": 2.336, + "epoch": 0.3, + "grad_norm": 8.272865295410156, + "learning_rate": 1.801280174036732e-05, + "loss": 2.2698, "step": 2376 }, { - "epoch": 0.71, - "grad_norm": 14.786459922790527, - "learning_rate": 1.523704520396913e-05, - "loss": 1.8392, + "epoch": 0.3, + "grad_norm": 22.545406341552734, + "learning_rate": 1.8011965025310633e-05, + "loss": 4.3373, "step": 2377 }, { - "epoch": 0.71, - "grad_norm": 12.723509788513184, - "learning_rate": 1.523504059336474e-05, - "loss": 2.4568, + "epoch": 0.3, + "grad_norm": 18.655996322631836, + "learning_rate": 1.8011128310253946e-05, + "loss": 3.117, "step": 2378 }, { - "epoch": 0.72, - "grad_norm": 17.796947479248047, - "learning_rate": 1.5233035982760348e-05, - "loss": 2.8035, + "epoch": 0.3, + "grad_norm": 9.008397102355957, + "learning_rate": 1.8010291595197256e-05, + "loss": 2.2974, "step": 2379 }, { - "epoch": 0.72, - "grad_norm": 7.9330854415893555, - "learning_rate": 1.523103137215596e-05, - "loss": 1.0556, + "epoch": 0.3, + "grad_norm": 11.4547758102417, + "learning_rate": 1.800945488014057e-05, + "loss": 2.4967, "step": 2380 }, { - "epoch": 0.72, - "grad_norm": 20.23641586303711, - "learning_rate": 1.522902676155157e-05, - "loss": 2.4805, + "epoch": 0.3, + "grad_norm": 15.299418449401855, + "learning_rate": 1.800861816508388e-05, + "loss": 3.1593, "step": 2381 }, { - "epoch": 0.72, - "grad_norm": 14.428129196166992, - "learning_rate": 1.5227022150947179e-05, - "loss": 2.3162, + "epoch": 0.3, + "grad_norm": 9.479822158813477, + "learning_rate": 1.8007781450027194e-05, + "loss": 1.0239, "step": 2382 }, { - "epoch": 0.72, - "grad_norm": 19.485916137695312, - "learning_rate": 1.522501754034279e-05, - "loss": 2.8538, + "epoch": 0.3, + "grad_norm": 17.806331634521484, + "learning_rate": 1.8006944734970508e-05, + "loss": 3.0007, "step": 2383 }, { - "epoch": 0.72, - "grad_norm": 23.651782989501953, - "learning_rate": 1.5223012929738399e-05, - "loss": 2.2296, + "epoch": 0.3, + "grad_norm": 8.056856155395508, + "learning_rate": 1.8006108019913818e-05, + "loss": 1.9608, "step": 2384 }, { - "epoch": 0.72, - "grad_norm": 20.420745849609375, - "learning_rate": 1.5221008319134009e-05, - "loss": 2.8712, + "epoch": 0.3, + "grad_norm": 24.79678726196289, + "learning_rate": 1.800527130485713e-05, + "loss": 4.0215, "step": 2385 }, { - "epoch": 0.72, - "grad_norm": 13.436408042907715, - "learning_rate": 1.5219003708529619e-05, - "loss": 2.4493, + "epoch": 0.3, + "grad_norm": 7.317874908447266, + "learning_rate": 1.8004434589800445e-05, + "loss": 2.7529, "step": 2386 }, { - "epoch": 0.72, - "grad_norm": 9.011188507080078, - "learning_rate": 1.5216999097925229e-05, - "loss": 1.8621, + "epoch": 0.3, + "grad_norm": 11.379854202270508, + "learning_rate": 1.8003597874743755e-05, + "loss": 3.1902, "step": 2387 }, { - "epoch": 0.72, - "grad_norm": 24.090200424194336, - "learning_rate": 1.521499448732084e-05, - "loss": 2.7823, + "epoch": 0.3, + "grad_norm": 7.822103023529053, + "learning_rate": 1.800276115968707e-05, + "loss": 0.639, "step": 2388 }, { - "epoch": 0.72, - "grad_norm": 22.469762802124023, - "learning_rate": 1.521298987671645e-05, - "loss": 2.1563, + "epoch": 0.3, + "grad_norm": 10.701630592346191, + "learning_rate": 1.8001924444630383e-05, + "loss": 1.1973, "step": 2389 }, { - "epoch": 0.72, - "grad_norm": 20.96700668334961, - "learning_rate": 1.521098526611206e-05, - "loss": 2.3123, + "epoch": 0.3, + "grad_norm": 4.938593864440918, + "learning_rate": 1.8001087729573693e-05, + "loss": 0.2573, "step": 2390 }, { - "epoch": 0.72, - "grad_norm": 13.538233757019043, - "learning_rate": 1.5208980655507668e-05, - "loss": 1.8336, + "epoch": 0.3, + "grad_norm": 14.3922119140625, + "learning_rate": 1.8000251014517007e-05, + "loss": 3.0215, "step": 2391 }, { - "epoch": 0.72, - "grad_norm": 19.34774398803711, - "learning_rate": 1.520697604490328e-05, - "loss": 2.2524, + "epoch": 0.3, + "grad_norm": 32.6680908203125, + "learning_rate": 1.799941429946032e-05, + "loss": 3.7568, "step": 2392 }, { - "epoch": 0.72, - "grad_norm": 23.267269134521484, - "learning_rate": 1.520497143429889e-05, - "loss": 3.195, + "epoch": 0.3, + "grad_norm": 61.6491584777832, + "learning_rate": 1.7998577584403634e-05, + "loss": 3.0698, "step": 2393 }, { - "epoch": 0.72, - "grad_norm": 43.13511657714844, - "learning_rate": 1.5202966823694498e-05, - "loss": 2.9556, + "epoch": 0.3, + "grad_norm": 9.55639362335205, + "learning_rate": 1.7997740869346944e-05, + "loss": 1.6788, "step": 2394 }, { - "epoch": 0.72, - "grad_norm": 12.41313362121582, - "learning_rate": 1.520096221309011e-05, - "loss": 2.4461, + "epoch": 0.3, + "grad_norm": 16.137413024902344, + "learning_rate": 1.7996904154290258e-05, + "loss": 1.9237, "step": 2395 }, { - "epoch": 0.72, - "grad_norm": 16.363807678222656, - "learning_rate": 1.5198957602485718e-05, - "loss": 2.2686, + "epoch": 0.3, + "grad_norm": 22.509485244750977, + "learning_rate": 1.799606743923357e-05, + "loss": 3.7312, "step": 2396 }, { - "epoch": 0.72, - "grad_norm": 18.332998275756836, - "learning_rate": 1.5196952991881328e-05, - "loss": 2.3713, + "epoch": 0.3, + "grad_norm": 15.136279106140137, + "learning_rate": 1.7995230724176882e-05, + "loss": 2.7475, "step": 2397 }, { - "epoch": 0.72, - "grad_norm": 15.461286544799805, - "learning_rate": 1.5194948381276937e-05, - "loss": 2.0574, + "epoch": 0.3, + "grad_norm": 8.282188415527344, + "learning_rate": 1.7994394009120195e-05, + "loss": 2.5397, "step": 2398 }, { - "epoch": 0.72, - "grad_norm": 32.771728515625, - "learning_rate": 1.5192943770672548e-05, - "loss": 2.7732, + "epoch": 0.3, + "grad_norm": 21.439027786254883, + "learning_rate": 1.799355729406351e-05, + "loss": 2.3746, "step": 2399 }, { - "epoch": 0.72, - "grad_norm": 15.005850791931152, - "learning_rate": 1.5190939160068158e-05, - "loss": 2.0059, + "epoch": 0.3, + "grad_norm": 15.517571449279785, + "learning_rate": 1.7992720579006823e-05, + "loss": 1.6803, "step": 2400 }, { - "epoch": 0.72, - "eval_loss": 0.2799599766731262, - "eval_runtime": 43.2863, - "eval_samples_per_second": 34.168, - "eval_steps_per_second": 34.168, + "epoch": 0.3, + "eval_loss": 0.14937622845172882, + "eval_runtime": 93.6313, + "eval_samples_per_second": 37.829, + "eval_steps_per_second": 37.829, "step": 2400 }, { - "epoch": 0.72, - "grad_norm": 18.537982940673828, - "learning_rate": 1.5188934549463767e-05, - "loss": 2.3335, + "epoch": 0.3, + "grad_norm": 6.837953090667725, + "learning_rate": 1.7991883863950133e-05, + "loss": 1.6215, "step": 2401 }, { - "epoch": 0.72, - "grad_norm": 28.75159454345703, - "learning_rate": 1.5186929938859379e-05, - "loss": 2.5293, + "epoch": 0.3, + "grad_norm": 9.428163528442383, + "learning_rate": 1.7991047148893447e-05, + "loss": 1.1436, "step": 2402 }, { - "epoch": 0.72, - "grad_norm": 19.689199447631836, - "learning_rate": 1.5184925328254987e-05, - "loss": 3.4202, + "epoch": 0.3, + "grad_norm": 12.235206604003906, + "learning_rate": 1.799021043383676e-05, + "loss": 2.8743, "step": 2403 }, { - "epoch": 0.72, - "grad_norm": 16.85619354248047, - "learning_rate": 1.5182920717650597e-05, - "loss": 2.1249, + "epoch": 0.3, + "grad_norm": 16.27605628967285, + "learning_rate": 1.798937371878007e-05, + "loss": 4.4169, "step": 2404 }, { - "epoch": 0.72, - "grad_norm": 15.209796905517578, - "learning_rate": 1.5180916107046209e-05, - "loss": 2.1942, + "epoch": 0.3, + "grad_norm": 23.758119583129883, + "learning_rate": 1.7988537003723384e-05, + "loss": 5.8766, "step": 2405 }, { - "epoch": 0.72, - "grad_norm": 19.145883560180664, - "learning_rate": 1.5178911496441817e-05, - "loss": 2.3506, + "epoch": 0.3, + "grad_norm": 7.420042514801025, + "learning_rate": 1.7987700288666698e-05, + "loss": 3.2284, "step": 2406 }, { - "epoch": 0.72, - "grad_norm": 13.587536811828613, - "learning_rate": 1.5176906885837427e-05, - "loss": 2.483, + "epoch": 0.3, + "grad_norm": 10.003791809082031, + "learning_rate": 1.7986863573610008e-05, + "loss": 0.8627, "step": 2407 }, { - "epoch": 0.72, - "grad_norm": 14.813488960266113, - "learning_rate": 1.5174902275233036e-05, - "loss": 1.6431, + "epoch": 0.3, + "grad_norm": 9.849807739257812, + "learning_rate": 1.7986026858553322e-05, + "loss": 3.7947, "step": 2408 }, { - "epoch": 0.72, - "grad_norm": 7.390352249145508, - "learning_rate": 1.5172897664628648e-05, - "loss": 1.329, + "epoch": 0.3, + "grad_norm": 13.273680686950684, + "learning_rate": 1.7985190143496632e-05, + "loss": 3.0434, "step": 2409 }, { - "epoch": 0.72, - "grad_norm": 124.1446304321289, - "learning_rate": 1.5170893054024256e-05, - "loss": 2.8408, + "epoch": 0.3, + "grad_norm": 23.513259887695312, + "learning_rate": 1.7984353428439946e-05, + "loss": 1.9259, "step": 2410 }, { - "epoch": 0.72, - "grad_norm": 31.081998825073242, - "learning_rate": 1.5168888443419868e-05, - "loss": 2.8232, + "epoch": 0.3, + "grad_norm": 5.763058662414551, + "learning_rate": 1.798351671338326e-05, + "loss": 0.232, "step": 2411 }, { - "epoch": 0.73, - "grad_norm": 18.641647338867188, - "learning_rate": 1.5166883832815478e-05, - "loss": 1.8497, + "epoch": 0.3, + "grad_norm": 27.04803466796875, + "learning_rate": 1.798267999832657e-05, + "loss": 3.0844, "step": 2412 }, { - "epoch": 0.73, - "grad_norm": 17.504941940307617, - "learning_rate": 1.5164879222211086e-05, - "loss": 2.4579, + "epoch": 0.3, + "grad_norm": 10.611763954162598, + "learning_rate": 1.7981843283269883e-05, + "loss": 3.6497, "step": 2413 }, { - "epoch": 0.73, - "grad_norm": 11.096992492675781, - "learning_rate": 1.5162874611606698e-05, - "loss": 2.1797, + "epoch": 0.3, + "grad_norm": 8.736261367797852, + "learning_rate": 1.7981006568213197e-05, + "loss": 2.5193, "step": 2414 }, { - "epoch": 0.73, - "grad_norm": 14.132731437683105, - "learning_rate": 1.5160870001002306e-05, - "loss": 1.5483, + "epoch": 0.3, + "grad_norm": 27.361827850341797, + "learning_rate": 1.7980169853156507e-05, + "loss": 1.9007, "step": 2415 }, { - "epoch": 0.73, - "grad_norm": 49.675132751464844, - "learning_rate": 1.5158865390397916e-05, - "loss": 2.4514, + "epoch": 0.3, + "grad_norm": 6.731814861297607, + "learning_rate": 1.797933313809982e-05, + "loss": 1.5845, "step": 2416 }, { - "epoch": 0.73, - "grad_norm": 12.76675796508789, - "learning_rate": 1.5156860779793528e-05, - "loss": 1.2279, + "epoch": 0.3, + "grad_norm": 12.201180458068848, + "learning_rate": 1.7978496423043134e-05, + "loss": 2.0134, "step": 2417 }, { - "epoch": 0.73, - "grad_norm": 10.700181007385254, - "learning_rate": 1.5154856169189137e-05, - "loss": 2.0745, + "epoch": 0.3, + "grad_norm": 16.464221954345703, + "learning_rate": 1.7977659707986445e-05, + "loss": 2.1593, "step": 2418 }, { - "epoch": 0.73, - "grad_norm": 17.42853546142578, - "learning_rate": 1.5152851558584747e-05, - "loss": 2.7441, + "epoch": 0.3, + "grad_norm": 10.752031326293945, + "learning_rate": 1.797682299292976e-05, + "loss": 1.6055, "step": 2419 }, { - "epoch": 0.73, - "grad_norm": 8.670104026794434, - "learning_rate": 1.5150846947980355e-05, - "loss": 1.8953, + "epoch": 0.3, + "grad_norm": 15.68919849395752, + "learning_rate": 1.7975986277873072e-05, + "loss": 1.6983, "step": 2420 }, { - "epoch": 0.73, - "grad_norm": 24.383087158203125, - "learning_rate": 1.5148842337375967e-05, - "loss": 2.6507, + "epoch": 0.3, + "grad_norm": 15.785876274108887, + "learning_rate": 1.7975149562816386e-05, + "loss": 1.7882, "step": 2421 }, { - "epoch": 0.73, - "grad_norm": 12.565937042236328, - "learning_rate": 1.5146837726771575e-05, - "loss": 1.5887, + "epoch": 0.3, + "grad_norm": 7.236278533935547, + "learning_rate": 1.7974312847759696e-05, + "loss": 1.3744, "step": 2422 }, { - "epoch": 0.73, - "grad_norm": 22.554595947265625, - "learning_rate": 1.5144833116167185e-05, - "loss": 2.4942, + "epoch": 0.3, + "grad_norm": 17.342849731445312, + "learning_rate": 1.797347613270301e-05, + "loss": 4.0136, "step": 2423 }, { - "epoch": 0.73, - "grad_norm": 8.884187698364258, - "learning_rate": 1.5142828505562797e-05, - "loss": 1.2549, + "epoch": 0.3, + "grad_norm": 12.334209442138672, + "learning_rate": 1.7972639417646323e-05, + "loss": 2.6935, "step": 2424 }, { - "epoch": 0.73, - "grad_norm": 15.001145362854004, - "learning_rate": 1.5140823894958406e-05, - "loss": 2.1248, + "epoch": 0.3, + "grad_norm": 20.487749099731445, + "learning_rate": 1.7971802702589633e-05, + "loss": 1.966, "step": 2425 }, { - "epoch": 0.73, - "grad_norm": 13.459707260131836, - "learning_rate": 1.5138819284354016e-05, - "loss": 2.0756, + "epoch": 0.3, + "grad_norm": 36.53834533691406, + "learning_rate": 1.7970965987532947e-05, + "loss": 3.8524, "step": 2426 }, { - "epoch": 0.73, - "grad_norm": 15.579310417175293, - "learning_rate": 1.5136814673749624e-05, - "loss": 2.8265, + "epoch": 0.3, + "grad_norm": 10.720416069030762, + "learning_rate": 1.797012927247626e-05, + "loss": 0.6583, "step": 2427 }, { - "epoch": 0.73, - "grad_norm": 12.242110252380371, - "learning_rate": 1.5134810063145236e-05, - "loss": 2.2286, + "epoch": 0.3, + "grad_norm": 11.420807838439941, + "learning_rate": 1.7969292557419574e-05, + "loss": 2.1776, "step": 2428 }, { - "epoch": 0.73, - "grad_norm": 30.915904998779297, - "learning_rate": 1.5132805452540844e-05, - "loss": 2.2951, + "epoch": 0.3, + "grad_norm": 22.484323501586914, + "learning_rate": 1.7968455842362885e-05, + "loss": 4.2214, "step": 2429 }, { - "epoch": 0.73, - "grad_norm": 15.82615852355957, - "learning_rate": 1.5130800841936454e-05, - "loss": 2.0723, + "epoch": 0.3, + "grad_norm": 12.813691139221191, + "learning_rate": 1.79676191273062e-05, + "loss": 3.3768, "step": 2430 }, { - "epoch": 0.73, - "grad_norm": 15.196529388427734, - "learning_rate": 1.5128796231332066e-05, - "loss": 2.2142, + "epoch": 0.31, + "grad_norm": 30.47164535522461, + "learning_rate": 1.7966782412249512e-05, + "loss": 2.4702, "step": 2431 }, { - "epoch": 0.73, - "grad_norm": 43.0781135559082, - "learning_rate": 1.5126791620727674e-05, - "loss": 3.249, + "epoch": 0.31, + "grad_norm": 12.788331985473633, + "learning_rate": 1.7965945697192822e-05, + "loss": 2.0694, "step": 2432 }, { - "epoch": 0.73, - "grad_norm": 10.74176025390625, - "learning_rate": 1.5124787010123284e-05, - "loss": 1.763, + "epoch": 0.31, + "grad_norm": 14.737342834472656, + "learning_rate": 1.7965108982136136e-05, + "loss": 1.2319, "step": 2433 }, { - "epoch": 0.73, - "grad_norm": 14.15688705444336, - "learning_rate": 1.5122782399518895e-05, - "loss": 1.8488, + "epoch": 0.31, + "grad_norm": 16.493003845214844, + "learning_rate": 1.7964272267079446e-05, + "loss": 1.5493, "step": 2434 }, { - "epoch": 0.73, - "grad_norm": 16.205501556396484, - "learning_rate": 1.5120777788914505e-05, - "loss": 2.8312, + "epoch": 0.31, + "grad_norm": 8.839437484741211, + "learning_rate": 1.796343555202276e-05, + "loss": 2.0695, "step": 2435 }, { - "epoch": 0.73, - "grad_norm": 23.575292587280273, - "learning_rate": 1.5118773178310115e-05, - "loss": 1.8974, + "epoch": 0.31, + "grad_norm": 11.862302780151367, + "learning_rate": 1.7962598836966073e-05, + "loss": 2.0112, "step": 2436 }, { - "epoch": 0.73, - "grad_norm": 12.507033348083496, - "learning_rate": 1.5116768567705725e-05, - "loss": 2.0892, + "epoch": 0.31, + "grad_norm": 9.451587677001953, + "learning_rate": 1.7961762121909384e-05, + "loss": 2.5243, "step": 2437 }, { - "epoch": 0.73, - "grad_norm": 11.459385871887207, - "learning_rate": 1.5114763957101335e-05, - "loss": 2.232, + "epoch": 0.31, + "grad_norm": 17.41968536376953, + "learning_rate": 1.7960925406852697e-05, + "loss": 1.4975, "step": 2438 }, { - "epoch": 0.73, - "grad_norm": 13.064870834350586, - "learning_rate": 1.5112759346496943e-05, - "loss": 2.9855, + "epoch": 0.31, + "grad_norm": 8.577614784240723, + "learning_rate": 1.796008869179601e-05, + "loss": 3.4461, "step": 2439 }, { - "epoch": 0.73, - "grad_norm": 21.083112716674805, - "learning_rate": 1.5110754735892555e-05, - "loss": 2.7381, + "epoch": 0.31, + "grad_norm": 8.34268856048584, + "learning_rate": 1.795925197673932e-05, + "loss": 0.9853, "step": 2440 }, { - "epoch": 0.73, - "grad_norm": 16.428953170776367, - "learning_rate": 1.5108750125288163e-05, - "loss": 2.8427, + "epoch": 0.31, + "grad_norm": 19.16912841796875, + "learning_rate": 1.7958415261682635e-05, + "loss": 2.7435, "step": 2441 }, { - "epoch": 0.73, - "grad_norm": 13.690245628356934, - "learning_rate": 1.5106745514683774e-05, - "loss": 1.9179, + "epoch": 0.31, + "grad_norm": 14.721293449401855, + "learning_rate": 1.795757854662595e-05, + "loss": 4.1871, "step": 2442 }, { - "epoch": 0.73, - "grad_norm": 13.37304401397705, - "learning_rate": 1.5104740904079385e-05, - "loss": 1.8193, + "epoch": 0.31, + "grad_norm": 18.573949813842773, + "learning_rate": 1.795674183156926e-05, + "loss": 2.5489, "step": 2443 }, { - "epoch": 0.73, - "grad_norm": 10.359079360961914, - "learning_rate": 1.5102736293474994e-05, - "loss": 2.5558, + "epoch": 0.31, + "grad_norm": 14.529594421386719, + "learning_rate": 1.7955905116512572e-05, + "loss": 3.8118, "step": 2444 }, { - "epoch": 0.74, - "grad_norm": 12.4136381149292, - "learning_rate": 1.5100731682870604e-05, - "loss": 2.191, + "epoch": 0.31, + "grad_norm": 8.199951171875, + "learning_rate": 1.7955068401455886e-05, + "loss": 2.007, "step": 2445 }, { - "epoch": 0.74, - "grad_norm": 9.771052360534668, - "learning_rate": 1.5098727072266212e-05, - "loss": 1.5917, + "epoch": 0.31, + "grad_norm": 196.73123168945312, + "learning_rate": 1.7954231686399196e-05, + "loss": 3.6715, "step": 2446 }, { - "epoch": 0.74, - "grad_norm": 12.872698783874512, - "learning_rate": 1.5096722461661824e-05, - "loss": 1.6524, + "epoch": 0.31, + "grad_norm": 13.598657608032227, + "learning_rate": 1.795339497134251e-05, + "loss": 0.8439, "step": 2447 }, { - "epoch": 0.74, - "grad_norm": 28.35697364807129, - "learning_rate": 1.5094717851057434e-05, - "loss": 2.569, + "epoch": 0.31, + "grad_norm": 15.713129997253418, + "learning_rate": 1.7952558256285824e-05, + "loss": 4.4424, "step": 2448 }, { - "epoch": 0.74, - "grad_norm": 18.459850311279297, - "learning_rate": 1.5092713240453042e-05, - "loss": 2.1667, + "epoch": 0.31, + "grad_norm": 11.949092864990234, + "learning_rate": 1.7951721541229137e-05, + "loss": 2.1954, "step": 2449 }, { - "epoch": 0.74, - "grad_norm": 12.4850492477417, - "learning_rate": 1.5090708629848654e-05, - "loss": 2.2356, + "epoch": 0.31, + "grad_norm": 17.65532112121582, + "learning_rate": 1.7950884826172448e-05, + "loss": 1.9608, "step": 2450 }, { - "epoch": 0.74, - "grad_norm": 21.161081314086914, - "learning_rate": 1.5088704019244263e-05, - "loss": 2.1366, + "epoch": 0.31, + "grad_norm": 25.382020950317383, + "learning_rate": 1.795004811111576e-05, + "loss": 3.9136, "step": 2451 }, { - "epoch": 0.74, - "grad_norm": 22.59266471862793, - "learning_rate": 1.5086699408639873e-05, - "loss": 2.4504, + "epoch": 0.31, + "grad_norm": 10.749935150146484, + "learning_rate": 1.7949211396059075e-05, + "loss": 2.5799, "step": 2452 }, { - "epoch": 0.74, - "grad_norm": 11.43664836883545, - "learning_rate": 1.5084694798035481e-05, - "loss": 2.434, + "epoch": 0.31, + "grad_norm": 9.420044898986816, + "learning_rate": 1.7948374681002385e-05, + "loss": 0.2144, "step": 2453 }, { - "epoch": 0.74, - "grad_norm": 12.218484878540039, - "learning_rate": 1.5082690187431093e-05, - "loss": 1.2119, + "epoch": 0.31, + "grad_norm": 20.088491439819336, + "learning_rate": 1.79475379659457e-05, + "loss": 3.2122, "step": 2454 }, { - "epoch": 0.74, - "grad_norm": 10.82869815826416, - "learning_rate": 1.5080685576826703e-05, - "loss": 1.8022, + "epoch": 0.31, + "grad_norm": 10.93820858001709, + "learning_rate": 1.7946701250889012e-05, + "loss": 1.3192, "step": 2455 }, { - "epoch": 0.74, - "grad_norm": 17.006765365600586, - "learning_rate": 1.5078680966222311e-05, - "loss": 2.1377, + "epoch": 0.31, + "grad_norm": 13.77704906463623, + "learning_rate": 1.7945864535832326e-05, + "loss": 1.2174, "step": 2456 }, { - "epoch": 0.74, - "grad_norm": 21.112869262695312, - "learning_rate": 1.5076676355617923e-05, - "loss": 2.799, + "epoch": 0.31, + "grad_norm": 11.582486152648926, + "learning_rate": 1.7945027820775636e-05, + "loss": 2.6627, "step": 2457 }, { - "epoch": 0.74, - "grad_norm": 24.335466384887695, - "learning_rate": 1.5074671745013532e-05, - "loss": 2.4533, + "epoch": 0.31, + "grad_norm": 13.67541790008545, + "learning_rate": 1.794419110571895e-05, + "loss": 2.7674, "step": 2458 }, { - "epoch": 0.74, - "grad_norm": 20.010364532470703, - "learning_rate": 1.5072667134409142e-05, - "loss": 1.9907, + "epoch": 0.31, + "grad_norm": 14.400947570800781, + "learning_rate": 1.7943354390662264e-05, + "loss": 0.8326, "step": 2459 }, { - "epoch": 0.74, - "grad_norm": 34.753047943115234, - "learning_rate": 1.5070662523804753e-05, - "loss": 2.0948, + "epoch": 0.31, + "grad_norm": 11.022433280944824, + "learning_rate": 1.7942517675605574e-05, + "loss": 2.6278, "step": 2460 }, { - "epoch": 0.74, - "grad_norm": 19.572534561157227, - "learning_rate": 1.5068657913200362e-05, - "loss": 2.1058, + "epoch": 0.31, + "grad_norm": 8.584240913391113, + "learning_rate": 1.7941680960548888e-05, + "loss": 1.2164, "step": 2461 }, { - "epoch": 0.74, - "grad_norm": 13.721003532409668, - "learning_rate": 1.5066653302595972e-05, - "loss": 1.4905, + "epoch": 0.31, + "grad_norm": 11.213518142700195, + "learning_rate": 1.7940844245492198e-05, + "loss": 1.2233, "step": 2462 }, { - "epoch": 0.74, - "grad_norm": 14.475379943847656, - "learning_rate": 1.5064648691991582e-05, - "loss": 1.7658, + "epoch": 0.31, + "grad_norm": 17.76910400390625, + "learning_rate": 1.794000753043551e-05, + "loss": 2.8389, "step": 2463 }, { - "epoch": 0.74, - "grad_norm": 24.027629852294922, - "learning_rate": 1.5062644081387192e-05, - "loss": 2.6237, + "epoch": 0.31, + "grad_norm": 10.86572265625, + "learning_rate": 1.7939170815378825e-05, + "loss": 1.7707, "step": 2464 }, { - "epoch": 0.74, - "grad_norm": 15.826460838317871, - "learning_rate": 1.50606394707828e-05, - "loss": 2.6549, + "epoch": 0.31, + "grad_norm": 16.505216598510742, + "learning_rate": 1.7938334100322135e-05, + "loss": 1.9726, "step": 2465 }, { - "epoch": 0.74, - "grad_norm": 43.245018005371094, - "learning_rate": 1.5058634860178412e-05, - "loss": 2.9669, + "epoch": 0.31, + "grad_norm": 18.240514755249023, + "learning_rate": 1.793749738526545e-05, + "loss": 4.2534, "step": 2466 }, { - "epoch": 0.74, - "grad_norm": 13.400920867919922, - "learning_rate": 1.5056630249574022e-05, - "loss": 1.9453, + "epoch": 0.31, + "grad_norm": 11.698887825012207, + "learning_rate": 1.793666067020876e-05, + "loss": 2.9967, "step": 2467 }, { - "epoch": 0.74, - "grad_norm": 15.512686729431152, - "learning_rate": 1.505462563896963e-05, - "loss": 1.9563, + "epoch": 0.31, + "grad_norm": 16.417949676513672, + "learning_rate": 1.7935823955152073e-05, + "loss": 3.3745, "step": 2468 }, { - "epoch": 0.74, - "grad_norm": 18.510339736938477, - "learning_rate": 1.5052621028365242e-05, - "loss": 1.5118, + "epoch": 0.31, + "grad_norm": 9.704050064086914, + "learning_rate": 1.7934987240095387e-05, + "loss": 3.0882, "step": 2469 }, { - "epoch": 0.74, - "grad_norm": 13.163358688354492, - "learning_rate": 1.505061641776085e-05, - "loss": 1.8231, + "epoch": 0.31, + "grad_norm": 12.788137435913086, + "learning_rate": 1.79341505250387e-05, + "loss": 3.1269, "step": 2470 }, { - "epoch": 0.74, - "grad_norm": 8.8781156539917, - "learning_rate": 1.5048611807156461e-05, - "loss": 1.1656, + "epoch": 0.31, + "grad_norm": 12.552366256713867, + "learning_rate": 1.793331380998201e-05, + "loss": 2.8718, "step": 2471 }, { - "epoch": 0.74, - "grad_norm": 25.674057006835938, - "learning_rate": 1.504660719655207e-05, - "loss": 2.7279, + "epoch": 0.31, + "grad_norm": 9.499432563781738, + "learning_rate": 1.7932477094925324e-05, + "loss": 3.5561, "step": 2472 }, { - "epoch": 0.74, - "grad_norm": 16.811283111572266, - "learning_rate": 1.5044602585947681e-05, - "loss": 2.0451, + "epoch": 0.31, + "grad_norm": 7.19352388381958, + "learning_rate": 1.7931640379868638e-05, + "loss": 1.654, "step": 2473 }, { - "epoch": 0.74, - "grad_norm": 15.99625301361084, - "learning_rate": 1.5042597975343291e-05, - "loss": 1.7554, + "epoch": 0.31, + "grad_norm": 9.706038475036621, + "learning_rate": 1.7930803664811948e-05, + "loss": 0.6615, "step": 2474 }, { - "epoch": 0.74, - "grad_norm": 16.740371704101562, - "learning_rate": 1.50405933647389e-05, - "loss": 2.9046, + "epoch": 0.31, + "grad_norm": 22.5076847076416, + "learning_rate": 1.7929966949755262e-05, + "loss": 2.4768, "step": 2475 }, { - "epoch": 0.74, - "grad_norm": 11.545669555664062, - "learning_rate": 1.5038588754134511e-05, - "loss": 1.1914, + "epoch": 0.31, + "grad_norm": 8.739575386047363, + "learning_rate": 1.7929130234698575e-05, + "loss": 0.5049, "step": 2476 }, { - "epoch": 0.74, - "grad_norm": 12.485255241394043, - "learning_rate": 1.503658414353012e-05, - "loss": 2.0212, + "epoch": 0.31, + "grad_norm": 12.416593551635742, + "learning_rate": 1.792829351964189e-05, + "loss": 2.2825, "step": 2477 }, { - "epoch": 0.75, - "grad_norm": 28.34869956970215, - "learning_rate": 1.503457953292573e-05, - "loss": 1.4173, + "epoch": 0.31, + "grad_norm": 12.056394577026367, + "learning_rate": 1.79274568045852e-05, + "loss": 1.8361, "step": 2478 }, { - "epoch": 0.75, - "grad_norm": 14.046238899230957, - "learning_rate": 1.5032574922321342e-05, - "loss": 2.3209, + "epoch": 0.31, + "grad_norm": 13.777711868286133, + "learning_rate": 1.7926620089528513e-05, + "loss": 2.752, "step": 2479 }, { - "epoch": 0.75, - "grad_norm": 40.98884201049805, - "learning_rate": 1.503057031171695e-05, - "loss": 2.7137, + "epoch": 0.31, + "grad_norm": 8.878487586975098, + "learning_rate": 1.7925783374471827e-05, + "loss": 1.4853, "step": 2480 }, { - "epoch": 0.75, - "grad_norm": 16.54846954345703, - "learning_rate": 1.502856570111256e-05, - "loss": 1.7074, + "epoch": 0.31, + "grad_norm": 16.995180130004883, + "learning_rate": 1.7924946659415137e-05, + "loss": 1.9121, "step": 2481 }, { - "epoch": 0.75, - "grad_norm": 34.91267776489258, - "learning_rate": 1.502656109050817e-05, - "loss": 3.7392, + "epoch": 0.31, + "grad_norm": 22.008657455444336, + "learning_rate": 1.792410994435845e-05, + "loss": 2.4208, "step": 2482 }, { - "epoch": 0.75, - "grad_norm": 11.860535621643066, - "learning_rate": 1.502455647990378e-05, - "loss": 2.6046, + "epoch": 0.31, + "grad_norm": 14.444175720214844, + "learning_rate": 1.7923273229301764e-05, + "loss": 2.655, "step": 2483 }, { - "epoch": 0.75, - "grad_norm": 14.997228622436523, - "learning_rate": 1.5022551869299389e-05, - "loss": 2.761, + "epoch": 0.31, + "grad_norm": 10.106785774230957, + "learning_rate": 1.7922436514245078e-05, + "loss": 1.8945, "step": 2484 }, { - "epoch": 0.75, - "grad_norm": 15.930320739746094, - "learning_rate": 1.5020547258695e-05, - "loss": 2.0798, + "epoch": 0.31, + "grad_norm": 7.286982536315918, + "learning_rate": 1.7921599799188388e-05, + "loss": 1.2408, "step": 2485 }, { - "epoch": 0.75, - "grad_norm": 28.564027786254883, - "learning_rate": 1.501854264809061e-05, - "loss": 2.5476, + "epoch": 0.31, + "grad_norm": 19.98948097229004, + "learning_rate": 1.7920763084131702e-05, + "loss": 1.4448, "step": 2486 }, { - "epoch": 0.75, - "grad_norm": 18.10663604736328, - "learning_rate": 1.5016538037486219e-05, - "loss": 2.6392, + "epoch": 0.31, + "grad_norm": 12.024795532226562, + "learning_rate": 1.7919926369075012e-05, + "loss": 2.46, "step": 2487 }, { - "epoch": 0.75, - "grad_norm": 22.97527313232422, - "learning_rate": 1.501453342688183e-05, - "loss": 2.0554, + "epoch": 0.31, + "grad_norm": 20.353967666625977, + "learning_rate": 1.7919089654018326e-05, + "loss": 2.8392, "step": 2488 }, { - "epoch": 0.75, - "grad_norm": 18.91280174255371, - "learning_rate": 1.5012528816277439e-05, - "loss": 2.5951, + "epoch": 0.31, + "grad_norm": 11.455394744873047, + "learning_rate": 1.791825293896164e-05, + "loss": 1.6463, "step": 2489 }, { - "epoch": 0.75, - "grad_norm": 22.503339767456055, - "learning_rate": 1.5010524205673049e-05, - "loss": 1.7317, + "epoch": 0.31, + "grad_norm": 24.082109451293945, + "learning_rate": 1.791741622390495e-05, + "loss": 1.7333, "step": 2490 }, { - "epoch": 0.75, - "grad_norm": 20.57047462463379, - "learning_rate": 1.5008519595068661e-05, - "loss": 2.3906, + "epoch": 0.31, + "grad_norm": 15.034388542175293, + "learning_rate": 1.7916579508848263e-05, + "loss": 3.2048, "step": 2491 }, { - "epoch": 0.75, - "grad_norm": 17.44088363647461, - "learning_rate": 1.500651498446427e-05, - "loss": 2.3833, + "epoch": 0.31, + "grad_norm": 8.346967697143555, + "learning_rate": 1.7915742793791573e-05, + "loss": 0.8005, "step": 2492 }, { - "epoch": 0.75, - "grad_norm": 11.56372356414795, - "learning_rate": 1.500451037385988e-05, - "loss": 2.2393, + "epoch": 0.31, + "grad_norm": 19.777572631835938, + "learning_rate": 1.7914906078734887e-05, + "loss": 3.237, "step": 2493 }, { - "epoch": 0.75, - "grad_norm": 36.66616439819336, - "learning_rate": 1.5002505763255488e-05, - "loss": 2.0996, + "epoch": 0.31, + "grad_norm": 10.796478271484375, + "learning_rate": 1.79140693636782e-05, + "loss": 2.6851, "step": 2494 }, { - "epoch": 0.75, - "grad_norm": 59.76615905761719, - "learning_rate": 1.50005011526511e-05, - "loss": 1.3206, + "epoch": 0.31, + "grad_norm": 28.78697967529297, + "learning_rate": 1.791323264862151e-05, + "loss": 2.892, "step": 2495 }, { - "epoch": 0.75, - "grad_norm": 10.487852096557617, - "learning_rate": 1.4998496542046708e-05, - "loss": 1.2596, + "epoch": 0.31, + "grad_norm": 13.504036903381348, + "learning_rate": 1.7912395933564825e-05, + "loss": 2.1438, "step": 2496 }, { - "epoch": 0.75, - "grad_norm": 17.123096466064453, - "learning_rate": 1.4996491931442318e-05, - "loss": 2.1562, + "epoch": 0.31, + "grad_norm": 16.800512313842773, + "learning_rate": 1.7911559218508138e-05, + "loss": 2.822, "step": 2497 }, { - "epoch": 0.75, - "grad_norm": 13.120617866516113, - "learning_rate": 1.499448732083793e-05, - "loss": 1.2493, + "epoch": 0.31, + "grad_norm": 15.418956756591797, + "learning_rate": 1.7910722503451452e-05, + "loss": 2.4324, "step": 2498 }, { - "epoch": 0.75, - "grad_norm": 11.029601097106934, - "learning_rate": 1.4992482710233538e-05, - "loss": 0.918, + "epoch": 0.31, + "grad_norm": 15.057035446166992, + "learning_rate": 1.7909885788394762e-05, + "loss": 2.5772, "step": 2499 }, { - "epoch": 0.75, - "grad_norm": 38.185813903808594, - "learning_rate": 1.4990478099629148e-05, - "loss": 3.1123, + "epoch": 0.31, + "grad_norm": 19.443241119384766, + "learning_rate": 1.7909049073338076e-05, + "loss": 4.5712, "step": 2500 }, { - "epoch": 0.75, - "grad_norm": 21.2143611907959, - "learning_rate": 1.4988473489024757e-05, - "loss": 1.9136, + "epoch": 0.31, + "grad_norm": 18.084739685058594, + "learning_rate": 1.790821235828139e-05, + "loss": 4.0203, "step": 2501 }, { - "epoch": 0.75, - "grad_norm": 18.287214279174805, - "learning_rate": 1.4986468878420368e-05, - "loss": 2.3058, + "epoch": 0.31, + "grad_norm": 16.893775939941406, + "learning_rate": 1.79073756432247e-05, + "loss": 1.9783, "step": 2502 }, { - "epoch": 0.75, - "grad_norm": 20.503644943237305, - "learning_rate": 1.4984464267815979e-05, - "loss": 3.3248, + "epoch": 0.31, + "grad_norm": 17.322256088256836, + "learning_rate": 1.7906538928168013e-05, + "loss": 2.7375, "step": 2503 }, { - "epoch": 0.75, - "grad_norm": 18.117403030395508, - "learning_rate": 1.4982459657211587e-05, - "loss": 2.5097, + "epoch": 0.31, + "grad_norm": 14.303987503051758, + "learning_rate": 1.7905702213111327e-05, + "loss": 1.305, "step": 2504 }, { - "epoch": 0.75, - "grad_norm": 16.31001853942871, - "learning_rate": 1.4980455046607199e-05, - "loss": 1.7266, + "epoch": 0.31, + "grad_norm": 9.188926696777344, + "learning_rate": 1.790486549805464e-05, + "loss": 1.6407, "step": 2505 }, { - "epoch": 0.75, - "grad_norm": 12.371569633483887, - "learning_rate": 1.4978450436002807e-05, - "loss": 1.9252, + "epoch": 0.31, + "grad_norm": 12.038647651672363, + "learning_rate": 1.790402878299795e-05, + "loss": 2.5819, "step": 2506 }, { - "epoch": 0.75, - "grad_norm": 11.64188289642334, - "learning_rate": 1.4976445825398417e-05, - "loss": 2.0164, + "epoch": 0.31, + "grad_norm": 12.675663948059082, + "learning_rate": 1.7903192067941265e-05, + "loss": 2.2682, "step": 2507 }, { - "epoch": 0.75, - "grad_norm": 17.393896102905273, - "learning_rate": 1.4974441214794027e-05, - "loss": 2.4099, + "epoch": 0.31, + "grad_norm": 7.899895191192627, + "learning_rate": 1.7902355352884578e-05, + "loss": 0.2706, "step": 2508 }, { - "epoch": 0.75, - "grad_norm": 16.229440689086914, - "learning_rate": 1.4972436604189637e-05, - "loss": 2.8192, + "epoch": 0.31, + "grad_norm": 14.513761520385742, + "learning_rate": 1.790151863782789e-05, + "loss": 3.308, "step": 2509 }, { - "epoch": 0.75, - "grad_norm": 19.683637619018555, - "learning_rate": 1.4970431993585247e-05, - "loss": 2.0972, + "epoch": 0.32, + "grad_norm": 7.7307000160217285, + "learning_rate": 1.7900681922771202e-05, + "loss": 1.3174, "step": 2510 }, { - "epoch": 0.75, - "grad_norm": 23.35310173034668, - "learning_rate": 1.4968427382980858e-05, - "loss": 2.0364, + "epoch": 0.32, + "grad_norm": 9.518709182739258, + "learning_rate": 1.7899845207714516e-05, + "loss": 0.598, "step": 2511 }, { - "epoch": 0.76, - "grad_norm": 10.506977081298828, - "learning_rate": 1.4966422772376468e-05, - "loss": 1.575, + "epoch": 0.32, + "grad_norm": 23.85696029663086, + "learning_rate": 1.789900849265783e-05, + "loss": 3.9654, "step": 2512 }, { - "epoch": 0.76, - "grad_norm": 24.63909339904785, - "learning_rate": 1.4964418161772076e-05, - "loss": 1.9545, + "epoch": 0.32, + "grad_norm": 6.402545928955078, + "learning_rate": 1.789817177760114e-05, + "loss": 0.2189, "step": 2513 }, { - "epoch": 0.76, - "grad_norm": 18.600322723388672, - "learning_rate": 1.4962413551167688e-05, - "loss": 2.0238, + "epoch": 0.32, + "grad_norm": 8.172592163085938, + "learning_rate": 1.7897335062544453e-05, + "loss": 1.0552, "step": 2514 }, { - "epoch": 0.76, - "grad_norm": 25.803688049316406, - "learning_rate": 1.4960408940563296e-05, - "loss": 1.7973, + "epoch": 0.32, + "grad_norm": 16.517786026000977, + "learning_rate": 1.7896498347487764e-05, + "loss": 3.0275, "step": 2515 }, { - "epoch": 0.76, - "grad_norm": 13.325533866882324, - "learning_rate": 1.4958404329958906e-05, - "loss": 1.5489, + "epoch": 0.32, + "grad_norm": 17.168933868408203, + "learning_rate": 1.7895661632431077e-05, + "loss": 1.0541, "step": 2516 }, { - "epoch": 0.76, - "grad_norm": 17.538631439208984, - "learning_rate": 1.4956399719354518e-05, - "loss": 1.6318, + "epoch": 0.32, + "grad_norm": 5.158097743988037, + "learning_rate": 1.789482491737439e-05, + "loss": 1.6071, "step": 2517 }, { - "epoch": 0.76, - "grad_norm": 20.671701431274414, - "learning_rate": 1.4954395108750126e-05, - "loss": 2.2905, + "epoch": 0.32, + "grad_norm": 11.919731140136719, + "learning_rate": 1.78939882023177e-05, + "loss": 1.4199, "step": 2518 }, { - "epoch": 0.76, - "grad_norm": 18.96885871887207, - "learning_rate": 1.4952390498145736e-05, - "loss": 1.2961, + "epoch": 0.32, + "grad_norm": 11.247430801391602, + "learning_rate": 1.7893151487261015e-05, + "loss": 2.6482, "step": 2519 }, { - "epoch": 0.76, - "grad_norm": 17.124177932739258, - "learning_rate": 1.4950385887541345e-05, - "loss": 2.1604, - "step": 2520 - }, - { - "epoch": 0.76, - "eval_loss": 0.269967257976532, - "eval_runtime": 43.2806, - "eval_samples_per_second": 34.172, - "eval_steps_per_second": 34.172, + "epoch": 0.32, + "grad_norm": 36.31636428833008, + "learning_rate": 1.7892314772204325e-05, + "loss": 1.5908, "step": 2520 }, { - "epoch": 0.76, - "grad_norm": 22.06162452697754, - "learning_rate": 1.4948381276936957e-05, - "loss": 2.1107, + "epoch": 0.32, + "grad_norm": 12.56596565246582, + "learning_rate": 1.789147805714764e-05, + "loss": 2.6619, "step": 2521 }, { - "epoch": 0.76, - "grad_norm": 15.108476638793945, - "learning_rate": 1.4946376666332567e-05, - "loss": 1.7432, + "epoch": 0.32, + "grad_norm": 6.203104496002197, + "learning_rate": 1.7890641342090952e-05, + "loss": 0.7091, "step": 2522 }, { - "epoch": 0.76, - "grad_norm": 13.295374870300293, - "learning_rate": 1.4944372055728175e-05, - "loss": 2.03, + "epoch": 0.32, + "grad_norm": 17.171630859375, + "learning_rate": 1.7889804627034263e-05, + "loss": 1.9813, "step": 2523 }, { - "epoch": 0.76, - "grad_norm": 19.436033248901367, - "learning_rate": 1.4942367445123787e-05, - "loss": 1.6658, + "epoch": 0.32, + "grad_norm": 6.632760047912598, + "learning_rate": 1.7888967911977576e-05, + "loss": 1.1465, "step": 2524 }, { - "epoch": 0.76, - "grad_norm": 36.37664031982422, - "learning_rate": 1.4940362834519395e-05, - "loss": 3.2737, + "epoch": 0.32, + "grad_norm": 24.890317916870117, + "learning_rate": 1.788813119692089e-05, + "loss": 2.9452, "step": 2525 }, { - "epoch": 0.76, - "grad_norm": 11.738432884216309, - "learning_rate": 1.4938358223915005e-05, - "loss": 1.6208, + "epoch": 0.32, + "grad_norm": 10.765260696411133, + "learning_rate": 1.7887294481864204e-05, + "loss": 2.3937, "step": 2526 }, { - "epoch": 0.76, - "grad_norm": 17.715194702148438, - "learning_rate": 1.4936353613310614e-05, - "loss": 2.3697, + "epoch": 0.32, + "grad_norm": 12.482851028442383, + "learning_rate": 1.7886457766807514e-05, + "loss": 2.7714, "step": 2527 }, { - "epoch": 0.76, - "grad_norm": 21.08336639404297, - "learning_rate": 1.4934349002706226e-05, - "loss": 2.4212, + "epoch": 0.32, + "grad_norm": 11.03923511505127, + "learning_rate": 1.7885621051750828e-05, + "loss": 3.0968, "step": 2528 }, { - "epoch": 0.76, - "grad_norm": 19.754703521728516, - "learning_rate": 1.4932344392101836e-05, - "loss": 1.71, + "epoch": 0.32, + "grad_norm": 13.121932029724121, + "learning_rate": 1.788478433669414e-05, + "loss": 1.7945, "step": 2529 }, { - "epoch": 0.76, - "grad_norm": 21.412639617919922, - "learning_rate": 1.4930339781497446e-05, - "loss": 2.807, + "epoch": 0.32, + "grad_norm": 16.075044631958008, + "learning_rate": 1.788394762163745e-05, + "loss": 2.1452, "step": 2530 }, { - "epoch": 0.76, - "grad_norm": 18.40741729736328, - "learning_rate": 1.4928335170893056e-05, - "loss": 2.2224, + "epoch": 0.32, + "grad_norm": 21.7185115814209, + "learning_rate": 1.7883110906580765e-05, + "loss": 3.6425, "step": 2531 }, { - "epoch": 0.76, - "grad_norm": 33.87398147583008, - "learning_rate": 1.4926330560288664e-05, - "loss": 2.8005, + "epoch": 0.32, + "grad_norm": 22.87784767150879, + "learning_rate": 1.788227419152408e-05, + "loss": 1.8277, "step": 2532 }, { - "epoch": 0.76, - "grad_norm": 14.195319175720215, - "learning_rate": 1.4924325949684276e-05, - "loss": 2.4552, + "epoch": 0.32, + "grad_norm": 21.376379013061523, + "learning_rate": 1.7881437476467392e-05, + "loss": 2.8571, "step": 2533 }, { - "epoch": 0.76, - "grad_norm": 14.160918235778809, - "learning_rate": 1.4922321339079886e-05, - "loss": 2.4736, + "epoch": 0.32, + "grad_norm": 10.283143997192383, + "learning_rate": 1.7880600761410703e-05, + "loss": 1.6794, "step": 2534 }, { - "epoch": 0.76, - "grad_norm": 31.502965927124023, - "learning_rate": 1.4920316728475494e-05, - "loss": 2.5237, + "epoch": 0.32, + "grad_norm": 20.66465187072754, + "learning_rate": 1.7879764046354016e-05, + "loss": 2.2436, "step": 2535 }, { - "epoch": 0.76, - "grad_norm": 17.290319442749023, - "learning_rate": 1.4918312117871106e-05, - "loss": 2.2399, + "epoch": 0.32, + "grad_norm": 8.404132843017578, + "learning_rate": 1.787892733129733e-05, + "loss": 1.4817, "step": 2536 }, { - "epoch": 0.76, - "grad_norm": 13.980632781982422, - "learning_rate": 1.4916307507266715e-05, - "loss": 2.3494, + "epoch": 0.32, + "grad_norm": 10.486577033996582, + "learning_rate": 1.787809061624064e-05, + "loss": 2.5069, "step": 2537 }, { - "epoch": 0.76, - "grad_norm": 14.719244956970215, - "learning_rate": 1.4914302896662325e-05, - "loss": 2.5073, + "epoch": 0.32, + "grad_norm": 8.028034210205078, + "learning_rate": 1.7877253901183954e-05, + "loss": 1.0015, "step": 2538 }, { - "epoch": 0.76, - "grad_norm": 19.83281707763672, - "learning_rate": 1.4912298286057933e-05, - "loss": 2.4892, + "epoch": 0.32, + "grad_norm": 23.063966751098633, + "learning_rate": 1.7876417186127267e-05, + "loss": 3.4067, "step": 2539 }, { - "epoch": 0.76, - "grad_norm": 13.933186531066895, - "learning_rate": 1.4910293675453545e-05, - "loss": 2.6791, + "epoch": 0.32, + "grad_norm": 23.007061004638672, + "learning_rate": 1.7875580471070578e-05, + "loss": 2.9092, "step": 2540 }, { - "epoch": 0.76, - "grad_norm": 20.118640899658203, - "learning_rate": 1.4908289064849155e-05, - "loss": 1.996, + "epoch": 0.32, + "grad_norm": 7.917148590087891, + "learning_rate": 1.787474375601389e-05, + "loss": 1.5232, "step": 2541 }, { - "epoch": 0.76, - "grad_norm": 20.20134162902832, - "learning_rate": 1.4906284454244763e-05, - "loss": 2.2007, + "epoch": 0.32, + "grad_norm": 9.655314445495605, + "learning_rate": 1.7873907040957205e-05, + "loss": 1.5226, "step": 2542 }, { - "epoch": 0.76, - "grad_norm": 14.23250961303711, - "learning_rate": 1.4904279843640375e-05, - "loss": 2.2191, + "epoch": 0.32, + "grad_norm": 11.38919734954834, + "learning_rate": 1.7873070325900515e-05, + "loss": 1.2438, "step": 2543 }, { - "epoch": 0.76, - "grad_norm": 14.728852272033691, - "learning_rate": 1.4902275233035984e-05, - "loss": 1.9154, + "epoch": 0.32, + "grad_norm": 9.895431518554688, + "learning_rate": 1.787223361084383e-05, + "loss": 2.0275, "step": 2544 }, { - "epoch": 0.77, - "grad_norm": 38.36711120605469, - "learning_rate": 1.4900270622431594e-05, - "loss": 2.367, + "epoch": 0.32, + "grad_norm": 11.29218864440918, + "learning_rate": 1.787139689578714e-05, + "loss": 2.4134, "step": 2545 }, { - "epoch": 0.77, - "grad_norm": 17.614601135253906, - "learning_rate": 1.4898266011827205e-05, - "loss": 1.1838, + "epoch": 0.32, + "grad_norm": 39.96311569213867, + "learning_rate": 1.7870560180730453e-05, + "loss": 2.4428, "step": 2546 }, { - "epoch": 0.77, - "grad_norm": 23.657424926757812, - "learning_rate": 1.4896261401222814e-05, - "loss": 2.6376, + "epoch": 0.32, + "grad_norm": 27.714492797851562, + "learning_rate": 1.7869723465673767e-05, + "loss": 3.6484, "step": 2547 }, { - "epoch": 0.77, - "grad_norm": 10.91047191619873, - "learning_rate": 1.4894256790618424e-05, - "loss": 2.0975, + "epoch": 0.32, + "grad_norm": 7.043478012084961, + "learning_rate": 1.7868886750617077e-05, + "loss": 1.2048, "step": 2548 }, { - "epoch": 0.77, - "grad_norm": 19.83192253112793, - "learning_rate": 1.4892252180014032e-05, - "loss": 2.7231, + "epoch": 0.32, + "grad_norm": 13.39302921295166, + "learning_rate": 1.786805003556039e-05, + "loss": 2.9228, "step": 2549 }, { - "epoch": 0.77, - "grad_norm": 6.836325645446777, - "learning_rate": 1.4890247569409644e-05, - "loss": 1.3988, + "epoch": 0.32, + "grad_norm": 20.650859832763672, + "learning_rate": 1.7867213320503704e-05, + "loss": 3.4154, "step": 2550 }, { - "epoch": 0.77, - "grad_norm": 16.332334518432617, - "learning_rate": 1.4888242958805252e-05, - "loss": 1.9877, + "epoch": 0.32, + "grad_norm": 17.851686477661133, + "learning_rate": 1.7866376605447014e-05, + "loss": 2.9386, "step": 2551 }, { - "epoch": 0.77, - "grad_norm": 23.387727737426758, - "learning_rate": 1.4886238348200862e-05, - "loss": 2.8849, + "epoch": 0.32, + "grad_norm": 10.273046493530273, + "learning_rate": 1.7865539890390328e-05, + "loss": 1.0958, "step": 2552 }, { - "epoch": 0.77, - "grad_norm": 33.43765640258789, - "learning_rate": 1.4884233737596474e-05, - "loss": 2.3284, + "epoch": 0.32, + "grad_norm": 25.07984161376953, + "learning_rate": 1.786470317533364e-05, + "loss": 4.0964, "step": 2553 }, { - "epoch": 0.77, - "grad_norm": 10.308141708374023, - "learning_rate": 1.4882229126992083e-05, - "loss": 2.6766, + "epoch": 0.32, + "grad_norm": 21.487104415893555, + "learning_rate": 1.7863866460276955e-05, + "loss": 2.4228, "step": 2554 }, { - "epoch": 0.77, - "grad_norm": 20.615131378173828, - "learning_rate": 1.4880224516387693e-05, - "loss": 2.4127, + "epoch": 0.32, + "grad_norm": 11.11598014831543, + "learning_rate": 1.7863029745220266e-05, + "loss": 1.9753, "step": 2555 }, { - "epoch": 0.77, - "grad_norm": 18.643369674682617, - "learning_rate": 1.4878219905783303e-05, - "loss": 1.9807, + "epoch": 0.32, + "grad_norm": 13.04572868347168, + "learning_rate": 1.786219303016358e-05, + "loss": 2.3264, "step": 2556 }, { - "epoch": 0.77, - "grad_norm": 29.241533279418945, - "learning_rate": 1.4876215295178913e-05, - "loss": 2.6441, + "epoch": 0.32, + "grad_norm": 8.850125312805176, + "learning_rate": 1.7861356315106893e-05, + "loss": 1.6477, "step": 2557 }, { - "epoch": 0.77, - "grad_norm": 19.864036560058594, - "learning_rate": 1.4874210684574521e-05, - "loss": 2.237, + "epoch": 0.32, + "grad_norm": 14.255194664001465, + "learning_rate": 1.7860519600050203e-05, + "loss": 3.6128, "step": 2558 }, { - "epoch": 0.77, - "grad_norm": 9.182751655578613, - "learning_rate": 1.4872206073970133e-05, - "loss": 1.6721, + "epoch": 0.32, + "grad_norm": 12.04518985748291, + "learning_rate": 1.7859682884993517e-05, + "loss": 2.2058, "step": 2559 }, { - "epoch": 0.77, - "grad_norm": 17.696592330932617, - "learning_rate": 1.4870201463365743e-05, - "loss": 1.8645, + "epoch": 0.32, + "grad_norm": 9.668314933776855, + "learning_rate": 1.785884616993683e-05, + "loss": 1.3232, "step": 2560 }, { - "epoch": 0.77, - "grad_norm": 10.09614086151123, - "learning_rate": 1.4868196852761352e-05, - "loss": 1.4887, + "epoch": 0.32, + "grad_norm": 13.472427368164062, + "learning_rate": 1.7858009454880144e-05, + "loss": 2.4424, "step": 2561 }, { - "epoch": 0.77, - "grad_norm": 10.682119369506836, - "learning_rate": 1.4866192242156963e-05, - "loss": 1.296, + "epoch": 0.32, + "grad_norm": 15.245526313781738, + "learning_rate": 1.7857172739823454e-05, + "loss": 1.7416, "step": 2562 }, { - "epoch": 0.77, - "grad_norm": 18.929485321044922, - "learning_rate": 1.4864187631552572e-05, - "loss": 2.7717, + "epoch": 0.32, + "grad_norm": 13.720373153686523, + "learning_rate": 1.7856336024766768e-05, + "loss": 1.6349, "step": 2563 }, { - "epoch": 0.77, - "grad_norm": 21.349645614624023, - "learning_rate": 1.4862183020948182e-05, - "loss": 2.4609, + "epoch": 0.32, + "grad_norm": 15.144506454467773, + "learning_rate": 1.785549930971008e-05, + "loss": 2.4472, "step": 2564 }, { - "epoch": 0.77, - "grad_norm": 7.982182502746582, - "learning_rate": 1.4860178410343794e-05, - "loss": 1.9265, + "epoch": 0.32, + "grad_norm": 14.76689624786377, + "learning_rate": 1.7854662594653392e-05, + "loss": 0.6717, "step": 2565 }, { - "epoch": 0.77, - "grad_norm": 15.188501358032227, - "learning_rate": 1.4858173799739402e-05, - "loss": 3.0483, + "epoch": 0.32, + "grad_norm": 11.874075889587402, + "learning_rate": 1.7853825879596706e-05, + "loss": 1.989, "step": 2566 }, { - "epoch": 0.77, - "grad_norm": 37.26816177368164, - "learning_rate": 1.4856169189135012e-05, - "loss": 2.585, + "epoch": 0.32, + "grad_norm": 11.744961738586426, + "learning_rate": 1.785298916454002e-05, + "loss": 1.938, "step": 2567 }, { - "epoch": 0.77, - "grad_norm": 13.02489948272705, - "learning_rate": 1.485416457853062e-05, - "loss": 1.8137, + "epoch": 0.32, + "grad_norm": 7.1501145362854, + "learning_rate": 1.785215244948333e-05, + "loss": 0.8565, "step": 2568 }, { - "epoch": 0.77, - "grad_norm": 15.654376029968262, - "learning_rate": 1.4852159967926232e-05, - "loss": 2.2203, + "epoch": 0.32, + "grad_norm": 11.965447425842285, + "learning_rate": 1.7851315734426643e-05, + "loss": 0.9023, "step": 2569 }, { - "epoch": 0.77, - "grad_norm": 9.267987251281738, - "learning_rate": 1.485015535732184e-05, - "loss": 1.362, + "epoch": 0.32, + "grad_norm": 11.98586368560791, + "learning_rate": 1.7850479019369957e-05, + "loss": 0.6164, "step": 2570 }, { - "epoch": 0.77, - "grad_norm": 31.388477325439453, - "learning_rate": 1.484815074671745e-05, - "loss": 2.3867, + "epoch": 0.32, + "grad_norm": 28.456218719482422, + "learning_rate": 1.7849642304313267e-05, + "loss": 4.24, "step": 2571 }, { - "epoch": 0.77, - "grad_norm": 19.304162979125977, - "learning_rate": 1.4846146136113062e-05, - "loss": 2.6042, + "epoch": 0.32, + "grad_norm": 17.911380767822266, + "learning_rate": 1.784880558925658e-05, + "loss": 2.4132, "step": 2572 }, { - "epoch": 0.77, - "grad_norm": 21.53330421447754, - "learning_rate": 1.4844141525508671e-05, - "loss": 2.2464, + "epoch": 0.32, + "grad_norm": 5.884377479553223, + "learning_rate": 1.784796887419989e-05, + "loss": 2.5055, "step": 2573 }, { - "epoch": 0.77, - "grad_norm": 14.074399948120117, - "learning_rate": 1.4842136914904281e-05, - "loss": 1.6423, + "epoch": 0.32, + "grad_norm": 16.891359329223633, + "learning_rate": 1.7847132159143205e-05, + "loss": 2.5807, "step": 2574 }, { - "epoch": 0.77, - "grad_norm": 30.000886917114258, - "learning_rate": 1.484013230429989e-05, - "loss": 1.9405, + "epoch": 0.32, + "grad_norm": 16.468059539794922, + "learning_rate": 1.7846295444086518e-05, + "loss": 1.6703, "step": 2575 }, { - "epoch": 0.77, - "grad_norm": 12.38129711151123, - "learning_rate": 1.4838127693695501e-05, - "loss": 1.6343, + "epoch": 0.32, + "grad_norm": 10.617067337036133, + "learning_rate": 1.784545872902983e-05, + "loss": 1.2433, "step": 2576 }, { - "epoch": 0.77, - "grad_norm": 15.83830451965332, - "learning_rate": 1.4836123083091111e-05, - "loss": 1.4498, + "epoch": 0.32, + "grad_norm": 12.173160552978516, + "learning_rate": 1.7844622013973142e-05, + "loss": 3.2946, "step": 2577 }, { - "epoch": 0.78, - "grad_norm": 20.503822326660156, - "learning_rate": 1.483411847248672e-05, - "loss": 2.8636, + "epoch": 0.32, + "grad_norm": 5.2667059898376465, + "learning_rate": 1.7843785298916456e-05, + "loss": 1.5948, "step": 2578 }, { - "epoch": 0.78, - "grad_norm": 15.516641616821289, - "learning_rate": 1.4832113861882331e-05, - "loss": 2.0966, + "epoch": 0.32, + "grad_norm": 5.8900837898254395, + "learning_rate": 1.7842948583859766e-05, + "loss": 0.7239, "step": 2579 }, { - "epoch": 0.78, - "grad_norm": 18.49781036376953, - "learning_rate": 1.483010925127794e-05, - "loss": 3.8588, + "epoch": 0.32, + "grad_norm": 14.397953033447266, + "learning_rate": 1.784211186880308e-05, + "loss": 0.5331, "step": 2580 }, { - "epoch": 0.78, - "grad_norm": 11.766064643859863, - "learning_rate": 1.4828104640673552e-05, - "loss": 3.0766, + "epoch": 0.32, + "grad_norm": 18.99256706237793, + "learning_rate": 1.7841275153746393e-05, + "loss": 2.269, "step": 2581 }, { - "epoch": 0.78, - "grad_norm": 28.008140563964844, - "learning_rate": 1.482610003006916e-05, - "loss": 2.8604, + "epoch": 0.32, + "grad_norm": 25.798974990844727, + "learning_rate": 1.7840438438689707e-05, + "loss": 2.0258, "step": 2582 }, { - "epoch": 0.78, - "grad_norm": 11.355888366699219, - "learning_rate": 1.482409541946477e-05, - "loss": 1.8569, + "epoch": 0.32, + "grad_norm": 12.33133316040039, + "learning_rate": 1.7839601723633017e-05, + "loss": 1.9399, "step": 2583 }, { - "epoch": 0.78, - "grad_norm": 24.921497344970703, - "learning_rate": 1.4822090808860382e-05, - "loss": 2.3311, + "epoch": 0.32, + "grad_norm": 10.246801376342773, + "learning_rate": 1.783876500857633e-05, + "loss": 2.757, "step": 2584 }, { - "epoch": 0.78, - "grad_norm": 49.57196807861328, - "learning_rate": 1.482008619825599e-05, - "loss": 3.8008, + "epoch": 0.32, + "grad_norm": 23.0379581451416, + "learning_rate": 1.7837928293519645e-05, + "loss": 2.7737, "step": 2585 }, { - "epoch": 0.78, - "grad_norm": 18.97112274169922, - "learning_rate": 1.48180815876516e-05, - "loss": 2.2731, + "epoch": 0.32, + "grad_norm": 6.544103145599365, + "learning_rate": 1.7837091578462955e-05, + "loss": 0.6896, "step": 2586 }, { - "epoch": 0.78, - "grad_norm": 19.699682235717773, - "learning_rate": 1.4816076977047209e-05, - "loss": 3.2435, + "epoch": 0.32, + "grad_norm": 12.11794662475586, + "learning_rate": 1.783625486340627e-05, + "loss": 1.5552, "step": 2587 }, { - "epoch": 0.78, - "grad_norm": 15.111373901367188, - "learning_rate": 1.481407236644282e-05, - "loss": 2.6261, + "epoch": 0.32, + "grad_norm": 17.367733001708984, + "learning_rate": 1.7835418148349582e-05, + "loss": 2.991, "step": 2588 }, { - "epoch": 0.78, - "grad_norm": 39.739288330078125, - "learning_rate": 1.481206775583843e-05, - "loss": 3.0957, + "epoch": 0.32, + "grad_norm": 14.063064575195312, + "learning_rate": 1.7834581433292896e-05, + "loss": 2.0534, "step": 2589 }, { - "epoch": 0.78, - "grad_norm": 7.469949245452881, - "learning_rate": 1.4810063145234039e-05, - "loss": 1.5037, + "epoch": 0.33, + "grad_norm": 30.218265533447266, + "learning_rate": 1.7833744718236206e-05, + "loss": 2.8303, "step": 2590 }, { - "epoch": 0.78, - "grad_norm": 48.33216857910156, - "learning_rate": 1.480805853462965e-05, - "loss": 1.5074, + "epoch": 0.33, + "grad_norm": 41.22004318237305, + "learning_rate": 1.783290800317952e-05, + "loss": 2.892, "step": 2591 }, { - "epoch": 0.78, - "grad_norm": 14.457548141479492, - "learning_rate": 1.4806053924025259e-05, - "loss": 2.0293, + "epoch": 0.33, + "grad_norm": 9.430306434631348, + "learning_rate": 1.7832071288122833e-05, + "loss": 2.3149, "step": 2592 }, { - "epoch": 0.78, - "grad_norm": 9.994394302368164, - "learning_rate": 1.480404931342087e-05, - "loss": 1.6135, + "epoch": 0.33, + "grad_norm": 12.786920547485352, + "learning_rate": 1.7831234573066144e-05, + "loss": 1.0034, "step": 2593 }, { - "epoch": 0.78, - "grad_norm": 16.707799911499023, - "learning_rate": 1.4802044702816478e-05, - "loss": 2.0803, + "epoch": 0.33, + "grad_norm": 20.871122360229492, + "learning_rate": 1.7830397858009457e-05, + "loss": 2.1801, "step": 2594 }, { - "epoch": 0.78, - "grad_norm": 11.45969009399414, - "learning_rate": 1.480004009221209e-05, - "loss": 2.0687, + "epoch": 0.33, + "grad_norm": 9.6037015914917, + "learning_rate": 1.782956114295277e-05, + "loss": 0.4981, "step": 2595 }, { - "epoch": 0.78, - "grad_norm": 17.346237182617188, - "learning_rate": 1.47980354816077e-05, - "loss": 1.968, + "epoch": 0.33, + "grad_norm": 25.444738388061523, + "learning_rate": 1.782872442789608e-05, + "loss": 3.3543, "step": 2596 }, { - "epoch": 0.78, - "grad_norm": 16.296354293823242, - "learning_rate": 1.4796030871003308e-05, - "loss": 2.7071, + "epoch": 0.33, + "grad_norm": 7.93801736831665, + "learning_rate": 1.7827887712839395e-05, + "loss": 0.1098, "step": 2597 }, { - "epoch": 0.78, - "grad_norm": 24.699426651000977, - "learning_rate": 1.479402626039892e-05, - "loss": 2.575, + "epoch": 0.33, + "grad_norm": 15.348746299743652, + "learning_rate": 1.7827050997782705e-05, + "loss": 2.0703, "step": 2598 }, { - "epoch": 0.78, - "grad_norm": 15.403938293457031, - "learning_rate": 1.4792021649794528e-05, - "loss": 2.3392, + "epoch": 0.33, + "grad_norm": 21.247926712036133, + "learning_rate": 1.782621428272602e-05, + "loss": 2.3706, "step": 2599 }, { - "epoch": 0.78, - "grad_norm": 13.177908897399902, - "learning_rate": 1.4790017039190138e-05, - "loss": 1.4653, + "epoch": 0.33, + "grad_norm": 20.806386947631836, + "learning_rate": 1.7825377567669332e-05, + "loss": 1.9695, "step": 2600 }, { - "epoch": 0.78, - "grad_norm": 34.016021728515625, - "learning_rate": 1.4788012428585748e-05, - "loss": 2.3153, + "epoch": 0.33, + "grad_norm": 14.542155265808105, + "learning_rate": 1.7824540852612643e-05, + "loss": 2.949, "step": 2601 }, { - "epoch": 0.78, - "grad_norm": 15.840597152709961, - "learning_rate": 1.4786007817981358e-05, - "loss": 2.5034, + "epoch": 0.33, + "grad_norm": 9.618982315063477, + "learning_rate": 1.7823704137555956e-05, + "loss": 0.7286, "step": 2602 }, { - "epoch": 0.78, - "grad_norm": 31.41590118408203, - "learning_rate": 1.4784003207376968e-05, - "loss": 2.0677, + "epoch": 0.33, + "grad_norm": 21.41895294189453, + "learning_rate": 1.782286742249927e-05, + "loss": 2.5352, "step": 2603 }, { - "epoch": 0.78, - "grad_norm": 12.184178352355957, - "learning_rate": 1.4781998596772578e-05, - "loss": 2.0374, + "epoch": 0.33, + "grad_norm": 14.966450691223145, + "learning_rate": 1.782203070744258e-05, + "loss": 1.6908, "step": 2604 }, { - "epoch": 0.78, - "grad_norm": 16.264497756958008, - "learning_rate": 1.4779993986168188e-05, - "loss": 2.4106, + "epoch": 0.33, + "grad_norm": 21.944358825683594, + "learning_rate": 1.7821193992385894e-05, + "loss": 1.6259, "step": 2605 }, { - "epoch": 0.78, - "grad_norm": 16.872928619384766, - "learning_rate": 1.4777989375563797e-05, - "loss": 2.3471, + "epoch": 0.33, + "grad_norm": 33.289207458496094, + "learning_rate": 1.7820357277329207e-05, + "loss": 3.2477, "step": 2606 }, { - "epoch": 0.78, - "grad_norm": 47.74468994140625, - "learning_rate": 1.4775984764959409e-05, - "loss": 3.2204, + "epoch": 0.33, + "grad_norm": 26.20473289489746, + "learning_rate": 1.7819520562272518e-05, + "loss": 1.8603, "step": 2607 }, { - "epoch": 0.78, - "grad_norm": 20.217355728149414, - "learning_rate": 1.4773980154355019e-05, - "loss": 1.9179, + "epoch": 0.33, + "grad_norm": 27.471969604492188, + "learning_rate": 1.781868384721583e-05, + "loss": 2.7483, "step": 2608 }, { - "epoch": 0.78, - "grad_norm": 13.135704040527344, - "learning_rate": 1.4771975543750627e-05, - "loss": 2.2236, + "epoch": 0.33, + "grad_norm": 33.70694351196289, + "learning_rate": 1.7817847132159145e-05, + "loss": 3.7981, "step": 2609 }, { - "epoch": 0.78, - "grad_norm": 10.10952377319336, - "learning_rate": 1.4769970933146239e-05, - "loss": 1.2541, + "epoch": 0.33, + "grad_norm": 11.390097618103027, + "learning_rate": 1.781701041710246e-05, + "loss": 2.6712, "step": 2610 }, { - "epoch": 0.79, - "grad_norm": 14.991072654724121, - "learning_rate": 1.4767966322541847e-05, - "loss": 1.6626, + "epoch": 0.33, + "grad_norm": 116.96737670898438, + "learning_rate": 1.781617370204577e-05, + "loss": 2.0294, "step": 2611 }, { - "epoch": 0.79, - "grad_norm": 18.55371856689453, - "learning_rate": 1.4765961711937457e-05, - "loss": 2.59, + "epoch": 0.33, + "grad_norm": 23.15081787109375, + "learning_rate": 1.7815336986989083e-05, + "loss": 3.2805, "step": 2612 }, { - "epoch": 0.79, - "grad_norm": 153.79713439941406, - "learning_rate": 1.4763957101333066e-05, - "loss": 1.6441, + "epoch": 0.33, + "grad_norm": 25.70244598388672, + "learning_rate": 1.7814500271932396e-05, + "loss": 2.0328, "step": 2613 }, { - "epoch": 0.79, - "grad_norm": 20.791709899902344, - "learning_rate": 1.4761952490728678e-05, - "loss": 2.2488, + "epoch": 0.33, + "grad_norm": 19.2539119720459, + "learning_rate": 1.7813663556875706e-05, + "loss": 2.9757, "step": 2614 }, { - "epoch": 0.79, - "grad_norm": 26.96784019470215, - "learning_rate": 1.4759947880124288e-05, - "loss": 2.4947, + "epoch": 0.33, + "grad_norm": 23.953353881835938, + "learning_rate": 1.781282684181902e-05, + "loss": 1.2766, "step": 2615 }, { - "epoch": 0.79, - "grad_norm": 31.95100975036621, - "learning_rate": 1.4757943269519896e-05, - "loss": 2.5303, + "epoch": 0.33, + "grad_norm": 11.397101402282715, + "learning_rate": 1.7811990126762334e-05, + "loss": 2.3818, "step": 2616 }, { - "epoch": 0.79, - "grad_norm": 16.064725875854492, - "learning_rate": 1.4755938658915508e-05, - "loss": 2.2598, + "epoch": 0.33, + "grad_norm": 20.783130645751953, + "learning_rate": 1.7811153411705647e-05, + "loss": 2.5762, "step": 2617 }, { - "epoch": 0.79, - "grad_norm": 19.35765266418457, - "learning_rate": 1.4753934048311116e-05, - "loss": 2.0552, + "epoch": 0.33, + "grad_norm": 7.296566009521484, + "learning_rate": 1.7810316696648958e-05, + "loss": 0.759, "step": 2618 }, { - "epoch": 0.79, - "grad_norm": 11.326220512390137, - "learning_rate": 1.4751929437706726e-05, - "loss": 2.1031, + "epoch": 0.33, + "grad_norm": 12.399662971496582, + "learning_rate": 1.780947998159227e-05, + "loss": 1.9589, "step": 2619 }, { - "epoch": 0.79, - "grad_norm": 8.919992446899414, - "learning_rate": 1.4749924827102338e-05, - "loss": 1.8168, + "epoch": 0.33, + "grad_norm": 35.2669677734375, + "learning_rate": 1.7808643266535585e-05, + "loss": 3.8683, "step": 2620 }, { - "epoch": 0.79, - "grad_norm": 14.81724739074707, - "learning_rate": 1.4747920216497946e-05, - "loss": 1.9064, + "epoch": 0.33, + "grad_norm": 23.687824249267578, + "learning_rate": 1.7807806551478895e-05, + "loss": 2.1461, "step": 2621 }, { - "epoch": 0.79, - "grad_norm": 31.04239845275879, - "learning_rate": 1.4745915605893557e-05, - "loss": 2.7644, + "epoch": 0.33, + "grad_norm": 21.20037078857422, + "learning_rate": 1.780696983642221e-05, + "loss": 3.3364, "step": 2622 }, { - "epoch": 0.79, - "grad_norm": 16.15270233154297, - "learning_rate": 1.4743910995289165e-05, - "loss": 2.0384, + "epoch": 0.33, + "grad_norm": 16.72637176513672, + "learning_rate": 1.7806133121365523e-05, + "loss": 2.9599, "step": 2623 }, { - "epoch": 0.79, - "grad_norm": 20.225746154785156, - "learning_rate": 1.4741906384684777e-05, - "loss": 2.7575, + "epoch": 0.33, + "grad_norm": 13.701391220092773, + "learning_rate": 1.7805296406308833e-05, + "loss": 4.3635, "step": 2624 }, { - "epoch": 0.79, - "grad_norm": 29.362613677978516, - "learning_rate": 1.4739901774080385e-05, - "loss": 2.0288, + "epoch": 0.33, + "grad_norm": 15.569754600524902, + "learning_rate": 1.7804459691252146e-05, + "loss": 3.1822, "step": 2625 }, { - "epoch": 0.79, - "grad_norm": 18.233638763427734, - "learning_rate": 1.4737897163475995e-05, - "loss": 2.4319, + "epoch": 0.33, + "grad_norm": 9.121126174926758, + "learning_rate": 1.7803622976195457e-05, + "loss": 1.5491, "step": 2626 }, { - "epoch": 0.79, - "grad_norm": 15.207660675048828, - "learning_rate": 1.4735892552871607e-05, - "loss": 2.1672, + "epoch": 0.33, + "grad_norm": 12.470527648925781, + "learning_rate": 1.780278626113877e-05, + "loss": 1.9018, "step": 2627 }, { - "epoch": 0.79, - "grad_norm": 21.931516647338867, - "learning_rate": 1.4733887942267215e-05, - "loss": 2.7001, + "epoch": 0.33, + "grad_norm": 9.061731338500977, + "learning_rate": 1.7801949546082084e-05, + "loss": 1.7634, "step": 2628 }, { - "epoch": 0.79, - "grad_norm": 11.918092727661133, - "learning_rate": 1.4731883331662825e-05, - "loss": 2.1923, + "epoch": 0.33, + "grad_norm": 20.732572555541992, + "learning_rate": 1.7801112831025394e-05, + "loss": 1.6575, "step": 2629 }, { - "epoch": 0.79, - "grad_norm": 24.173839569091797, - "learning_rate": 1.4729878721058436e-05, - "loss": 2.7606, + "epoch": 0.33, + "grad_norm": 18.07527732849121, + "learning_rate": 1.7800276115968708e-05, + "loss": 3.7961, "step": 2630 }, { - "epoch": 0.79, - "grad_norm": 18.89200782775879, - "learning_rate": 1.4727874110454046e-05, - "loss": 2.3708, + "epoch": 0.33, + "grad_norm": 12.62188720703125, + "learning_rate": 1.779943940091202e-05, + "loss": 1.3397, "step": 2631 }, { - "epoch": 0.79, - "grad_norm": 18.636999130249023, - "learning_rate": 1.4725869499849654e-05, - "loss": 2.3505, + "epoch": 0.33, + "grad_norm": 6.171538352966309, + "learning_rate": 1.7798602685855332e-05, + "loss": 0.4798, "step": 2632 }, { - "epoch": 0.79, - "grad_norm": 24.666654586791992, - "learning_rate": 1.4723864889245266e-05, - "loss": 2.4304, + "epoch": 0.33, + "grad_norm": 8.710653305053711, + "learning_rate": 1.7797765970798645e-05, + "loss": 1.3855, "step": 2633 }, { - "epoch": 0.79, - "grad_norm": 10.553362846374512, - "learning_rate": 1.4721860278640876e-05, - "loss": 1.8944, + "epoch": 0.33, + "grad_norm": 9.380043029785156, + "learning_rate": 1.779692925574196e-05, + "loss": 0.9739, "step": 2634 }, { - "epoch": 0.79, - "grad_norm": 21.21739387512207, - "learning_rate": 1.4719855668036484e-05, - "loss": 2.6751, + "epoch": 0.33, + "grad_norm": 17.16081428527832, + "learning_rate": 1.779609254068527e-05, + "loss": 1.7828, "step": 2635 }, { - "epoch": 0.79, - "grad_norm": 38.01958465576172, - "learning_rate": 1.4717851057432096e-05, - "loss": 1.8805, + "epoch": 0.33, + "grad_norm": 16.555482864379883, + "learning_rate": 1.7795255825628583e-05, + "loss": 3.708, "step": 2636 }, { - "epoch": 0.79, - "grad_norm": 16.428316116333008, - "learning_rate": 1.4715846446827704e-05, - "loss": 2.9402, + "epoch": 0.33, + "grad_norm": 13.493598937988281, + "learning_rate": 1.7794419110571897e-05, + "loss": 1.899, "step": 2637 }, { - "epoch": 0.79, - "grad_norm": 22.54930305480957, - "learning_rate": 1.4713841836223315e-05, - "loss": 1.9364, + "epoch": 0.33, + "grad_norm": 8.096471786499023, + "learning_rate": 1.779358239551521e-05, + "loss": 1.2152, "step": 2638 }, { - "epoch": 0.79, - "grad_norm": 14.97050952911377, - "learning_rate": 1.4711837225618926e-05, - "loss": 1.7904, + "epoch": 0.33, + "grad_norm": 8.873205184936523, + "learning_rate": 1.779274568045852e-05, + "loss": 2.4998, "step": 2639 }, { - "epoch": 0.79, - "grad_norm": 16.823516845703125, - "learning_rate": 1.4709832615014535e-05, - "loss": 2.2004, - "step": 2640 - }, - { - "epoch": 0.79, - "eval_loss": 0.28386011719703674, - "eval_runtime": 44.8703, - "eval_samples_per_second": 32.962, - "eval_steps_per_second": 32.962, + "epoch": 0.33, + "grad_norm": 7.485636234283447, + "learning_rate": 1.7791908965401834e-05, + "loss": 1.4641, "step": 2640 }, { - "epoch": 0.79, - "grad_norm": 14.531737327575684, - "learning_rate": 1.4707828004410145e-05, - "loss": 2.1963, + "epoch": 0.33, + "grad_norm": 13.725582122802734, + "learning_rate": 1.7791072250345148e-05, + "loss": 1.7835, "step": 2641 }, { - "epoch": 0.79, - "grad_norm": 24.638132095336914, - "learning_rate": 1.4705823393805753e-05, - "loss": 2.2464, + "epoch": 0.33, + "grad_norm": 16.9346923828125, + "learning_rate": 1.7790235535288458e-05, + "loss": 2.8712, "step": 2642 }, { - "epoch": 0.79, - "grad_norm": 17.104135513305664, - "learning_rate": 1.4703818783201365e-05, - "loss": 2.0654, + "epoch": 0.33, + "grad_norm": 25.24087905883789, + "learning_rate": 1.7789398820231772e-05, + "loss": 3.9757, "step": 2643 }, { - "epoch": 0.79, - "grad_norm": 7.9517621994018555, - "learning_rate": 1.4701814172596973e-05, - "loss": 0.8721, + "epoch": 0.33, + "grad_norm": 24.99532127380371, + "learning_rate": 1.7788562105175085e-05, + "loss": 1.4441, "step": 2644 }, { - "epoch": 0.8, - "grad_norm": 23.127620697021484, - "learning_rate": 1.4699809561992583e-05, - "loss": 2.0121, + "epoch": 0.33, + "grad_norm": 10.38292121887207, + "learning_rate": 1.77877253901184e-05, + "loss": 1.3967, "step": 2645 }, { - "epoch": 0.8, - "grad_norm": 18.799203872680664, - "learning_rate": 1.4697804951388195e-05, - "loss": 2.0454, + "epoch": 0.33, + "grad_norm": 18.216909408569336, + "learning_rate": 1.778688867506171e-05, + "loss": 2.1929, "step": 2646 }, { - "epoch": 0.8, - "grad_norm": 22.311729431152344, - "learning_rate": 1.4695800340783804e-05, - "loss": 2.1801, + "epoch": 0.33, + "grad_norm": 9.85275936126709, + "learning_rate": 1.7786051960005023e-05, + "loss": 2.047, "step": 2647 }, { - "epoch": 0.8, - "grad_norm": 17.162185668945312, - "learning_rate": 1.4693795730179414e-05, - "loss": 1.814, + "epoch": 0.33, + "grad_norm": 12.772254943847656, + "learning_rate": 1.7785215244948337e-05, + "loss": 1.4923, "step": 2648 }, { - "epoch": 0.8, - "grad_norm": 9.871721267700195, - "learning_rate": 1.4691791119575024e-05, - "loss": 1.6796, + "epoch": 0.33, + "grad_norm": 13.479397773742676, + "learning_rate": 1.7784378529891647e-05, + "loss": 2.3109, "step": 2649 }, { - "epoch": 0.8, - "grad_norm": 18.128135681152344, - "learning_rate": 1.4689786508970634e-05, - "loss": 2.3865, + "epoch": 0.33, + "grad_norm": 28.885215759277344, + "learning_rate": 1.778354181483496e-05, + "loss": 2.2379, "step": 2650 }, { - "epoch": 0.8, - "grad_norm": 15.663517951965332, - "learning_rate": 1.4687781898366244e-05, - "loss": 1.9406, + "epoch": 0.33, + "grad_norm": 12.536259651184082, + "learning_rate": 1.778270509977827e-05, + "loss": 2.3095, "step": 2651 }, { - "epoch": 0.8, - "grad_norm": 37.457672119140625, - "learning_rate": 1.4685777287761854e-05, - "loss": 2.7683, + "epoch": 0.33, + "grad_norm": 12.961633682250977, + "learning_rate": 1.7781868384721584e-05, + "loss": 1.6991, "step": 2652 }, { - "epoch": 0.8, - "grad_norm": 19.439468383789062, - "learning_rate": 1.4683772677157464e-05, - "loss": 2.354, + "epoch": 0.33, + "grad_norm": 10.719265937805176, + "learning_rate": 1.7781031669664898e-05, + "loss": 1.0747, "step": 2653 }, { - "epoch": 0.8, - "grad_norm": 29.902244567871094, - "learning_rate": 1.4681768066553072e-05, - "loss": 1.993, + "epoch": 0.33, + "grad_norm": 31.90835952758789, + "learning_rate": 1.778019495460821e-05, + "loss": 3.2469, "step": 2654 }, { - "epoch": 0.8, - "grad_norm": 11.236685752868652, - "learning_rate": 1.4679763455948684e-05, - "loss": 1.8682, + "epoch": 0.33, + "grad_norm": 20.774629592895508, + "learning_rate": 1.7779358239551522e-05, + "loss": 1.533, "step": 2655 }, { - "epoch": 0.8, - "grad_norm": 11.110152244567871, - "learning_rate": 1.4677758845344293e-05, - "loss": 1.378, + "epoch": 0.33, + "grad_norm": 7.061487674713135, + "learning_rate": 1.7778521524494832e-05, + "loss": 3.181, "step": 2656 }, { - "epoch": 0.8, - "grad_norm": 15.20004653930664, - "learning_rate": 1.4675754234739903e-05, - "loss": 2.4361, + "epoch": 0.33, + "grad_norm": 11.634260177612305, + "learning_rate": 1.7777684809438146e-05, + "loss": 1.6974, "step": 2657 }, { - "epoch": 0.8, - "grad_norm": 44.016544342041016, - "learning_rate": 1.4673749624135514e-05, - "loss": 2.6364, + "epoch": 0.33, + "grad_norm": 15.743753433227539, + "learning_rate": 1.777684809438146e-05, + "loss": 2.3629, "step": 2658 }, { - "epoch": 0.8, - "grad_norm": 18.134666442871094, - "learning_rate": 1.4671745013531123e-05, - "loss": 2.1308, + "epoch": 0.33, + "grad_norm": 10.4588041305542, + "learning_rate": 1.7776011379324773e-05, + "loss": 1.0229, "step": 2659 }, { - "epoch": 0.8, - "grad_norm": 83.64218139648438, - "learning_rate": 1.4669740402926733e-05, - "loss": 3.793, + "epoch": 0.33, + "grad_norm": 23.603330612182617, + "learning_rate": 1.7775174664268083e-05, + "loss": 2.5311, "step": 2660 }, { - "epoch": 0.8, - "grad_norm": 18.634946823120117, - "learning_rate": 1.4667735792322341e-05, - "loss": 2.0182, + "epoch": 0.33, + "grad_norm": 12.474515914916992, + "learning_rate": 1.7774337949211397e-05, + "loss": 2.6368, "step": 2661 }, { - "epoch": 0.8, - "grad_norm": 46.72085189819336, - "learning_rate": 1.4665731181717953e-05, - "loss": 2.1095, + "epoch": 0.33, + "grad_norm": 15.076018333435059, + "learning_rate": 1.777350123415471e-05, + "loss": 2.7853, "step": 2662 }, { - "epoch": 0.8, - "grad_norm": 18.656938552856445, - "learning_rate": 1.4663726571113563e-05, - "loss": 2.4388, + "epoch": 0.33, + "grad_norm": 11.735377311706543, + "learning_rate": 1.777266451909802e-05, + "loss": 3.4325, "step": 2663 }, { - "epoch": 0.8, - "grad_norm": 16.998310089111328, - "learning_rate": 1.4661721960509172e-05, - "loss": 1.9669, + "epoch": 0.33, + "grad_norm": 12.364471435546875, + "learning_rate": 1.7771827804041335e-05, + "loss": 1.7619, "step": 2664 }, { - "epoch": 0.8, - "grad_norm": 25.790451049804688, - "learning_rate": 1.4659717349904783e-05, - "loss": 2.6492, + "epoch": 0.33, + "grad_norm": 12.087916374206543, + "learning_rate": 1.777099108898465e-05, + "loss": 0.9063, "step": 2665 }, { - "epoch": 0.8, - "grad_norm": 11.008011817932129, - "learning_rate": 1.4657712739300392e-05, - "loss": 1.926, + "epoch": 0.33, + "grad_norm": 9.966253280639648, + "learning_rate": 1.7770154373927962e-05, + "loss": 2.5133, "step": 2666 }, { - "epoch": 0.8, - "grad_norm": 8.969502449035645, - "learning_rate": 1.4655708128696002e-05, - "loss": 0.8433, + "epoch": 0.33, + "grad_norm": 13.330294609069824, + "learning_rate": 1.7769317658871272e-05, + "loss": 2.1951, "step": 2667 }, { - "epoch": 0.8, - "grad_norm": 24.146800994873047, - "learning_rate": 1.465370351809161e-05, - "loss": 1.7039, + "epoch": 0.33, + "grad_norm": 10.899292945861816, + "learning_rate": 1.7768480943814586e-05, + "loss": 2.7435, "step": 2668 }, { - "epoch": 0.8, - "grad_norm": 32.524452209472656, - "learning_rate": 1.4651698907487222e-05, - "loss": 2.2074, + "epoch": 0.33, + "grad_norm": 17.162519454956055, + "learning_rate": 1.77676442287579e-05, + "loss": 2.4719, "step": 2669 }, { - "epoch": 0.8, - "grad_norm": 17.53997802734375, - "learning_rate": 1.4649694296882832e-05, - "loss": 2.1067, + "epoch": 0.34, + "grad_norm": 8.715301513671875, + "learning_rate": 1.776680751370121e-05, + "loss": 0.6385, "step": 2670 }, { - "epoch": 0.8, - "grad_norm": 70.99577331542969, - "learning_rate": 1.464768968627844e-05, - "loss": 2.5416, + "epoch": 0.34, + "grad_norm": 20.172853469848633, + "learning_rate": 1.7765970798644523e-05, + "loss": 2.5166, "step": 2671 }, { - "epoch": 0.8, - "grad_norm": 15.582018852233887, - "learning_rate": 1.4645685075674052e-05, - "loss": 2.4462, + "epoch": 0.34, + "grad_norm": 11.463748931884766, + "learning_rate": 1.7765134083587837e-05, + "loss": 2.1591, "step": 2672 }, { - "epoch": 0.8, - "grad_norm": 26.792186737060547, - "learning_rate": 1.464368046506966e-05, - "loss": 1.7403, + "epoch": 0.34, + "grad_norm": 15.173055648803711, + "learning_rate": 1.776429736853115e-05, + "loss": 1.7884, "step": 2673 }, { - "epoch": 0.8, - "grad_norm": 13.64623737335205, - "learning_rate": 1.464167585446527e-05, - "loss": 2.7516, + "epoch": 0.34, + "grad_norm": 21.31320571899414, + "learning_rate": 1.776346065347446e-05, + "loss": 3.175, "step": 2674 }, { - "epoch": 0.8, - "grad_norm": 17.885374069213867, - "learning_rate": 1.463967124386088e-05, - "loss": 2.9638, + "epoch": 0.34, + "grad_norm": 12.708723068237305, + "learning_rate": 1.7762623938417775e-05, + "loss": 1.8536, "step": 2675 }, { - "epoch": 0.8, - "grad_norm": 11.002893447875977, - "learning_rate": 1.4637666633256491e-05, - "loss": 1.5192, + "epoch": 0.34, + "grad_norm": 9.75171947479248, + "learning_rate": 1.7761787223361088e-05, + "loss": 1.2302, "step": 2676 }, { - "epoch": 0.8, - "grad_norm": 18.869712829589844, - "learning_rate": 1.4635662022652101e-05, - "loss": 2.5806, + "epoch": 0.34, + "grad_norm": 12.269949913024902, + "learning_rate": 1.77609505083044e-05, + "loss": 2.3773, "step": 2677 }, { - "epoch": 0.81, - "grad_norm": 13.818095207214355, - "learning_rate": 1.4633657412047711e-05, - "loss": 2.1488, + "epoch": 0.34, + "grad_norm": 14.616438865661621, + "learning_rate": 1.7760113793247712e-05, + "loss": 2.2528, "step": 2678 }, { - "epoch": 0.81, - "grad_norm": 28.199312210083008, - "learning_rate": 1.4631652801443321e-05, - "loss": 1.8242, + "epoch": 0.34, + "grad_norm": 10.644914627075195, + "learning_rate": 1.7759277078191022e-05, + "loss": 1.71, "step": 2679 }, { - "epoch": 0.81, - "grad_norm": 19.68671417236328, - "learning_rate": 1.462964819083893e-05, - "loss": 1.5373, + "epoch": 0.34, + "grad_norm": 13.962441444396973, + "learning_rate": 1.7758440363134336e-05, + "loss": 1.6699, "step": 2680 }, { - "epoch": 0.81, - "grad_norm": 36.305686950683594, - "learning_rate": 1.4627643580234541e-05, - "loss": 3.0527, + "epoch": 0.34, + "grad_norm": 33.41127395629883, + "learning_rate": 1.775760364807765e-05, + "loss": 1.6907, "step": 2681 }, { - "epoch": 0.81, - "grad_norm": 22.186141967773438, - "learning_rate": 1.4625638969630151e-05, - "loss": 3.4701, + "epoch": 0.34, + "grad_norm": 11.684005737304688, + "learning_rate": 1.775676693302096e-05, + "loss": 0.837, "step": 2682 }, { - "epoch": 0.81, - "grad_norm": 17.44498634338379, - "learning_rate": 1.462363435902576e-05, - "loss": 2.0777, + "epoch": 0.34, + "grad_norm": 11.661955833435059, + "learning_rate": 1.7755930217964274e-05, + "loss": 1.3837, "step": 2683 }, { - "epoch": 0.81, - "grad_norm": 25.54764747619629, - "learning_rate": 1.4621629748421372e-05, - "loss": 1.9166, + "epoch": 0.34, + "grad_norm": 20.234384536743164, + "learning_rate": 1.7755093502907584e-05, + "loss": 3.4401, "step": 2684 }, { - "epoch": 0.81, - "grad_norm": 15.869314193725586, - "learning_rate": 1.461962513781698e-05, - "loss": 2.0511, + "epoch": 0.34, + "grad_norm": 11.128459930419922, + "learning_rate": 1.7754256787850898e-05, + "loss": 1.632, "step": 2685 }, { - "epoch": 0.81, - "grad_norm": 16.541519165039062, - "learning_rate": 1.461762052721259e-05, - "loss": 2.2221, + "epoch": 0.34, + "grad_norm": 7.553885459899902, + "learning_rate": 1.775342007279421e-05, + "loss": 1.9309, "step": 2686 }, { - "epoch": 0.81, - "grad_norm": 12.096132278442383, - "learning_rate": 1.4615615916608198e-05, - "loss": 2.2477, + "epoch": 0.34, + "grad_norm": 17.167726516723633, + "learning_rate": 1.775258335773752e-05, + "loss": 0.35, "step": 2687 }, { - "epoch": 0.81, - "grad_norm": 32.8545036315918, - "learning_rate": 1.461361130600381e-05, - "loss": 2.4819, + "epoch": 0.34, + "grad_norm": 16.623626708984375, + "learning_rate": 1.7751746642680835e-05, + "loss": 2.3347, "step": 2688 }, { - "epoch": 0.81, - "grad_norm": 17.078535079956055, - "learning_rate": 1.461160669539942e-05, - "loss": 2.5493, + "epoch": 0.34, + "grad_norm": 15.077774047851562, + "learning_rate": 1.775090992762415e-05, + "loss": 2.5027, "step": 2689 }, { - "epoch": 0.81, - "grad_norm": 20.856630325317383, - "learning_rate": 1.4609602084795029e-05, - "loss": 2.9245, + "epoch": 0.34, + "grad_norm": 17.26755142211914, + "learning_rate": 1.7750073212567462e-05, + "loss": 1.7269, "step": 2690 }, { - "epoch": 0.81, - "grad_norm": 30.489768981933594, - "learning_rate": 1.460759747419064e-05, - "loss": 2.2884, + "epoch": 0.34, + "grad_norm": 9.179740905761719, + "learning_rate": 1.7749236497510773e-05, + "loss": 1.4656, "step": 2691 }, { - "epoch": 0.81, - "grad_norm": 13.105022430419922, - "learning_rate": 1.4605592863586249e-05, - "loss": 2.3658, + "epoch": 0.34, + "grad_norm": 13.58669376373291, + "learning_rate": 1.7748399782454086e-05, + "loss": 3.1928, "step": 2692 }, { - "epoch": 0.81, - "grad_norm": 14.78238296508789, - "learning_rate": 1.4603588252981859e-05, - "loss": 2.5098, + "epoch": 0.34, + "grad_norm": 14.194979667663574, + "learning_rate": 1.77475630673974e-05, + "loss": 3.3585, "step": 2693 }, { - "epoch": 0.81, - "grad_norm": 17.489654541015625, - "learning_rate": 1.460158364237747e-05, - "loss": 1.8979, + "epoch": 0.34, + "grad_norm": 24.152509689331055, + "learning_rate": 1.774672635234071e-05, + "loss": 1.9617, "step": 2694 }, { - "epoch": 0.81, - "grad_norm": 15.65456771850586, - "learning_rate": 1.4599579031773079e-05, - "loss": 2.2142, + "epoch": 0.34, + "grad_norm": 20.574277877807617, + "learning_rate": 1.7745889637284024e-05, + "loss": 1.93, "step": 2695 }, { - "epoch": 0.81, - "grad_norm": 24.52695655822754, - "learning_rate": 1.459757442116869e-05, - "loss": 2.7726, + "epoch": 0.34, + "grad_norm": 15.949616432189941, + "learning_rate": 1.7745052922227338e-05, + "loss": 1.8353, "step": 2696 }, { - "epoch": 0.81, - "grad_norm": 33.155067443847656, - "learning_rate": 1.4595569810564298e-05, - "loss": 2.2196, + "epoch": 0.34, + "grad_norm": 14.588492393493652, + "learning_rate": 1.774421620717065e-05, + "loss": 1.7303, "step": 2697 }, { - "epoch": 0.81, - "grad_norm": 9.755550384521484, - "learning_rate": 1.459356519995991e-05, - "loss": 1.7204, + "epoch": 0.34, + "grad_norm": 11.943470001220703, + "learning_rate": 1.774337949211396e-05, + "loss": 2.2723, "step": 2698 }, { - "epoch": 0.81, - "grad_norm": 19.83466911315918, - "learning_rate": 1.4591560589355518e-05, - "loss": 2.5923, + "epoch": 0.34, + "grad_norm": 10.931513786315918, + "learning_rate": 1.7742542777057275e-05, + "loss": 1.7117, "step": 2699 }, { - "epoch": 0.81, - "grad_norm": 21.58110809326172, - "learning_rate": 1.458955597875113e-05, - "loss": 1.7443, + "epoch": 0.34, + "grad_norm": 11.123241424560547, + "learning_rate": 1.774170606200059e-05, + "loss": 1.2408, "step": 2700 }, { - "epoch": 0.81, - "grad_norm": 20.373117446899414, - "learning_rate": 1.458755136814674e-05, - "loss": 2.949, + "epoch": 0.34, + "grad_norm": 24.709917068481445, + "learning_rate": 1.77408693469439e-05, + "loss": 2.841, "step": 2701 }, { - "epoch": 0.81, - "grad_norm": 12.08033275604248, - "learning_rate": 1.4585546757542348e-05, - "loss": 1.7417, + "epoch": 0.34, + "grad_norm": 75.23438262939453, + "learning_rate": 1.7740032631887213e-05, + "loss": 1.7636, "step": 2702 }, { - "epoch": 0.81, - "grad_norm": 16.984411239624023, - "learning_rate": 1.458354214693796e-05, - "loss": 1.973, + "epoch": 0.34, + "grad_norm": 19.17953872680664, + "learning_rate": 1.7739195916830526e-05, + "loss": 3.1879, "step": 2703 }, { - "epoch": 0.81, - "grad_norm": 24.945363998413086, - "learning_rate": 1.4581537536333568e-05, - "loss": 1.7648, + "epoch": 0.34, + "grad_norm": 12.488292694091797, + "learning_rate": 1.7738359201773837e-05, + "loss": 2.1714, "step": 2704 }, { - "epoch": 0.81, - "grad_norm": 18.574451446533203, - "learning_rate": 1.4579532925729178e-05, - "loss": 1.326, + "epoch": 0.34, + "grad_norm": 18.393566131591797, + "learning_rate": 1.773752248671715e-05, + "loss": 3.5136, "step": 2705 }, { - "epoch": 0.81, - "grad_norm": 32.63951873779297, - "learning_rate": 1.457752831512479e-05, - "loss": 2.0216, + "epoch": 0.34, + "grad_norm": 20.458900451660156, + "learning_rate": 1.7736685771660464e-05, + "loss": 1.256, "step": 2706 }, { - "epoch": 0.81, - "grad_norm": 20.653427124023438, - "learning_rate": 1.4575523704520398e-05, - "loss": 2.0913, + "epoch": 0.34, + "grad_norm": 10.179489135742188, + "learning_rate": 1.7735849056603774e-05, + "loss": 0.9635, "step": 2707 }, { - "epoch": 0.81, - "grad_norm": 41.5728645324707, - "learning_rate": 1.4573519093916009e-05, - "loss": 3.4579, + "epoch": 0.34, + "grad_norm": 15.560358047485352, + "learning_rate": 1.7735012341547088e-05, + "loss": 2.8496, "step": 2708 }, { - "epoch": 0.81, - "grad_norm": 12.26309585571289, - "learning_rate": 1.4571514483311617e-05, - "loss": 1.4278, + "epoch": 0.34, + "grad_norm": 14.694575309753418, + "learning_rate": 1.7734175626490398e-05, + "loss": 1.7613, "step": 2709 }, { - "epoch": 0.81, - "grad_norm": 17.044031143188477, - "learning_rate": 1.4569509872707229e-05, - "loss": 1.7787, + "epoch": 0.34, + "grad_norm": 11.86589527130127, + "learning_rate": 1.7733338911433712e-05, + "loss": 1.9254, "step": 2710 }, { - "epoch": 0.82, - "grad_norm": 22.376020431518555, - "learning_rate": 1.4567505262102837e-05, - "loss": 2.2045, + "epoch": 0.34, + "grad_norm": 7.738234519958496, + "learning_rate": 1.7732502196377025e-05, + "loss": 0.7106, "step": 2711 }, { - "epoch": 0.82, - "grad_norm": 15.766566276550293, - "learning_rate": 1.4565500651498447e-05, - "loss": 2.087, + "epoch": 0.34, + "grad_norm": 35.35801315307617, + "learning_rate": 1.7731665481320336e-05, + "loss": 2.4502, "step": 2712 }, { - "epoch": 0.82, - "grad_norm": 10.13654899597168, - "learning_rate": 1.4563496040894059e-05, - "loss": 2.7784, + "epoch": 0.34, + "grad_norm": 15.700124740600586, + "learning_rate": 1.773082876626365e-05, + "loss": 2.1184, "step": 2713 }, { - "epoch": 0.82, - "grad_norm": 12.07814884185791, - "learning_rate": 1.4561491430289667e-05, - "loss": 2.2884, + "epoch": 0.34, + "grad_norm": 4.762723445892334, + "learning_rate": 1.7729992051206963e-05, + "loss": 0.1881, "step": 2714 }, { - "epoch": 0.82, - "grad_norm": 15.864092826843262, - "learning_rate": 1.4559486819685277e-05, - "loss": 2.1431, + "epoch": 0.34, + "grad_norm": 16.25357437133789, + "learning_rate": 1.7729155336150273e-05, + "loss": 1.4104, "step": 2715 }, { - "epoch": 0.82, - "grad_norm": 18.540136337280273, - "learning_rate": 1.4557482209080886e-05, - "loss": 2.4553, + "epoch": 0.34, + "grad_norm": 19.92592430114746, + "learning_rate": 1.7728318621093587e-05, + "loss": 3.8268, "step": 2716 }, { - "epoch": 0.82, - "grad_norm": 9.907877922058105, - "learning_rate": 1.4555477598476498e-05, - "loss": 1.6814, + "epoch": 0.34, + "grad_norm": 12.886872291564941, + "learning_rate": 1.77274819060369e-05, + "loss": 3.1179, "step": 2717 }, { - "epoch": 0.82, - "grad_norm": 9.542145729064941, - "learning_rate": 1.4553472987872106e-05, - "loss": 1.1338, + "epoch": 0.34, + "grad_norm": 14.823153495788574, + "learning_rate": 1.7726645190980214e-05, + "loss": 0.8029, "step": 2718 }, { - "epoch": 0.82, - "grad_norm": 18.332304000854492, - "learning_rate": 1.4551468377267716e-05, - "loss": 1.7549, + "epoch": 0.34, + "grad_norm": 14.305251121520996, + "learning_rate": 1.7725808475923524e-05, + "loss": 3.3145, "step": 2719 }, { - "epoch": 0.82, - "grad_norm": 9.094773292541504, - "learning_rate": 1.4549463766663328e-05, - "loss": 1.1251, + "epoch": 0.34, + "grad_norm": 14.948331832885742, + "learning_rate": 1.7724971760866838e-05, + "loss": 2.4797, "step": 2720 }, { - "epoch": 0.82, - "grad_norm": 15.948875427246094, - "learning_rate": 1.4547459156058936e-05, - "loss": 1.5578, + "epoch": 0.34, + "grad_norm": 6.8798699378967285, + "learning_rate": 1.772413504581015e-05, + "loss": 0.9005, "step": 2721 }, { - "epoch": 0.82, - "grad_norm": 13.107511520385742, - "learning_rate": 1.4545454545454546e-05, - "loss": 1.8671, + "epoch": 0.34, + "grad_norm": 18.058286666870117, + "learning_rate": 1.7723298330753462e-05, + "loss": 1.6352, "step": 2722 }, { - "epoch": 0.82, - "grad_norm": 17.999923706054688, - "learning_rate": 1.4543449934850156e-05, - "loss": 2.1096, + "epoch": 0.34, + "grad_norm": 11.84815502166748, + "learning_rate": 1.7722461615696776e-05, + "loss": 1.6202, "step": 2723 }, { - "epoch": 0.82, - "grad_norm": 22.923246383666992, - "learning_rate": 1.4541445324245767e-05, - "loss": 2.1625, + "epoch": 0.34, + "grad_norm": 12.707499504089355, + "learning_rate": 1.772162490064009e-05, + "loss": 1.5351, "step": 2724 }, { - "epoch": 0.82, - "grad_norm": 12.2493314743042, - "learning_rate": 1.4539440713641377e-05, - "loss": 2.9069, + "epoch": 0.34, + "grad_norm": 12.71505355834961, + "learning_rate": 1.7720788185583403e-05, + "loss": 2.962, "step": 2725 }, { - "epoch": 0.82, - "grad_norm": 20.940412521362305, - "learning_rate": 1.4537436103036987e-05, - "loss": 2.6205, + "epoch": 0.34, + "grad_norm": 14.472443580627441, + "learning_rate": 1.7719951470526713e-05, + "loss": 2.7893, "step": 2726 }, { - "epoch": 0.82, - "grad_norm": 28.031394958496094, - "learning_rate": 1.4535431492432597e-05, - "loss": 2.167, + "epoch": 0.34, + "grad_norm": 11.948232650756836, + "learning_rate": 1.7719114755470027e-05, + "loss": 1.3851, "step": 2727 }, { - "epoch": 0.82, - "grad_norm": 12.496899604797363, - "learning_rate": 1.4533426881828205e-05, - "loss": 2.8417, + "epoch": 0.34, + "grad_norm": 26.376602172851562, + "learning_rate": 1.771827804041334e-05, + "loss": 2.3619, "step": 2728 }, { - "epoch": 0.82, - "grad_norm": 15.856078147888184, - "learning_rate": 1.4531422271223817e-05, - "loss": 2.5802, + "epoch": 0.34, + "grad_norm": 14.06189250946045, + "learning_rate": 1.771744132535665e-05, + "loss": 1.9028, "step": 2729 }, { - "epoch": 0.82, - "grad_norm": 19.049468994140625, - "learning_rate": 1.4529417660619425e-05, - "loss": 2.2789, + "epoch": 0.34, + "grad_norm": 13.420397758483887, + "learning_rate": 1.7716604610299964e-05, + "loss": 2.0853, "step": 2730 }, { - "epoch": 0.82, - "grad_norm": 16.88075828552246, - "learning_rate": 1.4527413050015035e-05, - "loss": 1.6496, + "epoch": 0.34, + "grad_norm": 12.996766090393066, + "learning_rate": 1.7715767895243278e-05, + "loss": 2.9761, "step": 2731 }, { - "epoch": 0.82, - "grad_norm": 23.054603576660156, - "learning_rate": 1.4525408439410647e-05, - "loss": 1.8223, + "epoch": 0.34, + "grad_norm": 11.15153980255127, + "learning_rate": 1.7714931180186588e-05, + "loss": 1.5978, "step": 2732 }, { - "epoch": 0.82, - "grad_norm": 27.310646057128906, - "learning_rate": 1.4523403828806256e-05, - "loss": 2.5239, + "epoch": 0.34, + "grad_norm": 24.499765396118164, + "learning_rate": 1.7714094465129902e-05, + "loss": 2.8331, "step": 2733 }, { - "epoch": 0.82, - "grad_norm": 20.423734664916992, - "learning_rate": 1.4521399218201866e-05, - "loss": 2.2311, + "epoch": 0.34, + "grad_norm": 125.93241882324219, + "learning_rate": 1.7713257750073216e-05, + "loss": 1.9642, "step": 2734 }, { - "epoch": 0.82, - "grad_norm": 13.929975509643555, - "learning_rate": 1.4519394607597474e-05, - "loss": 2.5546, + "epoch": 0.34, + "grad_norm": 21.821949005126953, + "learning_rate": 1.7712421035016526e-05, + "loss": 2.0355, "step": 2735 }, { - "epoch": 0.82, - "grad_norm": 12.425309181213379, - "learning_rate": 1.4517389996993086e-05, - "loss": 1.4081, + "epoch": 0.34, + "grad_norm": 12.31302261352539, + "learning_rate": 1.771158431995984e-05, + "loss": 1.9599, "step": 2736 }, { - "epoch": 0.82, - "grad_norm": 11.25351333618164, - "learning_rate": 1.4515385386388696e-05, - "loss": 1.7953, + "epoch": 0.34, + "grad_norm": 12.336325645446777, + "learning_rate": 1.771074760490315e-05, + "loss": 2.4827, "step": 2737 }, { - "epoch": 0.82, - "grad_norm": 19.53854751586914, - "learning_rate": 1.4513380775784304e-05, - "loss": 1.207, + "epoch": 0.34, + "grad_norm": 11.652437210083008, + "learning_rate": 1.7709910889846463e-05, + "loss": 2.3775, "step": 2738 }, { - "epoch": 0.82, - "grad_norm": 22.41812515258789, - "learning_rate": 1.4511376165179916e-05, - "loss": 2.4787, + "epoch": 0.34, + "grad_norm": 7.911839008331299, + "learning_rate": 1.7709074174789777e-05, + "loss": 1.0462, "step": 2739 }, { - "epoch": 0.82, - "grad_norm": 21.234926223754883, - "learning_rate": 1.4509371554575524e-05, - "loss": 2.2675, + "epoch": 0.34, + "grad_norm": 11.181539535522461, + "learning_rate": 1.7708237459733087e-05, + "loss": 1.6236, "step": 2740 }, { - "epoch": 0.82, - "grad_norm": 19.57403564453125, - "learning_rate": 1.4507366943971135e-05, - "loss": 2.3916, + "epoch": 0.34, + "grad_norm": 21.37649154663086, + "learning_rate": 1.77074007446764e-05, + "loss": 2.3584, "step": 2741 }, { - "epoch": 0.82, - "grad_norm": 25.845134735107422, - "learning_rate": 1.4505362333366743e-05, - "loss": 2.4502, + "epoch": 0.34, + "grad_norm": 8.938224792480469, + "learning_rate": 1.7706564029619715e-05, + "loss": 2.2325, "step": 2742 }, { - "epoch": 0.82, - "grad_norm": 42.33112716674805, - "learning_rate": 1.4503357722762355e-05, - "loss": 2.0345, + "epoch": 0.34, + "grad_norm": 17.55942153930664, + "learning_rate": 1.7705727314563025e-05, + "loss": 1.9935, "step": 2743 }, { - "epoch": 0.83, - "grad_norm": 15.049954414367676, - "learning_rate": 1.4501353112157965e-05, - "loss": 2.8193, + "epoch": 0.34, + "grad_norm": 31.897920608520508, + "learning_rate": 1.770489059950634e-05, + "loss": 1.6684, "step": 2744 }, { - "epoch": 0.83, - "grad_norm": 25.87766456604004, - "learning_rate": 1.4499348501553573e-05, - "loss": 2.2518, + "epoch": 0.34, + "grad_norm": 23.712093353271484, + "learning_rate": 1.7704053884449652e-05, + "loss": 1.8706, "step": 2745 }, { - "epoch": 0.83, - "grad_norm": 21.152681350708008, - "learning_rate": 1.4497343890949185e-05, - "loss": 2.3211, + "epoch": 0.34, + "grad_norm": 14.985076904296875, + "learning_rate": 1.7703217169392966e-05, + "loss": 2.9731, "step": 2746 }, { - "epoch": 0.83, - "grad_norm": 36.79154586791992, - "learning_rate": 1.4495339280344793e-05, - "loss": 2.1298, + "epoch": 0.34, + "grad_norm": 25.487062454223633, + "learning_rate": 1.7702380454336276e-05, + "loss": 2.1705, "step": 2747 }, { - "epoch": 0.83, - "grad_norm": 9.398030281066895, - "learning_rate": 1.4493334669740403e-05, - "loss": 1.8969, + "epoch": 0.34, + "grad_norm": 11.097169876098633, + "learning_rate": 1.770154373927959e-05, + "loss": 1.2433, "step": 2748 }, { - "epoch": 0.83, - "grad_norm": 20.946956634521484, - "learning_rate": 1.4491330059136015e-05, - "loss": 2.3464, + "epoch": 0.34, + "grad_norm": 7.978656768798828, + "learning_rate": 1.7700707024222903e-05, + "loss": 1.1207, "step": 2749 }, { - "epoch": 0.83, - "grad_norm": 9.130115509033203, - "learning_rate": 1.4489325448531624e-05, - "loss": 1.7633, + "epoch": 0.35, + "grad_norm": 7.823447227478027, + "learning_rate": 1.7699870309166214e-05, + "loss": 1.0118, "step": 2750 }, { - "epoch": 0.83, - "grad_norm": 14.199234962463379, - "learning_rate": 1.4487320837927234e-05, - "loss": 1.8352, + "epoch": 0.35, + "grad_norm": 15.047630310058594, + "learning_rate": 1.7699033594109527e-05, + "loss": 1.2199, "step": 2751 }, { - "epoch": 0.83, - "grad_norm": 8.78415298461914, - "learning_rate": 1.4485316227322844e-05, - "loss": 1.6194, + "epoch": 0.35, + "grad_norm": 14.528481483459473, + "learning_rate": 1.769819687905284e-05, + "loss": 1.4999, "step": 2752 }, { - "epoch": 0.83, - "grad_norm": 32.84090805053711, - "learning_rate": 1.4483311616718454e-05, - "loss": 1.7946, + "epoch": 0.35, + "grad_norm": 21.73208999633789, + "learning_rate": 1.7697360163996155e-05, + "loss": 2.8917, "step": 2753 }, { - "epoch": 0.83, - "grad_norm": 18.42530632019043, - "learning_rate": 1.4481307006114062e-05, - "loss": 2.4003, + "epoch": 0.35, + "grad_norm": 17.519514083862305, + "learning_rate": 1.7696523448939465e-05, + "loss": 2.5031, "step": 2754 }, { - "epoch": 0.83, - "grad_norm": 10.56641960144043, - "learning_rate": 1.4479302395509674e-05, - "loss": 1.698, + "epoch": 0.35, + "grad_norm": 13.164900779724121, + "learning_rate": 1.769568673388278e-05, + "loss": 1.3442, "step": 2755 }, { - "epoch": 0.83, - "grad_norm": 13.460083961486816, - "learning_rate": 1.4477297784905284e-05, - "loss": 2.7324, + "epoch": 0.35, + "grad_norm": 18.803659439086914, + "learning_rate": 1.7694850018826092e-05, + "loss": 1.4444, "step": 2756 }, { - "epoch": 0.83, - "grad_norm": 20.335878372192383, - "learning_rate": 1.4475293174300893e-05, - "loss": 1.5947, + "epoch": 0.35, + "grad_norm": 38.67995834350586, + "learning_rate": 1.7694013303769402e-05, + "loss": 0.2441, "step": 2757 }, { - "epoch": 0.83, - "grad_norm": 44.001834869384766, - "learning_rate": 1.4473288563696504e-05, - "loss": 1.8942, + "epoch": 0.35, + "grad_norm": 13.010946273803711, + "learning_rate": 1.7693176588712716e-05, + "loss": 1.1844, "step": 2758 }, { - "epoch": 0.83, - "grad_norm": 13.544523239135742, - "learning_rate": 1.4471283953092113e-05, - "loss": 2.016, + "epoch": 0.35, + "grad_norm": 17.245418548583984, + "learning_rate": 1.769233987365603e-05, + "loss": 2.793, "step": 2759 }, { - "epoch": 0.83, - "grad_norm": 30.979524612426758, - "learning_rate": 1.4469279342487723e-05, - "loss": 3.1479, - "step": 2760 - }, - { - "epoch": 0.83, - "eval_loss": 0.2657427191734314, - "eval_runtime": 43.5175, - "eval_samples_per_second": 33.986, - "eval_steps_per_second": 33.986, + "epoch": 0.35, + "grad_norm": 8.228273391723633, + "learning_rate": 1.769150315859934e-05, + "loss": 1.1293, "step": 2760 }, { - "epoch": 0.83, - "grad_norm": 23.16549301147461, - "learning_rate": 1.4467274731883331e-05, - "loss": 2.2599, + "epoch": 0.35, + "grad_norm": 15.159379959106445, + "learning_rate": 1.7690666443542654e-05, + "loss": 2.7442, "step": 2761 }, { - "epoch": 0.83, - "grad_norm": 18.973628997802734, - "learning_rate": 1.4465270121278943e-05, - "loss": 2.1258, + "epoch": 0.35, + "grad_norm": 7.7474894523620605, + "learning_rate": 1.7689829728485964e-05, + "loss": 0.6898, "step": 2762 }, { - "epoch": 0.83, - "grad_norm": 26.273101806640625, - "learning_rate": 1.4463265510674553e-05, - "loss": 3.4268, + "epoch": 0.35, + "grad_norm": 22.496170043945312, + "learning_rate": 1.7688993013429277e-05, + "loss": 2.9833, "step": 2763 }, { - "epoch": 0.83, - "grad_norm": 14.230957984924316, - "learning_rate": 1.4461260900070161e-05, - "loss": 1.4143, + "epoch": 0.35, + "grad_norm": 15.267127990722656, + "learning_rate": 1.768815629837259e-05, + "loss": 3.6275, "step": 2764 }, { - "epoch": 0.83, - "grad_norm": 14.468798637390137, - "learning_rate": 1.4459256289465773e-05, - "loss": 2.3477, + "epoch": 0.35, + "grad_norm": 10.416468620300293, + "learning_rate": 1.76873195833159e-05, + "loss": 2.7587, "step": 2765 }, { - "epoch": 0.83, - "grad_norm": 16.977825164794922, - "learning_rate": 1.4457251678861382e-05, - "loss": 2.3843, + "epoch": 0.35, + "grad_norm": 12.335618019104004, + "learning_rate": 1.7686482868259215e-05, + "loss": 2.8012, "step": 2766 }, { - "epoch": 0.83, - "grad_norm": 14.977238655090332, - "learning_rate": 1.4455247068256992e-05, - "loss": 2.0197, + "epoch": 0.35, + "grad_norm": 9.918840408325195, + "learning_rate": 1.768564615320253e-05, + "loss": 1.3928, "step": 2767 }, { - "epoch": 0.83, - "grad_norm": 14.926851272583008, - "learning_rate": 1.4453242457652603e-05, - "loss": 2.9558, + "epoch": 0.35, + "grad_norm": 36.22608184814453, + "learning_rate": 1.768480943814584e-05, + "loss": 2.8527, "step": 2768 }, { - "epoch": 0.83, - "grad_norm": 23.74448013305664, - "learning_rate": 1.4451237847048212e-05, - "loss": 2.1472, + "epoch": 0.35, + "grad_norm": 37.150150299072266, + "learning_rate": 1.7683972723089153e-05, + "loss": 2.1441, "step": 2769 }, { - "epoch": 0.83, - "grad_norm": 41.35685729980469, - "learning_rate": 1.4449233236443822e-05, - "loss": 2.7331, + "epoch": 0.35, + "grad_norm": 6.690373420715332, + "learning_rate": 1.7683136008032466e-05, + "loss": 1.4193, "step": 2770 }, { - "epoch": 0.83, - "grad_norm": 13.470633506774902, - "learning_rate": 1.4447228625839432e-05, - "loss": 1.9928, + "epoch": 0.35, + "grad_norm": 13.47512149810791, + "learning_rate": 1.7682299292975777e-05, + "loss": 2.6323, "step": 2771 }, { - "epoch": 0.83, - "grad_norm": 12.274412155151367, - "learning_rate": 1.4445224015235042e-05, - "loss": 1.7747, + "epoch": 0.35, + "grad_norm": 18.517271041870117, + "learning_rate": 1.768146257791909e-05, + "loss": 2.4489, "step": 2772 }, { - "epoch": 0.83, - "grad_norm": 12.649324417114258, - "learning_rate": 1.444321940463065e-05, - "loss": 1.599, + "epoch": 0.35, + "grad_norm": 13.417510032653809, + "learning_rate": 1.7680625862862404e-05, + "loss": 2.7828, "step": 2773 }, { - "epoch": 0.83, - "grad_norm": 13.914393424987793, - "learning_rate": 1.4441214794026262e-05, - "loss": 1.7285, + "epoch": 0.35, + "grad_norm": 19.947721481323242, + "learning_rate": 1.7679789147805717e-05, + "loss": 3.3091, "step": 2774 }, { - "epoch": 0.83, - "grad_norm": 9.780823707580566, - "learning_rate": 1.4439210183421872e-05, - "loss": 1.3613, + "epoch": 0.35, + "grad_norm": 8.461196899414062, + "learning_rate": 1.7678952432749028e-05, + "loss": 3.1268, "step": 2775 }, { - "epoch": 0.83, - "grad_norm": 13.444343566894531, - "learning_rate": 1.443720557281748e-05, - "loss": 1.4692, + "epoch": 0.35, + "grad_norm": 16.18375015258789, + "learning_rate": 1.767811571769234e-05, + "loss": 1.1768, "step": 2776 }, { - "epoch": 0.83, - "grad_norm": 11.597440719604492, - "learning_rate": 1.4435200962213093e-05, - "loss": 2.5432, + "epoch": 0.35, + "grad_norm": 20.837011337280273, + "learning_rate": 1.7677279002635655e-05, + "loss": 1.2293, "step": 2777 }, { - "epoch": 0.84, - "grad_norm": 66.30996704101562, - "learning_rate": 1.4433196351608701e-05, - "loss": 1.9169, + "epoch": 0.35, + "grad_norm": 16.653564453125, + "learning_rate": 1.7676442287578965e-05, + "loss": 1.8402, "step": 2778 }, { - "epoch": 0.84, - "grad_norm": 42.77153778076172, - "learning_rate": 1.4431191741004311e-05, - "loss": 2.1056, + "epoch": 0.35, + "grad_norm": 7.685312271118164, + "learning_rate": 1.767560557252228e-05, + "loss": 2.022, "step": 2779 }, { - "epoch": 0.84, - "grad_norm": 17.99850082397461, - "learning_rate": 1.4429187130399923e-05, - "loss": 1.6885, + "epoch": 0.35, + "grad_norm": 23.538360595703125, + "learning_rate": 1.7674768857465593e-05, + "loss": 1.1104, "step": 2780 }, { - "epoch": 0.84, - "grad_norm": 11.443147659301758, - "learning_rate": 1.4427182519795531e-05, - "loss": 2.2285, + "epoch": 0.35, + "grad_norm": 15.101953506469727, + "learning_rate": 1.7673932142408906e-05, + "loss": 3.7181, "step": 2781 }, { - "epoch": 0.84, - "grad_norm": 18.145082473754883, - "learning_rate": 1.4425177909191141e-05, - "loss": 2.14, + "epoch": 0.35, + "grad_norm": 20.23198890686035, + "learning_rate": 1.7673095427352216e-05, + "loss": 3.4027, "step": 2782 }, { - "epoch": 0.84, - "grad_norm": 12.900897026062012, - "learning_rate": 1.442317329858675e-05, - "loss": 1.8419, + "epoch": 0.35, + "grad_norm": 13.295400619506836, + "learning_rate": 1.767225871229553e-05, + "loss": 3.3314, "step": 2783 }, { - "epoch": 0.84, - "grad_norm": 12.71401596069336, - "learning_rate": 1.4421168687982361e-05, - "loss": 1.6758, + "epoch": 0.35, + "grad_norm": 16.027250289916992, + "learning_rate": 1.7671421997238844e-05, + "loss": 1.3427, "step": 2784 }, { - "epoch": 0.84, - "grad_norm": 10.644526481628418, - "learning_rate": 1.441916407737797e-05, - "loss": 1.5368, + "epoch": 0.35, + "grad_norm": 12.23305892944336, + "learning_rate": 1.7670585282182154e-05, + "loss": 1.5726, "step": 2785 }, { - "epoch": 0.84, - "grad_norm": 11.878511428833008, - "learning_rate": 1.441715946677358e-05, - "loss": 2.8906, + "epoch": 0.35, + "grad_norm": 11.057424545288086, + "learning_rate": 1.7669748567125468e-05, + "loss": 2.4224, "step": 2786 }, { - "epoch": 0.84, - "grad_norm": 23.1770076751709, - "learning_rate": 1.4415154856169192e-05, - "loss": 2.9468, + "epoch": 0.35, + "grad_norm": 18.01150894165039, + "learning_rate": 1.766891185206878e-05, + "loss": 3.5597, "step": 2787 }, { - "epoch": 0.84, - "grad_norm": 13.855196952819824, - "learning_rate": 1.44131502455648e-05, - "loss": 2.0071, + "epoch": 0.35, + "grad_norm": 19.285484313964844, + "learning_rate": 1.766807513701209e-05, + "loss": 1.7419, "step": 2788 }, { - "epoch": 0.84, - "grad_norm": 10.715313911437988, - "learning_rate": 1.441114563496041e-05, - "loss": 1.3851, + "epoch": 0.35, + "grad_norm": 14.585424423217773, + "learning_rate": 1.7667238421955405e-05, + "loss": 3.3739, "step": 2789 }, { - "epoch": 0.84, - "grad_norm": 11.7968168258667, - "learning_rate": 1.4409141024356019e-05, - "loss": 1.7609, + "epoch": 0.35, + "grad_norm": 9.299799919128418, + "learning_rate": 1.7666401706898716e-05, + "loss": 1.6106, "step": 2790 }, { - "epoch": 0.84, - "grad_norm": 15.877503395080566, - "learning_rate": 1.440713641375163e-05, - "loss": 3.124, + "epoch": 0.35, + "grad_norm": 16.445552825927734, + "learning_rate": 1.766556499184203e-05, + "loss": 1.2977, "step": 2791 }, { - "epoch": 0.84, - "grad_norm": 19.398963928222656, - "learning_rate": 1.440513180314724e-05, - "loss": 2.2776, + "epoch": 0.35, + "grad_norm": 10.708333969116211, + "learning_rate": 1.7664728276785343e-05, + "loss": 3.0598, "step": 2792 }, { - "epoch": 0.84, - "grad_norm": 13.756909370422363, - "learning_rate": 1.4403127192542849e-05, - "loss": 2.429, + "epoch": 0.35, + "grad_norm": 15.353691101074219, + "learning_rate": 1.7663891561728653e-05, + "loss": 3.7235, "step": 2793 }, { - "epoch": 0.84, - "grad_norm": 8.20438289642334, - "learning_rate": 1.440112258193846e-05, - "loss": 1.3021, + "epoch": 0.35, + "grad_norm": 22.779550552368164, + "learning_rate": 1.7663054846671967e-05, + "loss": 2.2107, "step": 2794 }, { - "epoch": 0.84, - "grad_norm": 35.246524810791016, - "learning_rate": 1.4399117971334069e-05, - "loss": 3.4113, + "epoch": 0.35, + "grad_norm": 5.81117057800293, + "learning_rate": 1.766221813161528e-05, + "loss": 0.5669, "step": 2795 }, { - "epoch": 0.84, - "grad_norm": 15.750861167907715, - "learning_rate": 1.4397113360729679e-05, - "loss": 2.1455, + "epoch": 0.35, + "grad_norm": 4.142109394073486, + "learning_rate": 1.766138141655859e-05, + "loss": 0.2277, "step": 2796 }, { - "epoch": 0.84, - "grad_norm": 12.513425827026367, - "learning_rate": 1.4395108750125289e-05, - "loss": 2.1875, + "epoch": 0.35, + "grad_norm": 15.641167640686035, + "learning_rate": 1.7660544701501904e-05, + "loss": 2.264, "step": 2797 }, { - "epoch": 0.84, - "grad_norm": 20.23858070373535, - "learning_rate": 1.43931041395209e-05, - "loss": 2.686, + "epoch": 0.35, + "grad_norm": 12.36007022857666, + "learning_rate": 1.7659707986445218e-05, + "loss": 3.5559, "step": 2798 }, { - "epoch": 0.84, - "grad_norm": 14.214842796325684, - "learning_rate": 1.439109952891651e-05, - "loss": 1.5095, + "epoch": 0.35, + "grad_norm": 17.426403045654297, + "learning_rate": 1.7658871271388528e-05, + "loss": 3.0523, "step": 2799 }, { - "epoch": 0.84, - "grad_norm": 13.132567405700684, - "learning_rate": 1.438909491831212e-05, - "loss": 1.4686, + "epoch": 0.35, + "grad_norm": 13.43343448638916, + "learning_rate": 1.7658034556331842e-05, + "loss": 3.2828, "step": 2800 }, { - "epoch": 0.84, - "grad_norm": 12.941914558410645, - "learning_rate": 1.438709030770773e-05, - "loss": 1.6349, + "epoch": 0.35, + "eval_loss": 0.15001514554023743, + "eval_runtime": 94.9595, + "eval_samples_per_second": 37.3, + "eval_steps_per_second": 37.3, + "step": 2800 + }, + { + "epoch": 0.35, + "grad_norm": 20.58778953552246, + "learning_rate": 1.7657197841275155e-05, + "loss": 3.473, "step": 2801 }, { - "epoch": 0.84, - "grad_norm": 23.00050926208496, - "learning_rate": 1.4385085697103338e-05, - "loss": 2.3867, + "epoch": 0.35, + "grad_norm": 8.765314102172852, + "learning_rate": 1.765636112621847e-05, + "loss": 1.4409, "step": 2802 }, { - "epoch": 0.84, - "grad_norm": 27.743297576904297, - "learning_rate": 1.438308108649895e-05, - "loss": 2.8904, + "epoch": 0.35, + "grad_norm": 32.125858306884766, + "learning_rate": 1.765552441116178e-05, + "loss": 2.5083, "step": 2803 }, { - "epoch": 0.84, - "grad_norm": 28.265438079833984, - "learning_rate": 1.4381076475894558e-05, - "loss": 2.0912, + "epoch": 0.35, + "grad_norm": 13.41010570526123, + "learning_rate": 1.7654687696105093e-05, + "loss": 1.4009, "step": 2804 }, { - "epoch": 0.84, - "grad_norm": 42.51979064941406, - "learning_rate": 1.4379071865290168e-05, - "loss": 2.9312, + "epoch": 0.35, + "grad_norm": 10.478439331054688, + "learning_rate": 1.7653850981048407e-05, + "loss": 0.8087, "step": 2805 }, { - "epoch": 0.84, - "grad_norm": 13.225244522094727, - "learning_rate": 1.437706725468578e-05, - "loss": 2.3977, + "epoch": 0.35, + "grad_norm": 13.714133262634277, + "learning_rate": 1.7653014265991717e-05, + "loss": 1.5322, "step": 2806 }, { - "epoch": 0.84, - "grad_norm": 21.915138244628906, - "learning_rate": 1.4375062644081388e-05, - "loss": 2.3075, + "epoch": 0.35, + "grad_norm": 12.272222518920898, + "learning_rate": 1.765217755093503e-05, + "loss": 1.9538, "step": 2807 }, { - "epoch": 0.84, - "grad_norm": 27.076114654541016, - "learning_rate": 1.4373058033476998e-05, - "loss": 2.9827, + "epoch": 0.35, + "grad_norm": 29.58174705505371, + "learning_rate": 1.7651340835878344e-05, + "loss": 2.5173, "step": 2808 }, { - "epoch": 0.84, - "grad_norm": 29.377344131469727, - "learning_rate": 1.4371053422872607e-05, - "loss": 1.7643, + "epoch": 0.35, + "grad_norm": 16.548465728759766, + "learning_rate": 1.7650504120821658e-05, + "loss": 2.0753, "step": 2809 }, { - "epoch": 0.84, - "grad_norm": 16.541988372802734, - "learning_rate": 1.4369048812268219e-05, - "loss": 2.0169, + "epoch": 0.35, + "grad_norm": 12.644722938537598, + "learning_rate": 1.7649667405764968e-05, + "loss": 2.7874, "step": 2810 }, { - "epoch": 0.85, - "grad_norm": 11.847968101501465, - "learning_rate": 1.4367044201663829e-05, - "loss": 1.9688, + "epoch": 0.35, + "grad_norm": 7.718317985534668, + "learning_rate": 1.7648830690708282e-05, + "loss": 1.015, "step": 2811 }, { - "epoch": 0.85, - "grad_norm": 19.333833694458008, - "learning_rate": 1.4365039591059437e-05, - "loss": 1.6196, + "epoch": 0.35, + "grad_norm": 9.271620750427246, + "learning_rate": 1.7647993975651595e-05, + "loss": 1.8111, "step": 2812 }, { - "epoch": 0.85, - "grad_norm": 11.35824966430664, - "learning_rate": 1.4363034980455049e-05, - "loss": 2.2096, + "epoch": 0.35, + "grad_norm": 13.277092933654785, + "learning_rate": 1.7647157260594906e-05, + "loss": 1.4301, "step": 2813 }, { - "epoch": 0.85, - "grad_norm": 21.909666061401367, - "learning_rate": 1.4361030369850657e-05, - "loss": 2.0165, + "epoch": 0.35, + "grad_norm": 10.31788158416748, + "learning_rate": 1.764632054553822e-05, + "loss": 1.5806, "step": 2814 }, { - "epoch": 0.85, - "grad_norm": 12.537689208984375, - "learning_rate": 1.4359025759246267e-05, - "loss": 1.4654, + "epoch": 0.35, + "grad_norm": 13.863436698913574, + "learning_rate": 1.764548383048153e-05, + "loss": 2.6027, "step": 2815 }, { - "epoch": 0.85, - "grad_norm": 16.89116096496582, - "learning_rate": 1.4357021148641876e-05, - "loss": 2.2603, + "epoch": 0.35, + "grad_norm": 13.447190284729004, + "learning_rate": 1.7644647115424843e-05, + "loss": 3.0108, "step": 2816 }, { - "epoch": 0.85, - "grad_norm": 29.52833366394043, - "learning_rate": 1.4355016538037487e-05, - "loss": 2.1104, + "epoch": 0.35, + "grad_norm": 11.306679725646973, + "learning_rate": 1.7643810400368157e-05, + "loss": 1.6569, "step": 2817 }, { - "epoch": 0.85, - "grad_norm": 31.932817459106445, - "learning_rate": 1.4353011927433097e-05, - "loss": 3.2788, + "epoch": 0.35, + "grad_norm": 18.264440536499023, + "learning_rate": 1.7642973685311467e-05, + "loss": 4.5724, "step": 2818 }, { - "epoch": 0.85, - "grad_norm": 13.009839057922363, - "learning_rate": 1.4351007316828708e-05, - "loss": 1.8396, + "epoch": 0.35, + "grad_norm": 12.04774284362793, + "learning_rate": 1.764213697025478e-05, + "loss": 2.782, "step": 2819 }, { - "epoch": 0.85, - "grad_norm": 20.032115936279297, - "learning_rate": 1.4349002706224318e-05, - "loss": 2.5377, + "epoch": 0.35, + "grad_norm": 15.501517295837402, + "learning_rate": 1.764130025519809e-05, + "loss": 1.5684, "step": 2820 }, { - "epoch": 0.85, - "grad_norm": 30.521154403686523, - "learning_rate": 1.4346998095619926e-05, - "loss": 2.9662, + "epoch": 0.35, + "grad_norm": 9.706591606140137, + "learning_rate": 1.7640463540141405e-05, + "loss": 1.604, "step": 2821 }, { - "epoch": 0.85, - "grad_norm": 25.22045135498047, - "learning_rate": 1.4344993485015538e-05, - "loss": 2.4709, + "epoch": 0.35, + "grad_norm": 14.179362297058105, + "learning_rate": 1.763962682508472e-05, + "loss": 1.4216, "step": 2822 }, { - "epoch": 0.85, - "grad_norm": 36.31182861328125, - "learning_rate": 1.4342988874411148e-05, - "loss": 2.1305, + "epoch": 0.35, + "grad_norm": 7.879173755645752, + "learning_rate": 1.7638790110028032e-05, + "loss": 1.9733, "step": 2823 }, { - "epoch": 0.85, - "grad_norm": 10.13243293762207, - "learning_rate": 1.4340984263806756e-05, - "loss": 1.2709, + "epoch": 0.35, + "grad_norm": 17.222301483154297, + "learning_rate": 1.7637953394971342e-05, + "loss": 3.1487, "step": 2824 }, { - "epoch": 0.85, - "grad_norm": 13.83365535736084, - "learning_rate": 1.4338979653202368e-05, - "loss": 2.4132, + "epoch": 0.35, + "grad_norm": 12.826745986938477, + "learning_rate": 1.7637116679914656e-05, + "loss": 2.612, "step": 2825 }, { - "epoch": 0.85, - "grad_norm": 13.130005836486816, - "learning_rate": 1.4336975042597976e-05, - "loss": 1.4074, + "epoch": 0.35, + "grad_norm": 13.919160842895508, + "learning_rate": 1.763627996485797e-05, + "loss": 2.6432, "step": 2826 }, { - "epoch": 0.85, - "grad_norm": 20.482786178588867, - "learning_rate": 1.4334970431993587e-05, - "loss": 2.3476, + "epoch": 0.35, + "grad_norm": 11.945897102355957, + "learning_rate": 1.763544324980128e-05, + "loss": 1.0819, "step": 2827 }, { - "epoch": 0.85, - "grad_norm": 9.617236137390137, - "learning_rate": 1.4332965821389195e-05, - "loss": 1.595, + "epoch": 0.35, + "grad_norm": 18.103195190429688, + "learning_rate": 1.7634606534744594e-05, + "loss": 3.3143, "step": 2828 }, { - "epoch": 0.85, - "grad_norm": 11.414641380310059, - "learning_rate": 1.4330961210784807e-05, - "loss": 1.5817, + "epoch": 0.36, + "grad_norm": 7.475184917449951, + "learning_rate": 1.7633769819687907e-05, + "loss": 1.0552, "step": 2829 }, { - "epoch": 0.85, - "grad_norm": 16.799848556518555, - "learning_rate": 1.4328956600180417e-05, - "loss": 2.1956, + "epoch": 0.36, + "grad_norm": 14.897072792053223, + "learning_rate": 1.763293310463122e-05, + "loss": 2.4431, "step": 2830 }, { - "epoch": 0.85, - "grad_norm": 10.216323852539062, - "learning_rate": 1.4326951989576025e-05, - "loss": 1.4594, + "epoch": 0.36, + "grad_norm": 13.615031242370605, + "learning_rate": 1.763209638957453e-05, + "loss": 2.111, "step": 2831 }, { - "epoch": 0.85, - "grad_norm": 28.910261154174805, - "learning_rate": 1.4324947378971637e-05, - "loss": 2.556, + "epoch": 0.36, + "grad_norm": 11.770406723022461, + "learning_rate": 1.7631259674517845e-05, + "loss": 1.1007, "step": 2832 }, { - "epoch": 0.85, - "grad_norm": 26.716176986694336, - "learning_rate": 1.4322942768367245e-05, - "loss": 2.3864, + "epoch": 0.36, + "grad_norm": 27.718374252319336, + "learning_rate": 1.763042295946116e-05, + "loss": 2.6574, "step": 2833 }, { - "epoch": 0.85, - "grad_norm": 16.635345458984375, - "learning_rate": 1.4320938157762855e-05, - "loss": 2.3808, + "epoch": 0.36, + "grad_norm": 37.83617401123047, + "learning_rate": 1.762958624440447e-05, + "loss": 2.9518, "step": 2834 }, { - "epoch": 0.85, - "grad_norm": 21.195531845092773, - "learning_rate": 1.4318933547158467e-05, - "loss": 1.7784, + "epoch": 0.36, + "grad_norm": 6.720913887023926, + "learning_rate": 1.7628749529347782e-05, + "loss": 1.9323, "step": 2835 }, { - "epoch": 0.85, - "grad_norm": 16.338502883911133, - "learning_rate": 1.4316928936554076e-05, - "loss": 2.1817, + "epoch": 0.36, + "grad_norm": 13.271183013916016, + "learning_rate": 1.7627912814291096e-05, + "loss": 3.0903, "step": 2836 }, { - "epoch": 0.85, - "grad_norm": 10.698309898376465, - "learning_rate": 1.4314924325949686e-05, - "loss": 1.8357, + "epoch": 0.36, + "grad_norm": 14.988275527954102, + "learning_rate": 1.762707609923441e-05, + "loss": 1.5745, "step": 2837 }, { - "epoch": 0.85, - "grad_norm": 22.48438262939453, - "learning_rate": 1.4312919715345294e-05, - "loss": 1.9495, + "epoch": 0.36, + "grad_norm": 13.079087257385254, + "learning_rate": 1.762623938417772e-05, + "loss": 2.4665, "step": 2838 }, { - "epoch": 0.85, - "grad_norm": 72.1753921508789, - "learning_rate": 1.4310915104740906e-05, - "loss": 1.8956, + "epoch": 0.36, + "grad_norm": 15.878899574279785, + "learning_rate": 1.7625402669121033e-05, + "loss": 1.0119, "step": 2839 }, { - "epoch": 0.85, - "grad_norm": 14.804839134216309, - "learning_rate": 1.4308910494136514e-05, - "loss": 2.0498, + "epoch": 0.36, + "grad_norm": 12.98774528503418, + "learning_rate": 1.7624565954064347e-05, + "loss": 1.7453, "step": 2840 }, { - "epoch": 0.85, - "grad_norm": 46.27671432495117, - "learning_rate": 1.4306905883532124e-05, - "loss": 2.3295, + "epoch": 0.36, + "grad_norm": 37.145545959472656, + "learning_rate": 1.7623729239007657e-05, + "loss": 1.7648, "step": 2841 }, { - "epoch": 0.85, - "grad_norm": 20.91038703918457, - "learning_rate": 1.4304901272927736e-05, - "loss": 1.7985, + "epoch": 0.36, + "grad_norm": 17.651174545288086, + "learning_rate": 1.762289252395097e-05, + "loss": 2.2728, "step": 2842 }, { - "epoch": 0.85, - "grad_norm": 15.907576560974121, - "learning_rate": 1.4302896662323345e-05, - "loss": 1.945, + "epoch": 0.36, + "grad_norm": 91.55896759033203, + "learning_rate": 1.762205580889428e-05, + "loss": 1.4332, "step": 2843 }, { - "epoch": 0.86, - "grad_norm": 11.249744415283203, - "learning_rate": 1.4300892051718955e-05, - "loss": 2.0269, + "epoch": 0.36, + "grad_norm": 19.58599281311035, + "learning_rate": 1.7621219093837595e-05, + "loss": 2.1461, "step": 2844 }, { - "epoch": 0.86, - "grad_norm": 31.293607711791992, - "learning_rate": 1.4298887441114565e-05, - "loss": 2.0211, + "epoch": 0.36, + "grad_norm": 40.6429557800293, + "learning_rate": 1.762038237878091e-05, + "loss": 3.9308, "step": 2845 }, { - "epoch": 0.86, - "grad_norm": 15.472600936889648, - "learning_rate": 1.4296882830510175e-05, - "loss": 2.2907, + "epoch": 0.36, + "grad_norm": 27.435991287231445, + "learning_rate": 1.761954566372422e-05, + "loss": 3.3426, "step": 2846 }, { - "epoch": 0.86, - "grad_norm": 15.497085571289062, - "learning_rate": 1.4294878219905783e-05, - "loss": 2.4152, + "epoch": 0.36, + "grad_norm": 23.73316192626953, + "learning_rate": 1.7618708948667533e-05, + "loss": 3.6674, "step": 2847 }, { - "epoch": 0.86, - "grad_norm": 16.131122589111328, - "learning_rate": 1.4292873609301395e-05, - "loss": 1.7781, + "epoch": 0.36, + "grad_norm": 18.161706924438477, + "learning_rate": 1.7617872233610843e-05, + "loss": 2.6338, "step": 2848 }, { - "epoch": 0.86, - "grad_norm": 20.631099700927734, - "learning_rate": 1.4290868998697005e-05, - "loss": 2.6782, + "epoch": 0.36, + "grad_norm": 4.604570388793945, + "learning_rate": 1.7617035518554156e-05, + "loss": 1.7168, "step": 2849 }, { - "epoch": 0.86, - "grad_norm": 28.31133460998535, - "learning_rate": 1.4288864388092613e-05, - "loss": 2.7064, + "epoch": 0.36, + "grad_norm": 10.27620792388916, + "learning_rate": 1.761619880349747e-05, + "loss": 1.9358, "step": 2850 }, { - "epoch": 0.86, - "grad_norm": 22.287992477416992, - "learning_rate": 1.4286859777488225e-05, - "loss": 1.5085, + "epoch": 0.36, + "grad_norm": 9.236021041870117, + "learning_rate": 1.7615362088440784e-05, + "loss": 3.0787, "step": 2851 }, { - "epoch": 0.86, - "grad_norm": 14.099394798278809, - "learning_rate": 1.4284855166883834e-05, - "loss": 2.1791, + "epoch": 0.36, + "grad_norm": 12.75297737121582, + "learning_rate": 1.7614525373384094e-05, + "loss": 2.4493, "step": 2852 }, { - "epoch": 0.86, - "grad_norm": 23.74030876159668, - "learning_rate": 1.4282850556279444e-05, - "loss": 1.5725, + "epoch": 0.36, + "grad_norm": 8.181326866149902, + "learning_rate": 1.7613688658327408e-05, + "loss": 1.5055, "step": 2853 }, { - "epoch": 0.86, - "grad_norm": 18.97103500366211, - "learning_rate": 1.4280845945675055e-05, - "loss": 2.8069, + "epoch": 0.36, + "grad_norm": 20.349233627319336, + "learning_rate": 1.761285194327072e-05, + "loss": 2.5806, "step": 2854 }, { - "epoch": 0.86, - "grad_norm": 15.047412872314453, - "learning_rate": 1.4278841335070664e-05, - "loss": 2.3392, + "epoch": 0.36, + "grad_norm": 10.25844955444336, + "learning_rate": 1.761201522821403e-05, + "loss": 0.9415, "step": 2855 }, { - "epoch": 0.86, - "grad_norm": 11.085140228271484, - "learning_rate": 1.4276836724466274e-05, - "loss": 1.8397, + "epoch": 0.36, + "grad_norm": 14.34598445892334, + "learning_rate": 1.7611178513157345e-05, + "loss": 1.1714, "step": 2856 }, { - "epoch": 0.86, - "grad_norm": 25.895097732543945, - "learning_rate": 1.4274832113861882e-05, - "loss": 2.0397, + "epoch": 0.36, + "grad_norm": 26.95628547668457, + "learning_rate": 1.761034179810066e-05, + "loss": 3.4365, "step": 2857 }, { - "epoch": 0.86, - "grad_norm": 15.016706466674805, - "learning_rate": 1.4272827503257494e-05, - "loss": 2.7215, + "epoch": 0.36, + "grad_norm": 26.015779495239258, + "learning_rate": 1.7609505083043972e-05, + "loss": 1.4385, "step": 2858 }, { - "epoch": 0.86, - "grad_norm": 30.092390060424805, - "learning_rate": 1.4270822892653102e-05, - "loss": 2.8657, + "epoch": 0.36, + "grad_norm": 15.12834358215332, + "learning_rate": 1.7608668367987283e-05, + "loss": 3.0311, "step": 2859 }, { - "epoch": 0.86, - "grad_norm": 16.425628662109375, - "learning_rate": 1.4268818282048713e-05, - "loss": 1.7927, + "epoch": 0.36, + "grad_norm": 40.686676025390625, + "learning_rate": 1.7607831652930596e-05, + "loss": 1.7407, "step": 2860 }, { - "epoch": 0.86, - "grad_norm": 39.504478454589844, - "learning_rate": 1.4266813671444324e-05, - "loss": 2.4321, + "epoch": 0.36, + "grad_norm": 8.17596435546875, + "learning_rate": 1.760699493787391e-05, + "loss": 1.5953, "step": 2861 }, { - "epoch": 0.86, - "grad_norm": 15.988898277282715, - "learning_rate": 1.4264809060839933e-05, - "loss": 3.2789, + "epoch": 0.36, + "grad_norm": 12.739534378051758, + "learning_rate": 1.760615822281722e-05, + "loss": 2.7701, "step": 2862 }, { - "epoch": 0.86, - "grad_norm": 25.57854461669922, - "learning_rate": 1.4262804450235543e-05, - "loss": 2.2735, + "epoch": 0.36, + "grad_norm": 10.815932273864746, + "learning_rate": 1.7605321507760534e-05, + "loss": 1.4069, "step": 2863 }, { - "epoch": 0.86, - "grad_norm": 24.288047790527344, - "learning_rate": 1.4260799839631151e-05, - "loss": 2.4726, + "epoch": 0.36, + "grad_norm": 14.07378101348877, + "learning_rate": 1.7604484792703848e-05, + "loss": 1.7353, "step": 2864 }, { - "epoch": 0.86, - "grad_norm": 9.727755546569824, - "learning_rate": 1.4258795229026763e-05, - "loss": 1.2214, + "epoch": 0.36, + "grad_norm": 10.749878883361816, + "learning_rate": 1.760364807764716e-05, + "loss": 3.3234, "step": 2865 }, { - "epoch": 0.86, - "grad_norm": 11.615317344665527, - "learning_rate": 1.4256790618422373e-05, - "loss": 1.5758, + "epoch": 0.36, + "grad_norm": 15.514153480529785, + "learning_rate": 1.760281136259047e-05, + "loss": 2.0563, "step": 2866 }, { - "epoch": 0.86, - "grad_norm": 18.63646697998047, - "learning_rate": 1.4254786007817981e-05, - "loss": 3.0492, + "epoch": 0.36, + "grad_norm": 9.187424659729004, + "learning_rate": 1.7601974647533785e-05, + "loss": 1.5415, "step": 2867 }, { - "epoch": 0.86, - "grad_norm": 18.21438217163086, - "learning_rate": 1.4252781397213593e-05, - "loss": 2.2775, + "epoch": 0.36, + "grad_norm": 9.863618850708008, + "learning_rate": 1.7601137932477095e-05, + "loss": 1.2027, "step": 2868 }, { - "epoch": 0.86, - "grad_norm": 11.930097579956055, - "learning_rate": 1.4250776786609202e-05, - "loss": 1.8362, + "epoch": 0.36, + "grad_norm": 13.988532066345215, + "learning_rate": 1.760030121742041e-05, + "loss": 1.6365, "step": 2869 }, { - "epoch": 0.86, - "grad_norm": 18.075613021850586, - "learning_rate": 1.4248772176004812e-05, - "loss": 3.2771, + "epoch": 0.36, + "grad_norm": 10.842628479003906, + "learning_rate": 1.7599464502363723e-05, + "loss": 1.1997, "step": 2870 }, { - "epoch": 0.86, - "grad_norm": 17.12186050415039, - "learning_rate": 1.4246767565400422e-05, - "loss": 1.9388, + "epoch": 0.36, + "grad_norm": 12.993040084838867, + "learning_rate": 1.7598627787307033e-05, + "loss": 3.3648, "step": 2871 }, { - "epoch": 0.86, - "grad_norm": 20.949424743652344, - "learning_rate": 1.4244762954796032e-05, - "loss": 2.5683, + "epoch": 0.36, + "grad_norm": 15.58774185180664, + "learning_rate": 1.7597791072250347e-05, + "loss": 2.8855, "step": 2872 }, { - "epoch": 0.86, - "grad_norm": 36.75067901611328, - "learning_rate": 1.4242758344191644e-05, - "loss": 1.8602, + "epoch": 0.36, + "grad_norm": 7.670743942260742, + "learning_rate": 1.7596954357193657e-05, + "loss": 1.8101, "step": 2873 }, { - "epoch": 0.86, - "grad_norm": 12.314196586608887, - "learning_rate": 1.4240753733587252e-05, - "loss": 1.7533, + "epoch": 0.36, + "grad_norm": 18.823976516723633, + "learning_rate": 1.759611764213697e-05, + "loss": 1.8527, "step": 2874 }, { - "epoch": 0.86, - "grad_norm": 9.40181827545166, - "learning_rate": 1.4238749122982862e-05, - "loss": 1.0538, + "epoch": 0.36, + "grad_norm": 3.7253715991973877, + "learning_rate": 1.7595280927080284e-05, + "loss": 0.1347, "step": 2875 }, { - "epoch": 0.86, - "grad_norm": 18.72924041748047, - "learning_rate": 1.423674451237847e-05, - "loss": 1.741, + "epoch": 0.36, + "grad_norm": 10.255762100219727, + "learning_rate": 1.7594444212023594e-05, + "loss": 1.8739, "step": 2876 }, { - "epoch": 0.87, - "grad_norm": 86.12344360351562, - "learning_rate": 1.4234739901774082e-05, - "loss": 1.8543, + "epoch": 0.36, + "grad_norm": 6.958003520965576, + "learning_rate": 1.7593607496966908e-05, + "loss": 1.1171, "step": 2877 }, { - "epoch": 0.87, - "grad_norm": 11.64871883392334, - "learning_rate": 1.4232735291169692e-05, - "loss": 1.8377, + "epoch": 0.36, + "grad_norm": 3.8992414474487305, + "learning_rate": 1.7592770781910222e-05, + "loss": 0.1972, "step": 2878 }, { - "epoch": 0.87, - "grad_norm": 20.858604431152344, - "learning_rate": 1.42307306805653e-05, - "loss": 2.407, + "epoch": 0.36, + "grad_norm": 8.176952362060547, + "learning_rate": 1.7591934066853535e-05, + "loss": 1.4856, "step": 2879 }, { - "epoch": 0.87, - "grad_norm": 16.45133399963379, - "learning_rate": 1.4228726069960913e-05, - "loss": 2.1642, - "step": 2880 - }, - { - "epoch": 0.87, - "eval_loss": 0.27140170335769653, - "eval_runtime": 43.4249, - "eval_samples_per_second": 34.059, - "eval_steps_per_second": 34.059, + "epoch": 0.36, + "grad_norm": 19.96003532409668, + "learning_rate": 1.7591097351796846e-05, + "loss": 2.7468, "step": 2880 }, { - "epoch": 0.87, - "grad_norm": 16.983951568603516, - "learning_rate": 1.4226721459356521e-05, - "loss": 2.1911, + "epoch": 0.36, + "grad_norm": 15.29173755645752, + "learning_rate": 1.759026063674016e-05, + "loss": 2.4146, "step": 2881 }, { - "epoch": 0.87, - "grad_norm": 12.091941833496094, - "learning_rate": 1.4224716848752131e-05, - "loss": 1.6567, + "epoch": 0.36, + "grad_norm": 7.0636305809021, + "learning_rate": 1.7589423921683473e-05, + "loss": 0.9918, "step": 2882 }, { - "epoch": 0.87, - "grad_norm": 22.655092239379883, - "learning_rate": 1.422271223814774e-05, - "loss": 1.7178, + "epoch": 0.36, + "grad_norm": 46.704769134521484, + "learning_rate": 1.7588587206626783e-05, + "loss": 2.6682, "step": 2883 }, { - "epoch": 0.87, - "grad_norm": 13.33409595489502, - "learning_rate": 1.4220707627543351e-05, - "loss": 1.42, + "epoch": 0.36, + "grad_norm": 12.229507446289062, + "learning_rate": 1.7587750491570097e-05, + "loss": 3.7473, "step": 2884 }, { - "epoch": 0.87, - "grad_norm": 41.79966735839844, - "learning_rate": 1.4218703016938961e-05, - "loss": 2.5635, + "epoch": 0.36, + "grad_norm": 29.765493392944336, + "learning_rate": 1.758691377651341e-05, + "loss": 2.2255, "step": 2885 }, { - "epoch": 0.87, - "grad_norm": 9.73848819732666, - "learning_rate": 1.421669840633457e-05, - "loss": 1.5657, + "epoch": 0.36, + "grad_norm": 14.75451374053955, + "learning_rate": 1.7586077061456724e-05, + "loss": 2.9142, "step": 2886 }, { - "epoch": 0.87, - "grad_norm": 13.572857856750488, - "learning_rate": 1.4214693795730181e-05, - "loss": 1.8109, + "epoch": 0.36, + "grad_norm": 10.729453086853027, + "learning_rate": 1.7585240346400034e-05, + "loss": 2.4328, "step": 2887 }, { - "epoch": 0.87, - "grad_norm": 19.471956253051758, - "learning_rate": 1.421268918512579e-05, - "loss": 1.7335, + "epoch": 0.36, + "grad_norm": 8.80831527709961, + "learning_rate": 1.7584403631343348e-05, + "loss": 2.3335, "step": 2888 }, { - "epoch": 0.87, - "grad_norm": 16.479246139526367, - "learning_rate": 1.42106845745214e-05, - "loss": 2.1827, + "epoch": 0.36, + "grad_norm": 15.32617473602295, + "learning_rate": 1.7583566916286662e-05, + "loss": 2.5147, "step": 2889 }, { - "epoch": 0.87, - "grad_norm": 42.60647964477539, - "learning_rate": 1.420867996391701e-05, - "loss": 4.2229, + "epoch": 0.36, + "grad_norm": 16.763200759887695, + "learning_rate": 1.7582730201229972e-05, + "loss": 2.105, "step": 2890 }, { - "epoch": 0.87, - "grad_norm": 29.324525833129883, - "learning_rate": 1.420667535331262e-05, - "loss": 2.2616, + "epoch": 0.36, + "grad_norm": 9.007331848144531, + "learning_rate": 1.7581893486173286e-05, + "loss": 1.1816, "step": 2891 }, { - "epoch": 0.87, - "grad_norm": 17.365251541137695, - "learning_rate": 1.420467074270823e-05, - "loss": 1.9783, + "epoch": 0.36, + "grad_norm": 11.328272819519043, + "learning_rate": 1.75810567711166e-05, + "loss": 2.079, "step": 2892 }, { - "epoch": 0.87, - "grad_norm": 14.282939910888672, - "learning_rate": 1.420266613210384e-05, - "loss": 1.9463, + "epoch": 0.36, + "grad_norm": 8.322172164916992, + "learning_rate": 1.758022005605991e-05, + "loss": 1.808, "step": 2893 }, { - "epoch": 0.87, - "grad_norm": 18.807802200317383, - "learning_rate": 1.420066152149945e-05, - "loss": 1.9243, + "epoch": 0.36, + "grad_norm": 22.682048797607422, + "learning_rate": 1.7579383341003223e-05, + "loss": 2.3697, "step": 2894 }, { - "epoch": 0.87, - "grad_norm": 12.65291976928711, - "learning_rate": 1.4198656910895059e-05, - "loss": 1.5824, + "epoch": 0.36, + "grad_norm": 12.664169311523438, + "learning_rate": 1.7578546625946537e-05, + "loss": 2.7809, "step": 2895 }, { - "epoch": 0.87, - "grad_norm": 24.225215911865234, - "learning_rate": 1.419665230029067e-05, - "loss": 1.4894, + "epoch": 0.36, + "grad_norm": 23.70795440673828, + "learning_rate": 1.7577709910889847e-05, + "loss": 2.8973, "step": 2896 }, { - "epoch": 0.87, - "grad_norm": 19.319774627685547, - "learning_rate": 1.419464768968628e-05, - "loss": 2.8796, + "epoch": 0.36, + "grad_norm": 12.631494522094727, + "learning_rate": 1.757687319583316e-05, + "loss": 1.4789, "step": 2897 }, { - "epoch": 0.87, - "grad_norm": 14.177779197692871, - "learning_rate": 1.4192643079081889e-05, - "loss": 1.7554, + "epoch": 0.36, + "grad_norm": 17.631160736083984, + "learning_rate": 1.7576036480776474e-05, + "loss": 2.8649, "step": 2898 }, { - "epoch": 0.87, - "grad_norm": 17.083988189697266, - "learning_rate": 1.41906384684775e-05, - "loss": 2.0681, + "epoch": 0.36, + "grad_norm": 11.38050365447998, + "learning_rate": 1.7575199765719785e-05, + "loss": 2.7911, "step": 2899 }, { - "epoch": 0.87, - "grad_norm": 40.30973434448242, - "learning_rate": 1.418863385787311e-05, - "loss": 2.4183, + "epoch": 0.36, + "grad_norm": 10.819679260253906, + "learning_rate": 1.7574363050663098e-05, + "loss": 3.1221, "step": 2900 }, { - "epoch": 0.87, - "grad_norm": 9.512898445129395, - "learning_rate": 1.418662924726872e-05, - "loss": 1.5707, + "epoch": 0.36, + "grad_norm": 9.505936622619629, + "learning_rate": 1.757352633560641e-05, + "loss": 1.217, "step": 2901 }, { - "epoch": 0.87, - "grad_norm": 52.70304870605469, - "learning_rate": 1.4184624636664328e-05, - "loss": 2.9803, + "epoch": 0.36, + "grad_norm": 11.83896255493164, + "learning_rate": 1.7572689620549722e-05, + "loss": 0.502, "step": 2902 }, { - "epoch": 0.87, - "grad_norm": 16.734039306640625, - "learning_rate": 1.418262002605994e-05, - "loss": 2.037, + "epoch": 0.36, + "grad_norm": 7.863925933837891, + "learning_rate": 1.7571852905493036e-05, + "loss": 1.6534, "step": 2903 }, { - "epoch": 0.87, - "grad_norm": 19.740087509155273, - "learning_rate": 1.418061541545555e-05, - "loss": 1.8485, + "epoch": 0.36, + "grad_norm": 10.372111320495605, + "learning_rate": 1.7571016190436346e-05, + "loss": 1.9913, "step": 2904 }, { - "epoch": 0.87, - "grad_norm": 14.386117935180664, - "learning_rate": 1.4178610804851158e-05, - "loss": 1.0815, + "epoch": 0.36, + "grad_norm": 27.196186065673828, + "learning_rate": 1.757017947537966e-05, + "loss": 3.9937, "step": 2905 }, { - "epoch": 0.87, - "grad_norm": 16.817588806152344, - "learning_rate": 1.417660619424677e-05, - "loss": 1.6195, + "epoch": 0.36, + "grad_norm": 16.792673110961914, + "learning_rate": 1.7569342760322973e-05, + "loss": 2.6868, "step": 2906 }, { - "epoch": 0.87, - "grad_norm": 27.766712188720703, - "learning_rate": 1.4174601583642378e-05, - "loss": 1.9603, + "epoch": 0.36, + "grad_norm": 16.74106216430664, + "learning_rate": 1.7568506045266287e-05, + "loss": 2.2425, "step": 2907 }, { - "epoch": 0.87, - "grad_norm": 33.24577713012695, - "learning_rate": 1.4172596973037988e-05, - "loss": 1.6909, + "epoch": 0.36, + "grad_norm": 10.993690490722656, + "learning_rate": 1.7567669330209597e-05, + "loss": 1.5335, "step": 2908 }, { - "epoch": 0.87, - "grad_norm": 11.028932571411133, - "learning_rate": 1.41705923624336e-05, - "loss": 1.3208, + "epoch": 0.37, + "grad_norm": 10.971261978149414, + "learning_rate": 1.756683261515291e-05, + "loss": 3.1004, "step": 2909 }, { - "epoch": 0.87, - "grad_norm": 12.61074447631836, - "learning_rate": 1.4168587751829208e-05, - "loss": 1.9588, + "epoch": 0.37, + "grad_norm": 20.94425392150879, + "learning_rate": 1.7565995900096225e-05, + "loss": 3.9462, "step": 2910 }, { - "epoch": 0.88, - "grad_norm": 11.678028106689453, - "learning_rate": 1.4166583141224818e-05, - "loss": 0.8482, + "epoch": 0.37, + "grad_norm": 17.663053512573242, + "learning_rate": 1.7565159185039535e-05, + "loss": 2.0604, "step": 2911 }, { - "epoch": 0.88, - "grad_norm": 12.001538276672363, - "learning_rate": 1.4164578530620427e-05, - "loss": 1.9327, + "epoch": 0.37, + "grad_norm": 35.784873962402344, + "learning_rate": 1.756432246998285e-05, + "loss": 3.1819, "step": 2912 }, { - "epoch": 0.88, - "grad_norm": 16.76682472229004, - "learning_rate": 1.4162573920016039e-05, - "loss": 2.0711, + "epoch": 0.37, + "grad_norm": 18.560909271240234, + "learning_rate": 1.7563485754926162e-05, + "loss": 1.7441, "step": 2913 }, { - "epoch": 0.88, - "grad_norm": 27.799612045288086, - "learning_rate": 1.4160569309411647e-05, - "loss": 2.1891, + "epoch": 0.37, + "grad_norm": 16.16483497619629, + "learning_rate": 1.7562649039869476e-05, + "loss": 2.3441, "step": 2914 }, { - "epoch": 0.88, - "grad_norm": 24.618627548217773, - "learning_rate": 1.4158564698807257e-05, - "loss": 1.8592, + "epoch": 0.37, + "grad_norm": 19.676025390625, + "learning_rate": 1.7561812324812786e-05, + "loss": 3.5454, "step": 2915 }, { - "epoch": 0.88, - "grad_norm": 24.141916275024414, - "learning_rate": 1.4156560088202869e-05, - "loss": 2.2455, + "epoch": 0.37, + "grad_norm": 14.388813018798828, + "learning_rate": 1.75609756097561e-05, + "loss": 1.9982, "step": 2916 }, { - "epoch": 0.88, - "grad_norm": 10.506229400634766, - "learning_rate": 1.4154555477598477e-05, - "loss": 1.8823, + "epoch": 0.37, + "grad_norm": 14.46896743774414, + "learning_rate": 1.7560138894699413e-05, + "loss": 2.5543, "step": 2917 }, { - "epoch": 0.88, - "grad_norm": 14.979269981384277, - "learning_rate": 1.4152550866994087e-05, - "loss": 2.0007, + "epoch": 0.37, + "grad_norm": 13.310667991638184, + "learning_rate": 1.7559302179642724e-05, + "loss": 3.2821, "step": 2918 }, { - "epoch": 0.88, - "grad_norm": 34.168128967285156, - "learning_rate": 1.4150546256389697e-05, - "loss": 2.2599, + "epoch": 0.37, + "grad_norm": 14.989908218383789, + "learning_rate": 1.7558465464586037e-05, + "loss": 1.7186, "step": 2919 }, { - "epoch": 0.88, - "grad_norm": 17.805347442626953, - "learning_rate": 1.4148541645785307e-05, - "loss": 2.8172, + "epoch": 0.37, + "grad_norm": 7.282630920410156, + "learning_rate": 1.755762874952935e-05, + "loss": 1.1641, "step": 2920 }, { - "epoch": 0.88, - "grad_norm": 18.875598907470703, - "learning_rate": 1.4146537035180918e-05, - "loss": 1.7194, + "epoch": 0.37, + "grad_norm": 25.337600708007812, + "learning_rate": 1.755679203447266e-05, + "loss": 1.0596, "step": 2921 }, { - "epoch": 0.88, - "grad_norm": 44.858428955078125, - "learning_rate": 1.4144532424576528e-05, - "loss": 1.6975, + "epoch": 0.37, + "grad_norm": 12.670037269592285, + "learning_rate": 1.7555955319415975e-05, + "loss": 2.7756, "step": 2922 }, { - "epoch": 0.88, - "grad_norm": 10.500758171081543, - "learning_rate": 1.4142527813972138e-05, - "loss": 1.8214, + "epoch": 0.37, + "grad_norm": 12.989263534545898, + "learning_rate": 1.755511860435929e-05, + "loss": 1.7594, "step": 2923 }, { - "epoch": 0.88, - "grad_norm": 19.17152214050293, - "learning_rate": 1.4140523203367746e-05, - "loss": 1.9509, + "epoch": 0.37, + "grad_norm": 9.042985916137695, + "learning_rate": 1.75542818893026e-05, + "loss": 1.5087, "step": 2924 }, { - "epoch": 0.88, - "grad_norm": 19.903955459594727, - "learning_rate": 1.4138518592763358e-05, - "loss": 2.9421, + "epoch": 0.37, + "grad_norm": 14.791471481323242, + "learning_rate": 1.7553445174245912e-05, + "loss": 2.6902, "step": 2925 }, { - "epoch": 0.88, - "grad_norm": 17.388938903808594, - "learning_rate": 1.4136513982158966e-05, - "loss": 1.8245, + "epoch": 0.37, + "grad_norm": 10.423829078674316, + "learning_rate": 1.7552608459189223e-05, + "loss": 1.6936, "step": 2926 }, { - "epoch": 0.88, - "grad_norm": 15.076804161071777, - "learning_rate": 1.4134509371554576e-05, - "loss": 1.5359, + "epoch": 0.37, + "grad_norm": 11.191834449768066, + "learning_rate": 1.7551771744132536e-05, + "loss": 1.3695, "step": 2927 }, { - "epoch": 0.88, - "grad_norm": 11.603935241699219, - "learning_rate": 1.4132504760950188e-05, - "loss": 1.5563, + "epoch": 0.37, + "grad_norm": 11.632593154907227, + "learning_rate": 1.755093502907585e-05, + "loss": 0.9178, "step": 2928 }, { - "epoch": 0.88, - "grad_norm": 25.9620418548584, - "learning_rate": 1.4130500150345797e-05, - "loss": 3.2783, + "epoch": 0.37, + "grad_norm": 63.4360237121582, + "learning_rate": 1.755009831401916e-05, + "loss": 1.2587, "step": 2929 }, { - "epoch": 0.88, - "grad_norm": 16.795143127441406, - "learning_rate": 1.4128495539741407e-05, - "loss": 1.5015, + "epoch": 0.37, + "grad_norm": 10.623050689697266, + "learning_rate": 1.7549261598962474e-05, + "loss": 4.4241, "step": 2930 }, { - "epoch": 0.88, - "grad_norm": 11.010591506958008, - "learning_rate": 1.4126490929137015e-05, - "loss": 1.8603, + "epoch": 0.37, + "grad_norm": 28.13072395324707, + "learning_rate": 1.7548424883905788e-05, + "loss": 1.4253, "step": 2931 }, { - "epoch": 0.88, - "grad_norm": 12.306628227233887, - "learning_rate": 1.4124486318532627e-05, - "loss": 1.4446, + "epoch": 0.37, + "grad_norm": 15.974139213562012, + "learning_rate": 1.7547588168849098e-05, + "loss": 3.0009, "step": 2932 }, { - "epoch": 0.88, - "grad_norm": 14.734245300292969, - "learning_rate": 1.4122481707928235e-05, - "loss": 2.5827, + "epoch": 0.37, + "grad_norm": 10.374823570251465, + "learning_rate": 1.754675145379241e-05, + "loss": 2.2726, "step": 2933 }, { - "epoch": 0.88, - "grad_norm": 23.01751708984375, - "learning_rate": 1.4120477097323845e-05, - "loss": 2.9274, + "epoch": 0.37, + "grad_norm": 10.936897277832031, + "learning_rate": 1.7545914738735725e-05, + "loss": 1.3559, "step": 2934 }, { - "epoch": 0.88, - "grad_norm": 20.132675170898438, - "learning_rate": 1.4118472486719457e-05, - "loss": 2.0589, + "epoch": 0.37, + "grad_norm": 7.858895301818848, + "learning_rate": 1.754507802367904e-05, + "loss": 1.4125, "step": 2935 }, { - "epoch": 0.88, - "grad_norm": 17.275482177734375, - "learning_rate": 1.4116467876115065e-05, - "loss": 2.1773, + "epoch": 0.37, + "grad_norm": 11.782905578613281, + "learning_rate": 1.754424130862235e-05, + "loss": 2.6438, "step": 2936 }, { - "epoch": 0.88, - "grad_norm": 40.08796310424805, - "learning_rate": 1.4114463265510675e-05, - "loss": 2.3652, + "epoch": 0.37, + "grad_norm": 13.125471115112305, + "learning_rate": 1.7543404593565663e-05, + "loss": 2.3761, "step": 2937 }, { - "epoch": 0.88, - "grad_norm": 22.47896957397461, - "learning_rate": 1.4112458654906286e-05, - "loss": 2.069, + "epoch": 0.37, + "grad_norm": 7.501157760620117, + "learning_rate": 1.7542567878508976e-05, + "loss": 2.2839, "step": 2938 }, { - "epoch": 0.88, - "grad_norm": 21.368188858032227, - "learning_rate": 1.4110454044301896e-05, - "loss": 2.5254, + "epoch": 0.37, + "grad_norm": 25.19307518005371, + "learning_rate": 1.7541731163452287e-05, + "loss": 2.704, "step": 2939 }, { - "epoch": 0.88, - "grad_norm": 8.838847160339355, - "learning_rate": 1.4108449433697506e-05, - "loss": 1.4187, + "epoch": 0.37, + "grad_norm": 13.05322551727295, + "learning_rate": 1.75408944483956e-05, + "loss": 1.9, "step": 2940 }, { - "epoch": 0.88, - "grad_norm": 19.025794982910156, - "learning_rate": 1.4106444823093116e-05, - "loss": 1.7403, + "epoch": 0.37, + "grad_norm": 26.257415771484375, + "learning_rate": 1.7540057733338914e-05, + "loss": 2.7862, "step": 2941 }, { - "epoch": 0.88, - "grad_norm": 26.310054779052734, - "learning_rate": 1.4104440212488726e-05, - "loss": 2.0741, + "epoch": 0.37, + "grad_norm": 6.446777820587158, + "learning_rate": 1.7539221018282228e-05, + "loss": 0.5044, "step": 2942 }, { - "epoch": 0.88, - "grad_norm": 14.412237167358398, - "learning_rate": 1.4102435601884334e-05, - "loss": 1.6891, + "epoch": 0.37, + "grad_norm": 17.562593460083008, + "learning_rate": 1.7538384303225538e-05, + "loss": 3.0915, "step": 2943 }, { - "epoch": 0.89, - "grad_norm": 15.92914867401123, - "learning_rate": 1.4100430991279946e-05, - "loss": 2.542, + "epoch": 0.37, + "grad_norm": 14.979905128479004, + "learning_rate": 1.753754758816885e-05, + "loss": 3.0751, "step": 2944 }, { - "epoch": 0.89, - "grad_norm": 26.77935028076172, - "learning_rate": 1.4098426380675554e-05, - "loss": 3.2923, + "epoch": 0.37, + "grad_norm": 41.20693588256836, + "learning_rate": 1.7536710873112165e-05, + "loss": 1.6509, "step": 2945 }, { - "epoch": 0.89, - "grad_norm": 61.19276428222656, - "learning_rate": 1.4096421770071165e-05, - "loss": 2.9397, + "epoch": 0.37, + "grad_norm": 35.70596694946289, + "learning_rate": 1.7535874158055475e-05, + "loss": 2.5653, "step": 2946 }, { - "epoch": 0.89, - "grad_norm": 15.064374923706055, - "learning_rate": 1.4094417159466776e-05, - "loss": 1.757, + "epoch": 0.37, + "grad_norm": 34.14892578125, + "learning_rate": 1.753503744299879e-05, + "loss": 2.738, "step": 2947 }, { - "epoch": 0.89, - "grad_norm": 14.561102867126465, - "learning_rate": 1.4092412548862385e-05, - "loss": 1.7057, + "epoch": 0.37, + "grad_norm": 10.848490715026855, + "learning_rate": 1.7534200727942103e-05, + "loss": 1.7576, "step": 2948 }, { - "epoch": 0.89, - "grad_norm": 17.088661193847656, - "learning_rate": 1.4090407938257995e-05, - "loss": 2.2723, + "epoch": 0.37, + "grad_norm": 7.745413303375244, + "learning_rate": 1.7533364012885413e-05, + "loss": 0.7855, "step": 2949 }, { - "epoch": 0.89, - "grad_norm": 7.7882256507873535, - "learning_rate": 1.4088403327653603e-05, - "loss": 1.8397, + "epoch": 0.37, + "grad_norm": 10.996760368347168, + "learning_rate": 1.7532527297828727e-05, + "loss": 1.5187, "step": 2950 }, { - "epoch": 0.89, - "grad_norm": 11.358787536621094, - "learning_rate": 1.4086398717049215e-05, - "loss": 2.2978, + "epoch": 0.37, + "grad_norm": 15.464694023132324, + "learning_rate": 1.753169058277204e-05, + "loss": 2.7653, "step": 2951 }, { - "epoch": 0.89, - "grad_norm": 28.74677276611328, - "learning_rate": 1.4084394106444825e-05, - "loss": 2.7206, + "epoch": 0.37, + "grad_norm": 9.640546798706055, + "learning_rate": 1.753085386771535e-05, + "loss": 1.0689, "step": 2952 }, { - "epoch": 0.89, - "grad_norm": 16.747051239013672, - "learning_rate": 1.4082389495840433e-05, - "loss": 1.9734, + "epoch": 0.37, + "grad_norm": 28.65819549560547, + "learning_rate": 1.7530017152658664e-05, + "loss": 1.2512, "step": 2953 }, { - "epoch": 0.89, - "grad_norm": 12.262884140014648, - "learning_rate": 1.4080384885236045e-05, - "loss": 2.0553, + "epoch": 0.37, + "grad_norm": 14.472492218017578, + "learning_rate": 1.7529180437601974e-05, + "loss": 1.1207, "step": 2954 }, { - "epoch": 0.89, - "grad_norm": 20.407411575317383, - "learning_rate": 1.4078380274631654e-05, - "loss": 1.3774, + "epoch": 0.37, + "grad_norm": 10.543621063232422, + "learning_rate": 1.7528343722545288e-05, + "loss": 1.9395, "step": 2955 }, { - "epoch": 0.89, - "grad_norm": 25.87983512878418, - "learning_rate": 1.4076375664027264e-05, - "loss": 2.7685, + "epoch": 0.37, + "grad_norm": 33.20399475097656, + "learning_rate": 1.75275070074886e-05, + "loss": 2.0916, "step": 2956 }, { - "epoch": 0.89, - "grad_norm": 14.667790412902832, - "learning_rate": 1.4074371053422872e-05, - "loss": 2.1229, + "epoch": 0.37, + "grad_norm": 10.970297813415527, + "learning_rate": 1.7526670292431912e-05, + "loss": 1.5082, "step": 2957 }, { - "epoch": 0.89, - "grad_norm": 10.724139213562012, - "learning_rate": 1.4072366442818484e-05, - "loss": 1.6101, + "epoch": 0.37, + "grad_norm": 13.990462303161621, + "learning_rate": 1.7525833577375226e-05, + "loss": 2.0058, "step": 2958 }, { - "epoch": 0.89, - "grad_norm": 25.648866653442383, - "learning_rate": 1.4070361832214094e-05, - "loss": 2.4417, + "epoch": 0.37, + "grad_norm": 9.842002868652344, + "learning_rate": 1.752499686231854e-05, + "loss": 1.9603, "step": 2959 }, { - "epoch": 0.89, - "grad_norm": 24.464935302734375, - "learning_rate": 1.4068357221609702e-05, - "loss": 1.071, + "epoch": 0.37, + "grad_norm": 7.550311088562012, + "learning_rate": 1.752416014726185e-05, + "loss": 1.5242, "step": 2960 }, { - "epoch": 0.89, - "grad_norm": 22.294677734375, - "learning_rate": 1.4066352611005314e-05, - "loss": 2.0512, + "epoch": 0.37, + "grad_norm": 14.05145263671875, + "learning_rate": 1.7523323432205163e-05, + "loss": 2.3942, "step": 2961 }, { - "epoch": 0.89, - "grad_norm": 20.227296829223633, - "learning_rate": 1.4064348000400923e-05, - "loss": 1.4506, + "epoch": 0.37, + "grad_norm": 8.621101379394531, + "learning_rate": 1.7522486717148477e-05, + "loss": 1.4718, "step": 2962 }, { - "epoch": 0.89, - "grad_norm": 35.15935134887695, - "learning_rate": 1.4062343389796533e-05, - "loss": 2.224, + "epoch": 0.37, + "grad_norm": 8.392345428466797, + "learning_rate": 1.752165000209179e-05, + "loss": 0.8059, "step": 2963 }, { - "epoch": 0.89, - "grad_norm": 15.452341079711914, - "learning_rate": 1.4060338779192144e-05, - "loss": 3.0408, + "epoch": 0.37, + "grad_norm": 20.275230407714844, + "learning_rate": 1.75208132870351e-05, + "loss": 2.6383, "step": 2964 }, { - "epoch": 0.89, - "grad_norm": 13.116132736206055, - "learning_rate": 1.4058334168587753e-05, - "loss": 2.6705, + "epoch": 0.37, + "grad_norm": 26.309600830078125, + "learning_rate": 1.7519976571978414e-05, + "loss": 1.837, "step": 2965 }, { - "epoch": 0.89, - "grad_norm": 22.434358596801758, - "learning_rate": 1.4056329557983363e-05, - "loss": 2.6559, + "epoch": 0.37, + "grad_norm": 21.659997940063477, + "learning_rate": 1.7519139856921728e-05, + "loss": 3.1818, "step": 2966 }, { - "epoch": 0.89, - "grad_norm": 27.88434600830078, - "learning_rate": 1.4054324947378973e-05, - "loss": 2.2654, + "epoch": 0.37, + "grad_norm": 26.298851013183594, + "learning_rate": 1.7518303141865038e-05, + "loss": 3.1853, "step": 2967 }, { - "epoch": 0.89, - "grad_norm": 29.59733772277832, - "learning_rate": 1.4052320336774583e-05, - "loss": 1.7697, + "epoch": 0.37, + "grad_norm": 9.970256805419922, + "learning_rate": 1.7517466426808352e-05, + "loss": 2.1248, "step": 2968 }, { - "epoch": 0.89, - "grad_norm": 27.069503784179688, - "learning_rate": 1.4050315726170191e-05, - "loss": 2.1905, + "epoch": 0.37, + "grad_norm": 21.72904396057129, + "learning_rate": 1.7516629711751666e-05, + "loss": 1.4126, "step": 2969 }, { - "epoch": 0.89, - "grad_norm": 29.36873435974121, - "learning_rate": 1.4048311115565803e-05, - "loss": 2.3816, + "epoch": 0.37, + "grad_norm": 26.062219619750977, + "learning_rate": 1.751579299669498e-05, + "loss": 1.6812, "step": 2970 }, { - "epoch": 0.89, - "grad_norm": 10.980319023132324, - "learning_rate": 1.4046306504961413e-05, - "loss": 1.6791, + "epoch": 0.37, + "grad_norm": 15.527777671813965, + "learning_rate": 1.751495628163829e-05, + "loss": 1.2272, "step": 2971 }, { - "epoch": 0.89, - "grad_norm": 13.023073196411133, - "learning_rate": 1.4044301894357022e-05, - "loss": 1.9126, + "epoch": 0.37, + "grad_norm": 14.073158264160156, + "learning_rate": 1.7514119566581603e-05, + "loss": 1.8272, "step": 2972 }, { - "epoch": 0.89, - "grad_norm": 38.787940979003906, - "learning_rate": 1.4042297283752633e-05, - "loss": 2.6036, + "epoch": 0.37, + "grad_norm": 17.060989379882812, + "learning_rate": 1.7513282851524917e-05, + "loss": 1.0744, "step": 2973 }, { - "epoch": 0.89, - "grad_norm": 29.560956954956055, - "learning_rate": 1.4040292673148242e-05, - "loss": 2.8441, + "epoch": 0.37, + "grad_norm": 7.3651838302612305, + "learning_rate": 1.7512446136468227e-05, + "loss": 1.3256, "step": 2974 }, { - "epoch": 0.89, - "grad_norm": 8.834044456481934, - "learning_rate": 1.4038288062543852e-05, - "loss": 1.4787, + "epoch": 0.37, + "grad_norm": 13.686807632446289, + "learning_rate": 1.751160942141154e-05, + "loss": 3.2167, "step": 2975 }, { - "epoch": 0.89, - "grad_norm": 11.226600646972656, - "learning_rate": 1.403628345193946e-05, - "loss": 1.5387, + "epoch": 0.37, + "grad_norm": 14.580486297607422, + "learning_rate": 1.7510772706354854e-05, + "loss": 2.9224, "step": 2976 }, { - "epoch": 0.9, - "grad_norm": 16.720735549926758, - "learning_rate": 1.4034278841335072e-05, - "loss": 2.2926, + "epoch": 0.37, + "grad_norm": 15.495469093322754, + "learning_rate": 1.7509935991298165e-05, + "loss": 0.9868, "step": 2977 }, { - "epoch": 0.9, - "grad_norm": 20.5881290435791, - "learning_rate": 1.4032274230730682e-05, - "loss": 2.5574, + "epoch": 0.37, + "grad_norm": 11.540633201599121, + "learning_rate": 1.7509099276241478e-05, + "loss": 1.0614, "step": 2978 }, { - "epoch": 0.9, - "grad_norm": 31.438135147094727, - "learning_rate": 1.403026962012629e-05, - "loss": 1.8076, + "epoch": 0.37, + "grad_norm": 8.092855453491211, + "learning_rate": 1.750826256118479e-05, + "loss": 1.0501, "step": 2979 }, { - "epoch": 0.9, - "grad_norm": 12.068266868591309, - "learning_rate": 1.4028265009521902e-05, - "loss": 1.2816, + "epoch": 0.37, + "grad_norm": 12.961438179016113, + "learning_rate": 1.7507425846128102e-05, + "loss": 2.6531, "step": 2980 }, { - "epoch": 0.9, - "grad_norm": 25.184410095214844, - "learning_rate": 1.402626039891751e-05, - "loss": 2.4355, + "epoch": 0.37, + "grad_norm": 8.35956859588623, + "learning_rate": 1.7506589131071416e-05, + "loss": 2.8781, "step": 2981 }, { - "epoch": 0.9, - "grad_norm": 11.62043285369873, - "learning_rate": 1.402425578831312e-05, - "loss": 1.7278, + "epoch": 0.37, + "grad_norm": 16.814393997192383, + "learning_rate": 1.7505752416014726e-05, + "loss": 2.518, "step": 2982 }, { - "epoch": 0.9, - "grad_norm": 10.73161506652832, - "learning_rate": 1.4022251177708733e-05, - "loss": 1.5009, + "epoch": 0.37, + "grad_norm": 9.360370635986328, + "learning_rate": 1.750491570095804e-05, + "loss": 1.1125, "step": 2983 }, { - "epoch": 0.9, - "grad_norm": 18.191015243530273, - "learning_rate": 1.4020246567104341e-05, - "loss": 1.7434, + "epoch": 0.37, + "grad_norm": 13.405939102172852, + "learning_rate": 1.7504078985901353e-05, + "loss": 1.3022, "step": 2984 }, { - "epoch": 0.9, - "grad_norm": 35.12432861328125, - "learning_rate": 1.4018241956499951e-05, - "loss": 1.6909, + "epoch": 0.37, + "grad_norm": 28.36098861694336, + "learning_rate": 1.7503242270844664e-05, + "loss": 3.2326, "step": 2985 }, { - "epoch": 0.9, - "grad_norm": 11.526291847229004, - "learning_rate": 1.401623734589556e-05, - "loss": 1.4617, + "epoch": 0.37, + "grad_norm": 19.087400436401367, + "learning_rate": 1.7502405555787977e-05, + "loss": 2.0111, "step": 2986 }, { - "epoch": 0.9, - "grad_norm": 61.24599075317383, - "learning_rate": 1.4014232735291171e-05, - "loss": 1.9207, + "epoch": 0.37, + "grad_norm": 32.10877227783203, + "learning_rate": 1.750156884073129e-05, + "loss": 2.5922, "step": 2987 }, { - "epoch": 0.9, - "grad_norm": 21.56028175354004, - "learning_rate": 1.401222812468678e-05, - "loss": 2.4295, + "epoch": 0.37, + "grad_norm": 10.107169151306152, + "learning_rate": 1.75007321256746e-05, + "loss": 2.5872, "step": 2988 }, { - "epoch": 0.9, - "grad_norm": 15.971736907958984, - "learning_rate": 1.4010223514082391e-05, - "loss": 1.9247, + "epoch": 0.38, + "grad_norm": 18.61122703552246, + "learning_rate": 1.7499895410617915e-05, + "loss": 3.6177, "step": 2989 }, { - "epoch": 0.9, - "grad_norm": 18.692066192626953, - "learning_rate": 1.4008218903478001e-05, - "loss": 2.1351, + "epoch": 0.38, + "grad_norm": 7.76904296875, + "learning_rate": 1.749905869556123e-05, + "loss": 1.7388, "step": 2990 }, { - "epoch": 0.9, - "grad_norm": 19.446582794189453, - "learning_rate": 1.400621429287361e-05, - "loss": 2.2271, + "epoch": 0.38, + "grad_norm": 19.504474639892578, + "learning_rate": 1.7498221980504542e-05, + "loss": 3.4638, "step": 2991 }, { - "epoch": 0.9, - "grad_norm": 14.894234657287598, - "learning_rate": 1.4004209682269222e-05, - "loss": 2.0351, + "epoch": 0.38, + "grad_norm": 16.34959602355957, + "learning_rate": 1.7497385265447852e-05, + "loss": 2.5737, "step": 2992 }, { - "epoch": 0.9, - "grad_norm": 10.384711265563965, - "learning_rate": 1.400220507166483e-05, - "loss": 1.3777, + "epoch": 0.38, + "grad_norm": 6.932929515838623, + "learning_rate": 1.7496548550391166e-05, + "loss": 0.9677, "step": 2993 }, { - "epoch": 0.9, - "grad_norm": 15.35462760925293, - "learning_rate": 1.400020046106044e-05, - "loss": 1.4528, + "epoch": 0.38, + "grad_norm": 13.935084342956543, + "learning_rate": 1.749571183533448e-05, + "loss": 1.6441, "step": 2994 }, { - "epoch": 0.9, - "grad_norm": 9.53518009185791, - "learning_rate": 1.3998195850456052e-05, - "loss": 2.0396, + "epoch": 0.38, + "grad_norm": 10.431344985961914, + "learning_rate": 1.749487512027779e-05, + "loss": 2.1761, "step": 2995 }, { - "epoch": 0.9, - "grad_norm": 12.702194213867188, - "learning_rate": 1.399619123985166e-05, - "loss": 3.5201, + "epoch": 0.38, + "grad_norm": 5.776036739349365, + "learning_rate": 1.7494038405221104e-05, + "loss": 0.7499, "step": 2996 }, { - "epoch": 0.9, - "grad_norm": 16.689231872558594, - "learning_rate": 1.399418662924727e-05, - "loss": 2.5204, + "epoch": 0.38, + "grad_norm": 14.878742218017578, + "learning_rate": 1.7493201690164417e-05, + "loss": 2.2408, "step": 2997 }, { - "epoch": 0.9, - "grad_norm": 16.812484741210938, - "learning_rate": 1.3992182018642879e-05, - "loss": 1.823, + "epoch": 0.38, + "grad_norm": 9.810598373413086, + "learning_rate": 1.749236497510773e-05, + "loss": 1.5583, "step": 2998 }, { - "epoch": 0.9, - "grad_norm": 27.859556198120117, - "learning_rate": 1.399017740803849e-05, - "loss": 2.3145, + "epoch": 0.38, + "grad_norm": 16.407333374023438, + "learning_rate": 1.749152826005104e-05, + "loss": 1.1398, "step": 2999 }, { - "epoch": 0.9, - "grad_norm": 40.26885223388672, - "learning_rate": 1.3988172797434099e-05, - "loss": 1.981, - "step": 3000 - }, - { - "epoch": 0.9, - "eval_loss": 0.290047824382782, - "eval_runtime": 43.5045, - "eval_samples_per_second": 33.996, - "eval_steps_per_second": 33.996, + "epoch": 0.38, + "grad_norm": 10.976561546325684, + "learning_rate": 1.7490691544994355e-05, + "loss": 2.8406, "step": 3000 }, { - "epoch": 0.9, - "grad_norm": 16.576269149780273, - "learning_rate": 1.3986168186829709e-05, - "loss": 1.7576, + "epoch": 0.38, + "grad_norm": 11.879322052001953, + "learning_rate": 1.748985482993767e-05, + "loss": 1.451, "step": 3001 }, { - "epoch": 0.9, - "grad_norm": 11.136350631713867, - "learning_rate": 1.398416357622532e-05, - "loss": 1.1301, + "epoch": 0.38, + "grad_norm": 16.093420028686523, + "learning_rate": 1.748901811488098e-05, + "loss": 2.4345, "step": 3002 }, { - "epoch": 0.9, - "grad_norm": 7.8271684646606445, - "learning_rate": 1.398215896562093e-05, - "loss": 1.6797, + "epoch": 0.38, + "grad_norm": 15.955079078674316, + "learning_rate": 1.7488181399824292e-05, + "loss": 3.383, "step": 3003 }, { - "epoch": 0.9, - "grad_norm": 18.726409912109375, - "learning_rate": 1.398015435501654e-05, - "loss": 2.3383, + "epoch": 0.38, + "grad_norm": 14.334516525268555, + "learning_rate": 1.7487344684767603e-05, + "loss": 3.2137, "step": 3004 }, { - "epoch": 0.9, - "grad_norm": 13.888818740844727, - "learning_rate": 1.3978149744412148e-05, - "loss": 1.7, + "epoch": 0.38, + "grad_norm": 9.579103469848633, + "learning_rate": 1.7486507969710916e-05, + "loss": 3.0365, "step": 3005 }, { - "epoch": 0.9, - "grad_norm": 17.36934471130371, - "learning_rate": 1.397614513380776e-05, - "loss": 1.9836, + "epoch": 0.38, + "grad_norm": 8.867793083190918, + "learning_rate": 1.748567125465423e-05, + "loss": 1.9244, "step": 3006 }, { - "epoch": 0.9, - "grad_norm": 34.03765869140625, - "learning_rate": 1.3974140523203368e-05, - "loss": 2.0497, + "epoch": 0.38, + "grad_norm": 41.3364372253418, + "learning_rate": 1.748483453959754e-05, + "loss": 3.6235, "step": 3007 }, { - "epoch": 0.9, - "grad_norm": 24.520832061767578, - "learning_rate": 1.3972135912598978e-05, - "loss": 3.1885, + "epoch": 0.38, + "grad_norm": 53.81862258911133, + "learning_rate": 1.7483997824540854e-05, + "loss": 2.396, "step": 3008 }, { - "epoch": 0.9, - "grad_norm": 13.99000072479248, - "learning_rate": 1.397013130199459e-05, - "loss": 2.3549, + "epoch": 0.38, + "grad_norm": 9.474702835083008, + "learning_rate": 1.7483161109484167e-05, + "loss": 1.8655, "step": 3009 }, { - "epoch": 0.9, - "grad_norm": 14.597472190856934, - "learning_rate": 1.3968126691390198e-05, - "loss": 2.2443, + "epoch": 0.38, + "grad_norm": 17.841140747070312, + "learning_rate": 1.7482324394427478e-05, + "loss": 1.9738, "step": 3010 }, { - "epoch": 0.91, - "grad_norm": 10.168909072875977, - "learning_rate": 1.3966122080785808e-05, - "loss": 2.1224, + "epoch": 0.38, + "grad_norm": 11.464025497436523, + "learning_rate": 1.748148767937079e-05, + "loss": 2.1443, "step": 3011 }, { - "epoch": 0.91, - "grad_norm": 21.07402229309082, - "learning_rate": 1.3964117470181418e-05, - "loss": 2.2544, + "epoch": 0.38, + "grad_norm": 24.358856201171875, + "learning_rate": 1.7480650964314105e-05, + "loss": 1.866, "step": 3012 }, { - "epoch": 0.91, - "grad_norm": 35.29275894165039, - "learning_rate": 1.3962112859577028e-05, - "loss": 4.147, + "epoch": 0.38, + "grad_norm": 8.274801254272461, + "learning_rate": 1.7479814249257415e-05, + "loss": 0.9875, "step": 3013 }, { - "epoch": 0.91, - "grad_norm": 19.924930572509766, - "learning_rate": 1.3960108248972638e-05, - "loss": 2.3426, + "epoch": 0.38, + "grad_norm": 16.024633407592773, + "learning_rate": 1.747897753420073e-05, + "loss": 1.8152, "step": 3014 }, { - "epoch": 0.91, - "grad_norm": 18.175373077392578, - "learning_rate": 1.3958103638368249e-05, - "loss": 1.6238, + "epoch": 0.38, + "grad_norm": 22.47686195373535, + "learning_rate": 1.7478140819144043e-05, + "loss": 3.7221, "step": 3015 }, { - "epoch": 0.91, - "grad_norm": 16.749662399291992, - "learning_rate": 1.3956099027763859e-05, - "loss": 1.9444, + "epoch": 0.38, + "grad_norm": 10.875396728515625, + "learning_rate": 1.7477304104087353e-05, + "loss": 2.6813, "step": 3016 }, { - "epoch": 0.91, - "grad_norm": 32.66911697387695, - "learning_rate": 1.3954094417159467e-05, - "loss": 2.5848, + "epoch": 0.38, + "grad_norm": 10.279182434082031, + "learning_rate": 1.7476467389030666e-05, + "loss": 3.5712, "step": 3017 }, { - "epoch": 0.91, - "grad_norm": 10.309561729431152, - "learning_rate": 1.3952089806555079e-05, - "loss": 1.2661, + "epoch": 0.38, + "grad_norm": 34.010494232177734, + "learning_rate": 1.747563067397398e-05, + "loss": 2.5344, "step": 3018 }, { - "epoch": 0.91, - "grad_norm": 32.622840881347656, - "learning_rate": 1.3950085195950687e-05, - "loss": 3.1314, + "epoch": 0.38, + "grad_norm": 9.324440956115723, + "learning_rate": 1.7474793958917294e-05, + "loss": 1.9682, "step": 3019 }, { - "epoch": 0.91, - "grad_norm": 51.04716491699219, - "learning_rate": 1.3948080585346297e-05, - "loss": 2.5912, + "epoch": 0.38, + "grad_norm": 9.498290061950684, + "learning_rate": 1.7473957243860604e-05, + "loss": 2.3948, "step": 3020 }, { - "epoch": 0.91, - "grad_norm": 9.933348655700684, - "learning_rate": 1.3946075974741909e-05, - "loss": 1.6906, + "epoch": 0.38, + "grad_norm": 33.985198974609375, + "learning_rate": 1.7473120528803918e-05, + "loss": 3.7208, "step": 3021 }, { - "epoch": 0.91, - "grad_norm": 11.241128921508789, - "learning_rate": 1.3944071364137517e-05, - "loss": 2.0141, + "epoch": 0.38, + "grad_norm": 19.785444259643555, + "learning_rate": 1.747228381374723e-05, + "loss": 2.1617, "step": 3022 }, { - "epoch": 0.91, - "grad_norm": 20.985105514526367, - "learning_rate": 1.3942066753533127e-05, - "loss": 2.3962, + "epoch": 0.38, + "grad_norm": 23.751007080078125, + "learning_rate": 1.747144709869054e-05, + "loss": 2.6466, "step": 3023 }, { - "epoch": 0.91, - "grad_norm": 14.399240493774414, - "learning_rate": 1.3940062142928736e-05, - "loss": 2.4056, + "epoch": 0.38, + "grad_norm": 28.25634765625, + "learning_rate": 1.7470610383633855e-05, + "loss": 2.7269, "step": 3024 }, { - "epoch": 0.91, - "grad_norm": 23.836753845214844, - "learning_rate": 1.3938057532324348e-05, - "loss": 1.492, + "epoch": 0.38, + "grad_norm": 15.494318962097168, + "learning_rate": 1.746977366857717e-05, + "loss": 1.9662, "step": 3025 }, { - "epoch": 0.91, - "grad_norm": 18.37982940673828, - "learning_rate": 1.3936052921719958e-05, - "loss": 1.8183, + "epoch": 0.38, + "grad_norm": 40.99577713012695, + "learning_rate": 1.7468936953520483e-05, + "loss": 3.8287, "step": 3026 }, { - "epoch": 0.91, - "grad_norm": 13.537169456481934, - "learning_rate": 1.3934048311115566e-05, - "loss": 2.1759, + "epoch": 0.38, + "grad_norm": 15.126395225524902, + "learning_rate": 1.7468100238463793e-05, + "loss": 2.892, "step": 3027 }, { - "epoch": 0.91, - "grad_norm": 23.467910766601562, - "learning_rate": 1.3932043700511178e-05, - "loss": 2.6208, + "epoch": 0.38, + "grad_norm": 15.43254280090332, + "learning_rate": 1.7467263523407106e-05, + "loss": 1.6425, "step": 3028 }, { - "epoch": 0.91, - "grad_norm": 17.59671401977539, - "learning_rate": 1.3930039089906786e-05, - "loss": 2.3887, + "epoch": 0.38, + "grad_norm": 22.664308547973633, + "learning_rate": 1.746642680835042e-05, + "loss": 2.8555, "step": 3029 }, { - "epoch": 0.91, - "grad_norm": 12.384592056274414, - "learning_rate": 1.3928034479302396e-05, - "loss": 2.0166, + "epoch": 0.38, + "grad_norm": 22.935243606567383, + "learning_rate": 1.746559009329373e-05, + "loss": 3.0264, "step": 3030 }, { - "epoch": 0.91, - "grad_norm": 17.471967697143555, - "learning_rate": 1.3926029868698005e-05, - "loss": 1.875, + "epoch": 0.38, + "grad_norm": 8.225278854370117, + "learning_rate": 1.7464753378237044e-05, + "loss": 4.3243, "step": 3031 }, { - "epoch": 0.91, - "grad_norm": 12.046953201293945, - "learning_rate": 1.3924025258093617e-05, - "loss": 1.3814, + "epoch": 0.38, + "grad_norm": 6.5009613037109375, + "learning_rate": 1.7463916663180354e-05, + "loss": 3.2662, "step": 3032 }, { - "epoch": 0.91, - "grad_norm": 15.179215431213379, - "learning_rate": 1.3922020647489227e-05, - "loss": 1.6944, + "epoch": 0.38, + "grad_norm": 10.92690372467041, + "learning_rate": 1.7463079948123668e-05, + "loss": 1.8278, "step": 3033 }, { - "epoch": 0.91, - "grad_norm": 10.748503684997559, - "learning_rate": 1.3920016036884835e-05, - "loss": 2.1539, + "epoch": 0.38, + "grad_norm": 17.307109832763672, + "learning_rate": 1.746224323306698e-05, + "loss": 3.076, "step": 3034 }, { - "epoch": 0.91, - "grad_norm": 20.441787719726562, - "learning_rate": 1.3918011426280447e-05, - "loss": 2.9749, + "epoch": 0.38, + "grad_norm": 12.11257266998291, + "learning_rate": 1.7461406518010292e-05, + "loss": 2.4566, "step": 3035 }, { - "epoch": 0.91, - "grad_norm": 23.096572875976562, - "learning_rate": 1.3916006815676055e-05, - "loss": 1.9606, + "epoch": 0.38, + "grad_norm": 21.447715759277344, + "learning_rate": 1.7460569802953605e-05, + "loss": 1.4901, "step": 3036 }, { - "epoch": 0.91, - "grad_norm": 27.173370361328125, - "learning_rate": 1.3914002205071665e-05, - "loss": 2.5916, + "epoch": 0.38, + "grad_norm": 8.973800659179688, + "learning_rate": 1.7459733087896916e-05, + "loss": 2.4119, "step": 3037 }, { - "epoch": 0.91, - "grad_norm": 11.456189155578613, - "learning_rate": 1.3911997594467277e-05, - "loss": 1.7153, + "epoch": 0.38, + "grad_norm": 12.067655563354492, + "learning_rate": 1.745889637284023e-05, + "loss": 1.4386, "step": 3038 }, { - "epoch": 0.91, - "grad_norm": 36.13789367675781, - "learning_rate": 1.3909992983862885e-05, - "loss": 2.2955, + "epoch": 0.38, + "grad_norm": 10.95497989654541, + "learning_rate": 1.7458059657783543e-05, + "loss": 2.5098, "step": 3039 }, { - "epoch": 0.91, - "grad_norm": 25.46799659729004, - "learning_rate": 1.3907988373258496e-05, - "loss": 2.2377, + "epoch": 0.38, + "grad_norm": 27.85697364807129, + "learning_rate": 1.7457222942726853e-05, + "loss": 3.0961, "step": 3040 }, { - "epoch": 0.91, - "grad_norm": 18.01449203491211, - "learning_rate": 1.3905983762654106e-05, - "loss": 2.5831, + "epoch": 0.38, + "grad_norm": 23.892608642578125, + "learning_rate": 1.7456386227670167e-05, + "loss": 2.4394, "step": 3041 }, { - "epoch": 0.91, - "grad_norm": 25.396291732788086, - "learning_rate": 1.3903979152049716e-05, - "loss": 2.2237, + "epoch": 0.38, + "grad_norm": 26.527515411376953, + "learning_rate": 1.745554951261348e-05, + "loss": 3.225, "step": 3042 }, { - "epoch": 0.91, - "grad_norm": 15.773884773254395, - "learning_rate": 1.3901974541445324e-05, - "loss": 2.6421, + "epoch": 0.38, + "grad_norm": 13.135971069335938, + "learning_rate": 1.7454712797556794e-05, + "loss": 1.3418, "step": 3043 }, { - "epoch": 0.92, - "grad_norm": 16.961776733398438, - "learning_rate": 1.3899969930840936e-05, - "loss": 1.8037, + "epoch": 0.38, + "grad_norm": 7.4857072830200195, + "learning_rate": 1.7453876082500104e-05, + "loss": 2.2289, "step": 3044 }, { - "epoch": 0.92, - "grad_norm": 46.810585021972656, - "learning_rate": 1.3897965320236546e-05, - "loss": 1.4078, + "epoch": 0.38, + "grad_norm": 20.1530818939209, + "learning_rate": 1.7453039367443418e-05, + "loss": 3.5615, "step": 3045 }, { - "epoch": 0.92, - "grad_norm": 19.385047912597656, - "learning_rate": 1.3895960709632154e-05, - "loss": 1.9391, + "epoch": 0.38, + "grad_norm": 12.91379165649414, + "learning_rate": 1.7452202652386732e-05, + "loss": 4.9487, "step": 3046 }, { - "epoch": 0.92, - "grad_norm": 14.272333145141602, - "learning_rate": 1.3893956099027766e-05, - "loss": 1.7094, + "epoch": 0.38, + "grad_norm": 6.44926118850708, + "learning_rate": 1.7451365937330042e-05, + "loss": 0.9218, "step": 3047 }, { - "epoch": 0.92, - "grad_norm": 12.611112594604492, - "learning_rate": 1.3891951488423375e-05, - "loss": 2.0959, + "epoch": 0.38, + "grad_norm": 14.764775276184082, + "learning_rate": 1.7450529222273356e-05, + "loss": 2.2744, "step": 3048 }, { - "epoch": 0.92, - "grad_norm": 17.307199478149414, - "learning_rate": 1.3889946877818985e-05, - "loss": 2.1561, + "epoch": 0.38, + "grad_norm": 10.772073745727539, + "learning_rate": 1.744969250721667e-05, + "loss": 2.0143, "step": 3049 }, { - "epoch": 0.92, - "grad_norm": 20.245845794677734, - "learning_rate": 1.3887942267214593e-05, - "loss": 2.8128, + "epoch": 0.38, + "grad_norm": 34.17717361450195, + "learning_rate": 1.7448855792159983e-05, + "loss": 4.1729, "step": 3050 }, { - "epoch": 0.92, - "grad_norm": 17.531375885009766, - "learning_rate": 1.3885937656610205e-05, - "loss": 1.6429, + "epoch": 0.38, + "grad_norm": 20.677505493164062, + "learning_rate": 1.7448019077103293e-05, + "loss": 2.2328, "step": 3051 }, { - "epoch": 0.92, - "grad_norm": 12.451351165771484, - "learning_rate": 1.3883933046005815e-05, - "loss": 2.0644, + "epoch": 0.38, + "grad_norm": 13.210722923278809, + "learning_rate": 1.7447182362046607e-05, + "loss": 1.8375, "step": 3052 }, { - "epoch": 0.92, - "grad_norm": 16.824329376220703, - "learning_rate": 1.3881928435401423e-05, - "loss": 2.1751, + "epoch": 0.38, + "grad_norm": 7.626525402069092, + "learning_rate": 1.744634564698992e-05, + "loss": 1.3073, "step": 3053 }, { - "epoch": 0.92, - "grad_norm": 19.493457794189453, - "learning_rate": 1.3879923824797035e-05, - "loss": 1.8751, + "epoch": 0.38, + "grad_norm": 15.71025276184082, + "learning_rate": 1.744550893193323e-05, + "loss": 1.94, "step": 3054 }, { - "epoch": 0.92, - "grad_norm": 16.869348526000977, - "learning_rate": 1.3877919214192643e-05, - "loss": 1.9817, + "epoch": 0.38, + "grad_norm": 10.431207656860352, + "learning_rate": 1.7444672216876544e-05, + "loss": 1.1058, "step": 3055 }, { - "epoch": 0.92, - "grad_norm": 29.737041473388672, - "learning_rate": 1.3875914603588254e-05, - "loss": 2.4454, + "epoch": 0.38, + "grad_norm": 50.677146911621094, + "learning_rate": 1.7443835501819858e-05, + "loss": 1.7067, "step": 3056 }, { - "epoch": 0.92, - "grad_norm": 22.09736442565918, - "learning_rate": 1.3873909992983865e-05, - "loss": 1.7898, + "epoch": 0.38, + "grad_norm": 17.605249404907227, + "learning_rate": 1.744299878676317e-05, + "loss": 2.4274, "step": 3057 }, { - "epoch": 0.92, - "grad_norm": 17.416990280151367, - "learning_rate": 1.3871905382379474e-05, - "loss": 2.9486, + "epoch": 0.38, + "grad_norm": 18.438739776611328, + "learning_rate": 1.7442162071706482e-05, + "loss": 1.4258, "step": 3058 }, { - "epoch": 0.92, - "grad_norm": 13.20331859588623, - "learning_rate": 1.3869900771775084e-05, - "loss": 2.2757, + "epoch": 0.38, + "grad_norm": 37.58784484863281, + "learning_rate": 1.7441325356649796e-05, + "loss": 3.915, "step": 3059 }, { - "epoch": 0.92, - "grad_norm": 7.922008514404297, - "learning_rate": 1.3867896161170694e-05, - "loss": 2.0063, + "epoch": 0.38, + "grad_norm": 18.80876350402832, + "learning_rate": 1.7440488641593106e-05, + "loss": 2.3819, "step": 3060 }, { - "epoch": 0.92, - "grad_norm": 14.735453605651855, - "learning_rate": 1.3865891550566304e-05, - "loss": 2.4153, + "epoch": 0.38, + "grad_norm": 12.009927749633789, + "learning_rate": 1.743965192653642e-05, + "loss": 2.8513, "step": 3061 }, { - "epoch": 0.92, - "grad_norm": 13.794514656066895, - "learning_rate": 1.3863886939961912e-05, - "loss": 1.8875, + "epoch": 0.38, + "grad_norm": 28.326574325561523, + "learning_rate": 1.7438815211479733e-05, + "loss": 2.6981, "step": 3062 }, { - "epoch": 0.92, - "grad_norm": 11.291130065917969, - "learning_rate": 1.3861882329357524e-05, - "loss": 1.3319, + "epoch": 0.38, + "grad_norm": 10.06318473815918, + "learning_rate": 1.7437978496423043e-05, + "loss": 3.7585, "step": 3063 }, { - "epoch": 0.92, - "grad_norm": 19.629484176635742, - "learning_rate": 1.3859877718753134e-05, - "loss": 1.4707, + "epoch": 0.38, + "grad_norm": 13.952555656433105, + "learning_rate": 1.7437141781366357e-05, + "loss": 1.7787, "step": 3064 }, { - "epoch": 0.92, - "grad_norm": 15.42160415649414, - "learning_rate": 1.3857873108148743e-05, - "loss": 1.5667, + "epoch": 0.38, + "grad_norm": 39.10877227783203, + "learning_rate": 1.7436305066309667e-05, + "loss": 2.7993, "step": 3065 }, { - "epoch": 0.92, - "grad_norm": 14.26404094696045, - "learning_rate": 1.3855868497544354e-05, - "loss": 2.0599, + "epoch": 0.38, + "grad_norm": 20.320301055908203, + "learning_rate": 1.743546835125298e-05, + "loss": 1.2898, "step": 3066 }, { - "epoch": 0.92, - "grad_norm": 8.3535795211792, - "learning_rate": 1.3853863886939963e-05, - "loss": 1.3769, + "epoch": 0.38, + "grad_norm": 23.73604965209961, + "learning_rate": 1.7434631636196295e-05, + "loss": 3.5141, "step": 3067 }, { - "epoch": 0.92, - "grad_norm": 13.800082206726074, - "learning_rate": 1.3851859276335573e-05, - "loss": 1.5491, + "epoch": 0.39, + "grad_norm": 10.633537292480469, + "learning_rate": 1.7433794921139605e-05, + "loss": 2.189, "step": 3068 }, { - "epoch": 0.92, - "grad_norm": 16.2960262298584, - "learning_rate": 1.3849854665731185e-05, - "loss": 1.8461, + "epoch": 0.39, + "grad_norm": 13.728384971618652, + "learning_rate": 1.743295820608292e-05, + "loss": 1.515, "step": 3069 }, { - "epoch": 0.92, - "grad_norm": 12.411531448364258, - "learning_rate": 1.3847850055126793e-05, - "loss": 2.1916, + "epoch": 0.39, + "grad_norm": 11.651106834411621, + "learning_rate": 1.7432121491026232e-05, + "loss": 1.1459, "step": 3070 }, { - "epoch": 0.92, - "grad_norm": 10.797046661376953, - "learning_rate": 1.3845845444522403e-05, - "loss": 1.2897, + "epoch": 0.39, + "grad_norm": 9.04185962677002, + "learning_rate": 1.7431284775969546e-05, + "loss": 0.6994, "step": 3071 }, { - "epoch": 0.92, - "grad_norm": 17.1668643951416, - "learning_rate": 1.3843840833918011e-05, - "loss": 2.0636, + "epoch": 0.39, + "grad_norm": 10.415605545043945, + "learning_rate": 1.7430448060912856e-05, + "loss": 2.3379, "step": 3072 }, { - "epoch": 0.92, - "grad_norm": 12.581513404846191, - "learning_rate": 1.3841836223313623e-05, - "loss": 2.0915, + "epoch": 0.39, + "grad_norm": 14.66109848022461, + "learning_rate": 1.742961134585617e-05, + "loss": 2.6971, "step": 3073 }, { - "epoch": 0.92, - "grad_norm": 18.048904418945312, - "learning_rate": 1.3839831612709232e-05, - "loss": 1.8037, + "epoch": 0.39, + "grad_norm": 19.801557540893555, + "learning_rate": 1.7428774630799483e-05, + "loss": 2.4251, "step": 3074 }, { - "epoch": 0.92, - "grad_norm": 25.745256423950195, - "learning_rate": 1.3837827002104842e-05, - "loss": 2.4617, + "epoch": 0.39, + "grad_norm": 12.651113510131836, + "learning_rate": 1.7427937915742794e-05, + "loss": 2.507, "step": 3075 }, { - "epoch": 0.92, - "grad_norm": 23.290611267089844, - "learning_rate": 1.3835822391500453e-05, - "loss": 2.3306, + "epoch": 0.39, + "grad_norm": 14.438727378845215, + "learning_rate": 1.7427101200686107e-05, + "loss": 1.5898, "step": 3076 }, { - "epoch": 0.93, - "grad_norm": 18.817665100097656, - "learning_rate": 1.3833817780896062e-05, - "loss": 2.0089, + "epoch": 0.39, + "grad_norm": 21.17618179321289, + "learning_rate": 1.742626448562942e-05, + "loss": 3.7627, "step": 3077 }, { - "epoch": 0.93, - "grad_norm": 16.671045303344727, - "learning_rate": 1.3831813170291672e-05, - "loss": 2.0306, + "epoch": 0.39, + "grad_norm": 20.78024673461914, + "learning_rate": 1.7425427770572735e-05, + "loss": 2.8387, "step": 3078 }, { - "epoch": 0.93, - "grad_norm": 17.590496063232422, - "learning_rate": 1.382980855968728e-05, - "loss": 2.5062, + "epoch": 0.39, + "grad_norm": 8.485142707824707, + "learning_rate": 1.7424591055516045e-05, + "loss": 2.4435, "step": 3079 }, { - "epoch": 0.93, - "grad_norm": 15.332666397094727, - "learning_rate": 1.3827803949082892e-05, - "loss": 1.5195, + "epoch": 0.39, + "grad_norm": 7.811776161193848, + "learning_rate": 1.742375434045936e-05, + "loss": 2.0698, "step": 3080 }, { - "epoch": 0.93, - "grad_norm": 28.62594985961914, - "learning_rate": 1.3825799338478502e-05, - "loss": 2.3762, + "epoch": 0.39, + "grad_norm": 16.553388595581055, + "learning_rate": 1.7422917625402672e-05, + "loss": 1.2277, "step": 3081 }, { - "epoch": 0.93, - "grad_norm": 15.981874465942383, - "learning_rate": 1.382379472787411e-05, - "loss": 2.0642, + "epoch": 0.39, + "grad_norm": 23.243209838867188, + "learning_rate": 1.7422080910345982e-05, + "loss": 3.6227, "step": 3082 }, { - "epoch": 0.93, - "grad_norm": 16.672204971313477, - "learning_rate": 1.3821790117269722e-05, - "loss": 2.0906, + "epoch": 0.39, + "grad_norm": 11.811983108520508, + "learning_rate": 1.7421244195289296e-05, + "loss": 1.2775, "step": 3083 }, { - "epoch": 0.93, - "grad_norm": 15.016446113586426, - "learning_rate": 1.381978550666533e-05, - "loss": 1.7059, + "epoch": 0.39, + "grad_norm": 14.592747688293457, + "learning_rate": 1.742040748023261e-05, + "loss": 1.3105, "step": 3084 }, { - "epoch": 0.93, - "grad_norm": 9.689031600952148, - "learning_rate": 1.3817780896060941e-05, - "loss": 1.625, + "epoch": 0.39, + "grad_norm": 15.486154556274414, + "learning_rate": 1.741957076517592e-05, + "loss": 2.1047, "step": 3085 }, { - "epoch": 0.93, - "grad_norm": 8.496919631958008, - "learning_rate": 1.3815776285456551e-05, - "loss": 0.9668, + "epoch": 0.39, + "grad_norm": 28.537649154663086, + "learning_rate": 1.7418734050119234e-05, + "loss": 3.2185, "step": 3086 }, { - "epoch": 0.93, - "grad_norm": 20.545669555664062, - "learning_rate": 1.3813771674852161e-05, - "loss": 1.827, + "epoch": 0.39, + "grad_norm": 10.975678443908691, + "learning_rate": 1.7417897335062547e-05, + "loss": 1.8193, "step": 3087 }, { - "epoch": 0.93, - "grad_norm": 14.81933307647705, - "learning_rate": 1.3811767064247771e-05, - "loss": 2.3411, + "epoch": 0.39, + "grad_norm": 21.351566314697266, + "learning_rate": 1.7417060620005858e-05, + "loss": 1.0022, "step": 3088 }, { - "epoch": 0.93, - "grad_norm": 11.812262535095215, - "learning_rate": 1.3809762453643381e-05, - "loss": 3.2352, + "epoch": 0.39, + "grad_norm": 16.831201553344727, + "learning_rate": 1.741622390494917e-05, + "loss": 2.9873, "step": 3089 }, { - "epoch": 0.93, - "grad_norm": 50.63083267211914, - "learning_rate": 1.3807757843038991e-05, - "loss": 1.491, + "epoch": 0.39, + "grad_norm": 12.240836143493652, + "learning_rate": 1.741538718989248e-05, + "loss": 2.8131, "step": 3090 }, { - "epoch": 0.93, - "grad_norm": 17.194198608398438, - "learning_rate": 1.38057532324346e-05, - "loss": 1.45, + "epoch": 0.39, + "grad_norm": 12.75204086303711, + "learning_rate": 1.7414550474835795e-05, + "loss": 1.3815, "step": 3091 }, { - "epoch": 0.93, - "grad_norm": 25.95490074157715, - "learning_rate": 1.3803748621830211e-05, - "loss": 3.7476, + "epoch": 0.39, + "grad_norm": 17.798721313476562, + "learning_rate": 1.741371375977911e-05, + "loss": 1.5151, "step": 3092 }, { - "epoch": 0.93, - "grad_norm": 12.458473205566406, - "learning_rate": 1.380174401122582e-05, - "loss": 2.0199, + "epoch": 0.39, + "grad_norm": 12.037940979003906, + "learning_rate": 1.741287704472242e-05, + "loss": 2.6141, "step": 3093 }, { - "epoch": 0.93, - "grad_norm": 17.711647033691406, - "learning_rate": 1.379973940062143e-05, - "loss": 2.1909, + "epoch": 0.39, + "grad_norm": 6.18284273147583, + "learning_rate": 1.7412040329665733e-05, + "loss": 0.9533, "step": 3094 }, { - "epoch": 0.93, - "grad_norm": 21.15656852722168, - "learning_rate": 1.3797734790017042e-05, - "loss": 2.6897, + "epoch": 0.39, + "grad_norm": 9.289584159851074, + "learning_rate": 1.7411203614609046e-05, + "loss": 1.0435, "step": 3095 }, { - "epoch": 0.93, - "grad_norm": 14.197000503540039, - "learning_rate": 1.379573017941265e-05, - "loss": 2.5904, + "epoch": 0.39, + "grad_norm": 10.22266674041748, + "learning_rate": 1.7410366899552357e-05, + "loss": 1.583, "step": 3096 }, { - "epoch": 0.93, - "grad_norm": 24.11342430114746, - "learning_rate": 1.379372556880826e-05, - "loss": 2.4864, + "epoch": 0.39, + "grad_norm": 24.6475887298584, + "learning_rate": 1.740953018449567e-05, + "loss": 1.5329, "step": 3097 }, { - "epoch": 0.93, - "grad_norm": 18.00290298461914, - "learning_rate": 1.3791720958203869e-05, - "loss": 2.6676, + "epoch": 0.39, + "grad_norm": 14.650251388549805, + "learning_rate": 1.7408693469438984e-05, + "loss": 2.0842, "step": 3098 }, { - "epoch": 0.93, - "grad_norm": 10.474432945251465, - "learning_rate": 1.378971634759948e-05, - "loss": 1.9275, + "epoch": 0.39, + "grad_norm": 12.11793327331543, + "learning_rate": 1.7407856754382298e-05, + "loss": 2.7101, "step": 3099 }, { - "epoch": 0.93, - "grad_norm": 15.15256118774414, - "learning_rate": 1.378771173699509e-05, - "loss": 2.0671, + "epoch": 0.39, + "grad_norm": 28.798845291137695, + "learning_rate": 1.7407020039325608e-05, + "loss": 4.1897, "step": 3100 }, { - "epoch": 0.93, - "grad_norm": 16.475624084472656, - "learning_rate": 1.3785707126390699e-05, - "loss": 2.206, + "epoch": 0.39, + "grad_norm": 22.567594528198242, + "learning_rate": 1.740618332426892e-05, + "loss": 2.1251, "step": 3101 }, { - "epoch": 0.93, - "grad_norm": 32.260719299316406, - "learning_rate": 1.378370251578631e-05, - "loss": 1.7881, + "epoch": 0.39, + "grad_norm": 10.604387283325195, + "learning_rate": 1.7405346609212235e-05, + "loss": 1.1912, "step": 3102 }, { - "epoch": 0.93, - "grad_norm": 15.899197578430176, - "learning_rate": 1.3781697905181919e-05, - "loss": 1.865, + "epoch": 0.39, + "grad_norm": 23.615659713745117, + "learning_rate": 1.7404509894155545e-05, + "loss": 2.5832, "step": 3103 }, { - "epoch": 0.93, - "grad_norm": 16.820289611816406, - "learning_rate": 1.3779693294577529e-05, - "loss": 1.9125, + "epoch": 0.39, + "grad_norm": 14.243099212646484, + "learning_rate": 1.740367317909886e-05, + "loss": 2.0185, "step": 3104 }, { - "epoch": 0.93, - "grad_norm": 24.38895606994629, - "learning_rate": 1.3777688683973137e-05, - "loss": 1.5985, + "epoch": 0.39, + "grad_norm": 25.76270866394043, + "learning_rate": 1.7402836464042173e-05, + "loss": 1.8266, "step": 3105 }, { - "epoch": 0.93, - "grad_norm": 34.085914611816406, - "learning_rate": 1.377568407336875e-05, - "loss": 2.2492, + "epoch": 0.39, + "grad_norm": 14.212577819824219, + "learning_rate": 1.7401999748985486e-05, + "loss": 1.8881, "step": 3106 }, { - "epoch": 0.93, - "grad_norm": 14.358930587768555, - "learning_rate": 1.377367946276436e-05, - "loss": 1.9382, + "epoch": 0.39, + "grad_norm": 11.802764892578125, + "learning_rate": 1.7401163033928797e-05, + "loss": 2.3556, "step": 3107 }, { - "epoch": 0.93, - "grad_norm": 25.895401000976562, - "learning_rate": 1.377167485215997e-05, - "loss": 1.1291, + "epoch": 0.39, + "grad_norm": 8.763890266418457, + "learning_rate": 1.740032631887211e-05, + "loss": 2.9164, "step": 3108 }, { - "epoch": 0.93, - "grad_norm": 17.190685272216797, - "learning_rate": 1.376967024155558e-05, - "loss": 1.4162, + "epoch": 0.39, + "grad_norm": 16.387601852416992, + "learning_rate": 1.7399489603815424e-05, + "loss": 2.4992, "step": 3109 }, { - "epoch": 0.94, - "grad_norm": 13.299299240112305, - "learning_rate": 1.3767665630951188e-05, - "loss": 1.6711, + "epoch": 0.39, + "grad_norm": 18.163043975830078, + "learning_rate": 1.7398652888758734e-05, + "loss": 1.8894, "step": 3110 }, { - "epoch": 0.94, - "grad_norm": 14.426549911499023, - "learning_rate": 1.37656610203468e-05, - "loss": 1.9699, + "epoch": 0.39, + "grad_norm": 13.54871654510498, + "learning_rate": 1.7397816173702048e-05, + "loss": 1.6424, "step": 3111 }, { - "epoch": 0.94, - "grad_norm": 25.35927963256836, - "learning_rate": 1.376365640974241e-05, - "loss": 2.6309, + "epoch": 0.39, + "grad_norm": 18.142717361450195, + "learning_rate": 1.739697945864536e-05, + "loss": 1.5697, "step": 3112 }, { - "epoch": 0.94, - "grad_norm": 9.705031394958496, - "learning_rate": 1.3761651799138018e-05, - "loss": 2.4938, + "epoch": 0.39, + "grad_norm": 9.927594184875488, + "learning_rate": 1.7396142743588672e-05, + "loss": 1.808, "step": 3113 }, { - "epoch": 0.94, - "grad_norm": 12.233379364013672, - "learning_rate": 1.375964718853363e-05, - "loss": 1.8221, + "epoch": 0.39, + "grad_norm": 12.471274375915527, + "learning_rate": 1.7395306028531985e-05, + "loss": 2.2684, "step": 3114 }, { - "epoch": 0.94, - "grad_norm": 15.394095420837402, - "learning_rate": 1.3757642577929238e-05, - "loss": 1.7775, + "epoch": 0.39, + "grad_norm": 45.707679748535156, + "learning_rate": 1.73944693134753e-05, + "loss": 2.5269, "step": 3115 }, { - "epoch": 0.94, - "grad_norm": 13.07430362701416, - "learning_rate": 1.3755637967324848e-05, - "loss": 1.0702, + "epoch": 0.39, + "grad_norm": 10.306867599487305, + "learning_rate": 1.739363259841861e-05, + "loss": 2.6692, "step": 3116 }, { - "epoch": 0.94, - "grad_norm": 12.21866226196289, - "learning_rate": 1.3753633356720457e-05, - "loss": 1.6868, + "epoch": 0.39, + "grad_norm": 9.580382347106934, + "learning_rate": 1.7392795883361923e-05, + "loss": 1.0849, "step": 3117 }, { - "epoch": 0.94, - "grad_norm": 10.357650756835938, - "learning_rate": 1.3751628746116069e-05, - "loss": 2.2551, + "epoch": 0.39, + "grad_norm": 9.687782287597656, + "learning_rate": 1.7391959168305233e-05, + "loss": 1.3304, "step": 3118 }, { - "epoch": 0.94, - "grad_norm": 18.206022262573242, - "learning_rate": 1.3749624135511679e-05, - "loss": 1.8297, + "epoch": 0.39, + "grad_norm": 13.099113464355469, + "learning_rate": 1.7391122453248547e-05, + "loss": 1.5095, "step": 3119 }, { - "epoch": 0.94, - "grad_norm": 15.580822944641113, - "learning_rate": 1.3747619524907287e-05, - "loss": 1.9925, - "step": 3120 - }, - { - "epoch": 0.94, - "eval_loss": 0.2486942857503891, - "eval_runtime": 43.5686, - "eval_samples_per_second": 33.946, - "eval_steps_per_second": 33.946, + "epoch": 0.39, + "grad_norm": 25.176481246948242, + "learning_rate": 1.739028573819186e-05, + "loss": 1.6254, "step": 3120 }, { - "epoch": 0.94, - "grad_norm": 10.770172119140625, - "learning_rate": 1.3745614914302899e-05, - "loss": 2.3822, + "epoch": 0.39, + "grad_norm": 8.78892707824707, + "learning_rate": 1.738944902313517e-05, + "loss": 3.0991, "step": 3121 }, { - "epoch": 0.94, - "grad_norm": 25.76287078857422, - "learning_rate": 1.3743610303698507e-05, - "loss": 2.8554, + "epoch": 0.39, + "grad_norm": 12.70933723449707, + "learning_rate": 1.7388612308078484e-05, + "loss": 2.7174, "step": 3122 }, { - "epoch": 0.94, - "grad_norm": 11.431665420532227, - "learning_rate": 1.3741605693094117e-05, - "loss": 2.0408, + "epoch": 0.39, + "grad_norm": 21.841047286987305, + "learning_rate": 1.7387775593021798e-05, + "loss": 2.8923, "step": 3123 }, { - "epoch": 0.94, - "grad_norm": 12.141244888305664, - "learning_rate": 1.3739601082489729e-05, - "loss": 1.5182, + "epoch": 0.39, + "grad_norm": 9.589460372924805, + "learning_rate": 1.7386938877965108e-05, + "loss": 2.7387, "step": 3124 }, { - "epoch": 0.94, - "grad_norm": 12.981203079223633, - "learning_rate": 1.3737596471885337e-05, - "loss": 1.3628, + "epoch": 0.39, + "grad_norm": 30.16423225402832, + "learning_rate": 1.7386102162908422e-05, + "loss": 1.0212, "step": 3125 }, { - "epoch": 0.94, - "grad_norm": 12.477705955505371, - "learning_rate": 1.3735591861280948e-05, - "loss": 1.9388, + "epoch": 0.39, + "grad_norm": 19.877349853515625, + "learning_rate": 1.7385265447851736e-05, + "loss": 2.6417, "step": 3126 }, { - "epoch": 0.94, - "grad_norm": 21.606971740722656, - "learning_rate": 1.3733587250676556e-05, - "loss": 1.7253, + "epoch": 0.39, + "grad_norm": 10.562746047973633, + "learning_rate": 1.738442873279505e-05, + "loss": 0.9116, "step": 3127 }, { - "epoch": 0.94, - "grad_norm": 15.517189025878906, - "learning_rate": 1.3731582640072168e-05, - "loss": 2.0146, + "epoch": 0.39, + "grad_norm": 13.513136863708496, + "learning_rate": 1.738359201773836e-05, + "loss": 1.7079, "step": 3128 }, { - "epoch": 0.94, - "grad_norm": 30.338869094848633, - "learning_rate": 1.3729578029467776e-05, - "loss": 1.8816, + "epoch": 0.39, + "grad_norm": 12.24991512298584, + "learning_rate": 1.7382755302681673e-05, + "loss": 1.3581, "step": 3129 }, { - "epoch": 0.94, - "grad_norm": 34.77288818359375, - "learning_rate": 1.3727573418863386e-05, - "loss": 3.4102, + "epoch": 0.39, + "grad_norm": 23.374011993408203, + "learning_rate": 1.7381918587624987e-05, + "loss": 2.057, "step": 3130 }, { - "epoch": 0.94, - "grad_norm": 29.494319915771484, - "learning_rate": 1.3725568808258998e-05, - "loss": 1.3876, + "epoch": 0.39, + "grad_norm": 14.807626724243164, + "learning_rate": 1.7381081872568297e-05, + "loss": 2.0256, "step": 3131 }, { - "epoch": 0.94, - "grad_norm": 15.238880157470703, - "learning_rate": 1.3723564197654606e-05, - "loss": 2.0381, + "epoch": 0.39, + "grad_norm": 9.561110496520996, + "learning_rate": 1.738024515751161e-05, + "loss": 1.4404, "step": 3132 }, { - "epoch": 0.94, - "grad_norm": 13.510786056518555, - "learning_rate": 1.3721559587050216e-05, - "loss": 2.5507, + "epoch": 0.39, + "grad_norm": 20.877609252929688, + "learning_rate": 1.7379408442454924e-05, + "loss": 2.6945, "step": 3133 }, { - "epoch": 0.94, - "grad_norm": 24.444250106811523, - "learning_rate": 1.3719554976445827e-05, - "loss": 1.7889, + "epoch": 0.39, + "grad_norm": 14.954484939575195, + "learning_rate": 1.7378571727398238e-05, + "loss": 3.6158, "step": 3134 }, { - "epoch": 0.94, - "grad_norm": 12.126347541809082, - "learning_rate": 1.3717550365841437e-05, - "loss": 1.2326, + "epoch": 0.39, + "grad_norm": 17.07823944091797, + "learning_rate": 1.7377735012341548e-05, + "loss": 1.9067, "step": 3135 }, { - "epoch": 0.94, - "grad_norm": 10.488152503967285, - "learning_rate": 1.3715545755237045e-05, - "loss": 1.2664, + "epoch": 0.39, + "grad_norm": 18.759994506835938, + "learning_rate": 1.7376898297284862e-05, + "loss": 1.8789, "step": 3136 }, { - "epoch": 0.94, - "grad_norm": 30.025480270385742, - "learning_rate": 1.3713541144632657e-05, - "loss": 1.5859, + "epoch": 0.39, + "grad_norm": 13.437284469604492, + "learning_rate": 1.7376061582228176e-05, + "loss": 3.1343, "step": 3137 }, { - "epoch": 0.94, - "grad_norm": 13.164788246154785, - "learning_rate": 1.3711536534028267e-05, - "loss": 2.1662, + "epoch": 0.39, + "grad_norm": 10.817352294921875, + "learning_rate": 1.7375224867171486e-05, + "loss": 2.9662, "step": 3138 }, { - "epoch": 0.94, - "grad_norm": 18.30267333984375, - "learning_rate": 1.3709531923423875e-05, - "loss": 2.1693, + "epoch": 0.39, + "grad_norm": 15.099756240844727, + "learning_rate": 1.73743881521148e-05, + "loss": 1.8025, "step": 3139 }, { - "epoch": 0.94, - "grad_norm": 16.636390686035156, - "learning_rate": 1.3707527312819487e-05, - "loss": 2.3369, + "epoch": 0.39, + "grad_norm": 31.67547607421875, + "learning_rate": 1.7373551437058113e-05, + "loss": 1.9846, "step": 3140 }, { - "epoch": 0.94, - "grad_norm": 13.153757095336914, - "learning_rate": 1.3705522702215095e-05, - "loss": 2.3091, + "epoch": 0.39, + "grad_norm": 7.465260028839111, + "learning_rate": 1.7372714722001423e-05, + "loss": 0.8328, "step": 3141 }, { - "epoch": 0.94, - "grad_norm": 30.390827178955078, - "learning_rate": 1.3703518091610706e-05, - "loss": 1.786, + "epoch": 0.39, + "grad_norm": 8.22437858581543, + "learning_rate": 1.7371878006944737e-05, + "loss": 0.545, "step": 3142 }, { - "epoch": 0.94, - "grad_norm": 41.70283889770508, - "learning_rate": 1.3701513481006317e-05, - "loss": 3.5995, + "epoch": 0.39, + "grad_norm": 20.757461547851562, + "learning_rate": 1.7371041291888047e-05, + "loss": 3.8331, "step": 3143 }, { - "epoch": 0.95, - "grad_norm": 32.86235809326172, - "learning_rate": 1.3699508870401926e-05, - "loss": 2.4132, + "epoch": 0.39, + "grad_norm": 23.26932716369629, + "learning_rate": 1.737020457683136e-05, + "loss": 2.3774, "step": 3144 }, { - "epoch": 0.95, - "grad_norm": 14.464203834533691, - "learning_rate": 1.3697504259797536e-05, - "loss": 2.0061, + "epoch": 0.39, + "grad_norm": 11.285542488098145, + "learning_rate": 1.7369367861774675e-05, + "loss": 1.4571, "step": 3145 }, { - "epoch": 0.95, - "grad_norm": 11.750015258789062, - "learning_rate": 1.3695499649193144e-05, - "loss": 2.1382, + "epoch": 0.39, + "grad_norm": 38.73393630981445, + "learning_rate": 1.7368531146717985e-05, + "loss": 1.7928, "step": 3146 }, { - "epoch": 0.95, - "grad_norm": 13.602632522583008, - "learning_rate": 1.3693495038588756e-05, - "loss": 2.4135, + "epoch": 0.39, + "grad_norm": 20.771852493286133, + "learning_rate": 1.73676944316613e-05, + "loss": 1.804, "step": 3147 }, { - "epoch": 0.95, - "grad_norm": 15.727476119995117, - "learning_rate": 1.3691490427984364e-05, - "loss": 2.9052, + "epoch": 0.4, + "grad_norm": 24.225017547607422, + "learning_rate": 1.7366857716604612e-05, + "loss": 3.4957, "step": 3148 }, { - "epoch": 0.95, - "grad_norm": 17.43680763244629, - "learning_rate": 1.3689485817379974e-05, - "loss": 1.3355, + "epoch": 0.4, + "grad_norm": 7.701132774353027, + "learning_rate": 1.7366021001547922e-05, + "loss": 1.9459, "step": 3149 }, { - "epoch": 0.95, - "grad_norm": 14.329429626464844, - "learning_rate": 1.3687481206775586e-05, - "loss": 3.1391, + "epoch": 0.4, + "grad_norm": 14.99876594543457, + "learning_rate": 1.7365184286491236e-05, + "loss": 1.73, "step": 3150 }, { - "epoch": 0.95, - "grad_norm": 25.729965209960938, - "learning_rate": 1.3685476596171195e-05, - "loss": 1.6923, + "epoch": 0.4, + "grad_norm": 20.34157943725586, + "learning_rate": 1.736434757143455e-05, + "loss": 1.83, "step": 3151 }, { - "epoch": 0.95, - "grad_norm": 95.43783569335938, - "learning_rate": 1.3683471985566805e-05, - "loss": 2.0273, + "epoch": 0.4, + "grad_norm": 24.359678268432617, + "learning_rate": 1.736351085637786e-05, + "loss": 2.2814, "step": 3152 }, { - "epoch": 0.95, - "grad_norm": 17.75636100769043, - "learning_rate": 1.3681467374962413e-05, - "loss": 1.5341, + "epoch": 0.4, + "grad_norm": 8.147969245910645, + "learning_rate": 1.7362674141321174e-05, + "loss": 1.2628, "step": 3153 }, { - "epoch": 0.95, - "grad_norm": 19.18214988708496, - "learning_rate": 1.3679462764358025e-05, - "loss": 1.8777, + "epoch": 0.4, + "grad_norm": 15.603983879089355, + "learning_rate": 1.7361837426264487e-05, + "loss": 2.999, "step": 3154 }, { - "epoch": 0.95, - "grad_norm": 15.892659187316895, - "learning_rate": 1.3677458153753635e-05, - "loss": 1.9435, + "epoch": 0.4, + "grad_norm": 7.318797588348389, + "learning_rate": 1.73610007112078e-05, + "loss": 1.5075, "step": 3155 }, { - "epoch": 0.95, - "grad_norm": 20.043859481811523, - "learning_rate": 1.3675453543149243e-05, - "loss": 2.1202, + "epoch": 0.4, + "grad_norm": 27.904884338378906, + "learning_rate": 1.736016399615111e-05, + "loss": 2.882, "step": 3156 }, { - "epoch": 0.95, - "grad_norm": 13.109265327453613, - "learning_rate": 1.3673448932544855e-05, - "loss": 2.429, + "epoch": 0.4, + "grad_norm": 14.651124954223633, + "learning_rate": 1.7359327281094425e-05, + "loss": 1.8119, "step": 3157 }, { - "epoch": 0.95, - "grad_norm": 15.493414878845215, - "learning_rate": 1.3671444321940463e-05, - "loss": 1.768, + "epoch": 0.4, + "grad_norm": 15.951963424682617, + "learning_rate": 1.735849056603774e-05, + "loss": 1.1661, "step": 3158 }, { - "epoch": 0.95, - "grad_norm": 36.695735931396484, - "learning_rate": 1.3669439711336074e-05, - "loss": 2.0781, + "epoch": 0.4, + "grad_norm": 23.046314239501953, + "learning_rate": 1.735765385098105e-05, + "loss": 2.3287, "step": 3159 }, { - "epoch": 0.95, - "grad_norm": 14.086923599243164, - "learning_rate": 1.3667435100731684e-05, - "loss": 0.8197, + "epoch": 0.4, + "grad_norm": 18.682222366333008, + "learning_rate": 1.7356817135924362e-05, + "loss": 2.2169, "step": 3160 }, { - "epoch": 0.95, - "grad_norm": 13.209424018859863, - "learning_rate": 1.3665430490127294e-05, - "loss": 1.9733, + "epoch": 0.4, + "grad_norm": 11.814753532409668, + "learning_rate": 1.7355980420867676e-05, + "loss": 2.3565, "step": 3161 }, { - "epoch": 0.95, - "grad_norm": 17.077037811279297, - "learning_rate": 1.3663425879522905e-05, - "loss": 2.5923, + "epoch": 0.4, + "grad_norm": 17.43982696533203, + "learning_rate": 1.735514370581099e-05, + "loss": 1.3372, "step": 3162 }, { - "epoch": 0.95, - "grad_norm": 49.46474075317383, - "learning_rate": 1.3661421268918514e-05, - "loss": 2.0692, + "epoch": 0.4, + "grad_norm": 10.957281112670898, + "learning_rate": 1.73543069907543e-05, + "loss": 1.3389, "step": 3163 }, { - "epoch": 0.95, - "grad_norm": 14.314860343933105, - "learning_rate": 1.3659416658314124e-05, - "loss": 2.8229, + "epoch": 0.4, + "grad_norm": 6.849418640136719, + "learning_rate": 1.7353470275697614e-05, + "loss": 1.378, "step": 3164 }, { - "epoch": 0.95, - "grad_norm": 8.489893913269043, - "learning_rate": 1.3657412047709732e-05, - "loss": 1.3168, + "epoch": 0.4, + "grad_norm": 13.42451286315918, + "learning_rate": 1.7352633560640927e-05, + "loss": 2.8483, "step": 3165 }, { - "epoch": 0.95, - "grad_norm": 32.04702377319336, - "learning_rate": 1.3655407437105344e-05, - "loss": 1.6409, + "epoch": 0.4, + "grad_norm": 8.09776496887207, + "learning_rate": 1.7351796845584238e-05, + "loss": 0.6772, "step": 3166 }, { - "epoch": 0.95, - "grad_norm": 16.088096618652344, - "learning_rate": 1.3653402826500954e-05, - "loss": 1.1645, + "epoch": 0.4, + "grad_norm": 7.820061683654785, + "learning_rate": 1.735096013052755e-05, + "loss": 0.4904, "step": 3167 }, { - "epoch": 0.95, - "grad_norm": 12.829047203063965, - "learning_rate": 1.3651398215896563e-05, - "loss": 1.3659, + "epoch": 0.4, + "grad_norm": 15.628246307373047, + "learning_rate": 1.735012341547086e-05, + "loss": 2.5608, "step": 3168 }, { - "epoch": 0.95, - "grad_norm": 19.811403274536133, - "learning_rate": 1.3649393605292174e-05, - "loss": 2.6096, + "epoch": 0.4, + "grad_norm": 13.724005699157715, + "learning_rate": 1.7349286700414175e-05, + "loss": 2.8304, "step": 3169 }, { - "epoch": 0.95, - "grad_norm": 15.218836784362793, - "learning_rate": 1.3647388994687783e-05, - "loss": 1.4327, + "epoch": 0.4, + "grad_norm": 10.125463485717773, + "learning_rate": 1.734844998535749e-05, + "loss": 2.0583, "step": 3170 }, { - "epoch": 0.95, - "grad_norm": 11.305002212524414, - "learning_rate": 1.3645384384083393e-05, - "loss": 1.1141, + "epoch": 0.4, + "grad_norm": 9.629894256591797, + "learning_rate": 1.73476132703008e-05, + "loss": 1.896, "step": 3171 }, { - "epoch": 0.95, - "grad_norm": 12.872888565063477, - "learning_rate": 1.3643379773479001e-05, - "loss": 1.6382, + "epoch": 0.4, + "grad_norm": 7.258272647857666, + "learning_rate": 1.7346776555244113e-05, + "loss": 0.7141, "step": 3172 }, { - "epoch": 0.95, - "grad_norm": 18.41777801513672, - "learning_rate": 1.3641375162874613e-05, - "loss": 2.1711, + "epoch": 0.4, + "grad_norm": 15.432266235351562, + "learning_rate": 1.7345939840187426e-05, + "loss": 2.3333, "step": 3173 }, { - "epoch": 0.95, - "grad_norm": 14.97694206237793, - "learning_rate": 1.3639370552270223e-05, - "loss": 2.1335, + "epoch": 0.4, + "grad_norm": 17.209692001342773, + "learning_rate": 1.7345103125130737e-05, + "loss": 1.8321, "step": 3174 }, { - "epoch": 0.95, - "grad_norm": 17.031770706176758, - "learning_rate": 1.3637365941665832e-05, - "loss": 2.1181, + "epoch": 0.4, + "grad_norm": 28.579994201660156, + "learning_rate": 1.734426641007405e-05, + "loss": 2.4984, "step": 3175 }, { - "epoch": 0.95, - "grad_norm": 15.743291854858398, - "learning_rate": 1.3635361331061443e-05, - "loss": 2.4674, + "epoch": 0.4, + "grad_norm": 12.683948516845703, + "learning_rate": 1.7343429695017364e-05, + "loss": 0.7704, "step": 3176 }, { - "epoch": 0.96, - "grad_norm": 9.414546966552734, - "learning_rate": 1.3633356720457052e-05, - "loss": 2.1942, + "epoch": 0.4, + "grad_norm": 15.408038139343262, + "learning_rate": 1.7342592979960674e-05, + "loss": 3.2682, "step": 3177 }, { - "epoch": 0.96, - "grad_norm": 10.637990951538086, - "learning_rate": 1.3631352109852662e-05, - "loss": 1.5859, + "epoch": 0.4, + "grad_norm": 8.45233154296875, + "learning_rate": 1.7341756264903988e-05, + "loss": 1.3961, "step": 3178 }, { - "epoch": 0.96, - "grad_norm": 13.900732040405273, - "learning_rate": 1.3629347499248272e-05, - "loss": 1.6751, + "epoch": 0.4, + "grad_norm": 35.033164978027344, + "learning_rate": 1.73409195498473e-05, + "loss": 1.7532, "step": 3179 }, { - "epoch": 0.96, - "grad_norm": 20.666290283203125, - "learning_rate": 1.3627342888643882e-05, - "loss": 2.6414, + "epoch": 0.4, + "grad_norm": 23.8881893157959, + "learning_rate": 1.734008283479061e-05, + "loss": 2.8757, "step": 3180 }, { - "epoch": 0.96, - "grad_norm": 35.015647888183594, - "learning_rate": 1.3625338278039492e-05, - "loss": 2.7838, + "epoch": 0.4, + "grad_norm": 11.64614200592041, + "learning_rate": 1.7339246119733925e-05, + "loss": 2.8886, "step": 3181 }, { - "epoch": 0.96, - "grad_norm": 15.191834449768066, - "learning_rate": 1.3623333667435102e-05, - "loss": 1.831, + "epoch": 0.4, + "grad_norm": 23.38692283630371, + "learning_rate": 1.733840940467724e-05, + "loss": 4.9053, "step": 3182 }, { - "epoch": 0.96, - "grad_norm": 34.37846755981445, - "learning_rate": 1.3621329056830712e-05, - "loss": 2.7531, + "epoch": 0.4, + "grad_norm": 9.945205688476562, + "learning_rate": 1.7337572689620553e-05, + "loss": 0.6558, "step": 3183 }, { - "epoch": 0.96, - "grad_norm": 12.335997581481934, - "learning_rate": 1.361932444622632e-05, - "loss": 1.0529, + "epoch": 0.4, + "grad_norm": 8.835298538208008, + "learning_rate": 1.7336735974563863e-05, + "loss": 2.348, "step": 3184 }, { - "epoch": 0.96, - "grad_norm": 25.120954513549805, - "learning_rate": 1.3617319835621932e-05, - "loss": 2.2635, + "epoch": 0.4, + "grad_norm": 15.17750072479248, + "learning_rate": 1.7335899259507177e-05, + "loss": 1.2624, "step": 3185 }, { - "epoch": 0.96, - "grad_norm": 11.803996086120605, - "learning_rate": 1.3615315225017542e-05, - "loss": 1.8602, + "epoch": 0.4, + "grad_norm": 14.124608993530273, + "learning_rate": 1.733506254445049e-05, + "loss": 1.0089, "step": 3186 }, { - "epoch": 0.96, - "grad_norm": 14.749687194824219, - "learning_rate": 1.361331061441315e-05, - "loss": 1.0415, + "epoch": 0.4, + "grad_norm": 14.128517150878906, + "learning_rate": 1.73342258293938e-05, + "loss": 3.2298, "step": 3187 }, { - "epoch": 0.96, - "grad_norm": 15.90971851348877, - "learning_rate": 1.3611306003808763e-05, - "loss": 1.9429, + "epoch": 0.4, + "grad_norm": 10.97527027130127, + "learning_rate": 1.7333389114337114e-05, + "loss": 3.3839, "step": 3188 }, { - "epoch": 0.96, - "grad_norm": 42.85434341430664, - "learning_rate": 1.3609301393204371e-05, - "loss": 2.2651, + "epoch": 0.4, + "grad_norm": 8.833992004394531, + "learning_rate": 1.7332552399280428e-05, + "loss": 1.568, "step": 3189 }, { - "epoch": 0.96, - "grad_norm": 22.296979904174805, - "learning_rate": 1.3607296782599981e-05, - "loss": 2.2514, + "epoch": 0.4, + "grad_norm": 12.724773406982422, + "learning_rate": 1.733171568422374e-05, + "loss": 2.8502, "step": 3190 }, { - "epoch": 0.96, - "grad_norm": 16.979684829711914, - "learning_rate": 1.360529217199559e-05, - "loss": 1.9144, + "epoch": 0.4, + "grad_norm": 14.78669548034668, + "learning_rate": 1.733087896916705e-05, + "loss": 3.0281, "step": 3191 }, { - "epoch": 0.96, - "grad_norm": 51.25554656982422, - "learning_rate": 1.3603287561391201e-05, - "loss": 1.4266, + "epoch": 0.4, + "grad_norm": 8.72022819519043, + "learning_rate": 1.7330042254110365e-05, + "loss": 3.1723, "step": 3192 }, { - "epoch": 0.96, - "grad_norm": 32.474002838134766, - "learning_rate": 1.3601282950786811e-05, - "loss": 2.3617, + "epoch": 0.4, + "grad_norm": 12.51976490020752, + "learning_rate": 1.732920553905368e-05, + "loss": 1.8388, "step": 3193 }, { - "epoch": 0.96, - "grad_norm": 12.489779472351074, - "learning_rate": 1.359927834018242e-05, - "loss": 1.4284, + "epoch": 0.4, + "grad_norm": 27.790912628173828, + "learning_rate": 1.732836882399699e-05, + "loss": 2.4745, "step": 3194 }, { - "epoch": 0.96, - "grad_norm": 14.795775413513184, - "learning_rate": 1.3597273729578032e-05, - "loss": 0.9304, + "epoch": 0.4, + "grad_norm": 12.896369934082031, + "learning_rate": 1.7327532108940303e-05, + "loss": 2.2489, "step": 3195 }, { - "epoch": 0.96, - "grad_norm": 35.45854568481445, - "learning_rate": 1.359526911897364e-05, - "loss": 3.7488, + "epoch": 0.4, + "grad_norm": 23.709392547607422, + "learning_rate": 1.7326695393883613e-05, + "loss": 2.3466, "step": 3196 }, { - "epoch": 0.96, - "grad_norm": 6.524545669555664, - "learning_rate": 1.359326450836925e-05, - "loss": 1.08, + "epoch": 0.4, + "grad_norm": 13.660783767700195, + "learning_rate": 1.7325858678826927e-05, + "loss": 1.4347, "step": 3197 }, { - "epoch": 0.96, - "grad_norm": 13.13879680633545, - "learning_rate": 1.3591259897764862e-05, - "loss": 2.1342, + "epoch": 0.4, + "grad_norm": 10.898387908935547, + "learning_rate": 1.732502196377024e-05, + "loss": 2.9651, "step": 3198 }, { - "epoch": 0.96, - "grad_norm": 8.252395629882812, - "learning_rate": 1.358925528716047e-05, - "loss": 1.6381, + "epoch": 0.4, + "grad_norm": 15.468527793884277, + "learning_rate": 1.732418524871355e-05, + "loss": 1.7693, "step": 3199 }, { - "epoch": 0.96, - "grad_norm": 12.134532928466797, - "learning_rate": 1.358725067655608e-05, - "loss": 1.7298, + "epoch": 0.4, + "grad_norm": 12.056096076965332, + "learning_rate": 1.7323348533656864e-05, + "loss": 3.39, "step": 3200 }, { - "epoch": 0.96, - "grad_norm": 17.264564514160156, - "learning_rate": 1.3585246065951689e-05, - "loss": 1.6213, + "epoch": 0.4, + "eval_loss": 0.15101495385169983, + "eval_runtime": 94.045, + "eval_samples_per_second": 37.663, + "eval_steps_per_second": 37.663, + "step": 3200 + }, + { + "epoch": 0.4, + "grad_norm": 12.105559349060059, + "learning_rate": 1.7322511818600175e-05, + "loss": 3.396, "step": 3201 }, { - "epoch": 0.96, - "grad_norm": 17.299379348754883, - "learning_rate": 1.35832414553473e-05, - "loss": 1.9645, + "epoch": 0.4, + "grad_norm": 7.145201683044434, + "learning_rate": 1.7321675103543488e-05, + "loss": 1.2291, "step": 3202 }, { - "epoch": 0.96, - "grad_norm": 9.386181831359863, - "learning_rate": 1.3581236844742909e-05, - "loss": 1.0434, + "epoch": 0.4, + "grad_norm": 10.953343391418457, + "learning_rate": 1.7320838388486802e-05, + "loss": 1.6902, "step": 3203 }, { - "epoch": 0.96, - "grad_norm": 10.785475730895996, - "learning_rate": 1.3579232234138519e-05, - "loss": 1.387, + "epoch": 0.4, + "grad_norm": 15.755420684814453, + "learning_rate": 1.7320001673430116e-05, + "loss": 2.4279, "step": 3204 }, { - "epoch": 0.96, - "grad_norm": 18.141578674316406, - "learning_rate": 1.357722762353413e-05, - "loss": 1.816, + "epoch": 0.4, + "grad_norm": 14.094854354858398, + "learning_rate": 1.7319164958373426e-05, + "loss": 2.5785, "step": 3205 }, { - "epoch": 0.96, - "grad_norm": 10.100055694580078, - "learning_rate": 1.3575223012929739e-05, - "loss": 2.0753, + "epoch": 0.4, + "grad_norm": 14.587738990783691, + "learning_rate": 1.731832824331674e-05, + "loss": 1.3048, "step": 3206 }, { - "epoch": 0.96, - "grad_norm": 13.844127655029297, - "learning_rate": 1.3573218402325349e-05, - "loss": 1.4975, + "epoch": 0.4, + "grad_norm": 23.59645652770996, + "learning_rate": 1.7317491528260053e-05, + "loss": 2.2371, "step": 3207 }, { - "epoch": 0.96, - "grad_norm": 13.211896896362305, - "learning_rate": 1.357121379172096e-05, - "loss": 2.011, + "epoch": 0.4, + "grad_norm": 15.983088493347168, + "learning_rate": 1.7316654813203363e-05, + "loss": 1.6247, "step": 3208 }, { - "epoch": 0.96, - "grad_norm": 16.434125900268555, - "learning_rate": 1.356920918111657e-05, - "loss": 1.3007, + "epoch": 0.4, + "grad_norm": 9.358094215393066, + "learning_rate": 1.7315818098146677e-05, + "loss": 2.544, "step": 3209 }, { - "epoch": 0.97, - "grad_norm": 25.20293426513672, - "learning_rate": 1.356720457051218e-05, - "loss": 1.4808, + "epoch": 0.4, + "grad_norm": 18.30937957763672, + "learning_rate": 1.731498138308999e-05, + "loss": 2.3604, "step": 3210 }, { - "epoch": 0.97, - "grad_norm": 17.422204971313477, - "learning_rate": 1.356519995990779e-05, - "loss": 2.0575, + "epoch": 0.4, + "grad_norm": 9.7167329788208, + "learning_rate": 1.7314144668033304e-05, + "loss": 2.4223, "step": 3211 }, { - "epoch": 0.97, - "grad_norm": 22.5146484375, - "learning_rate": 1.35631953493034e-05, - "loss": 2.0943, + "epoch": 0.4, + "grad_norm": 21.414541244506836, + "learning_rate": 1.7313307952976615e-05, + "loss": 2.3931, "step": 3212 }, { - "epoch": 0.97, - "grad_norm": 46.014678955078125, - "learning_rate": 1.3561190738699008e-05, - "loss": 2.7204, + "epoch": 0.4, + "grad_norm": 12.109614372253418, + "learning_rate": 1.7312471237919928e-05, + "loss": 0.2461, "step": 3213 }, { - "epoch": 0.97, - "grad_norm": 16.03790283203125, - "learning_rate": 1.355918612809462e-05, - "loss": 2.5307, + "epoch": 0.4, + "grad_norm": 22.76210594177246, + "learning_rate": 1.7311634522863242e-05, + "loss": 3.2436, "step": 3214 }, { - "epoch": 0.97, - "grad_norm": 17.003280639648438, - "learning_rate": 1.3557181517490228e-05, - "loss": 2.766, + "epoch": 0.4, + "grad_norm": 7.287879467010498, + "learning_rate": 1.7310797807806552e-05, + "loss": 0.9333, "step": 3215 }, { - "epoch": 0.97, - "grad_norm": 21.383115768432617, - "learning_rate": 1.3555176906885838e-05, - "loss": 1.8888, + "epoch": 0.4, + "grad_norm": 10.937824249267578, + "learning_rate": 1.7309961092749866e-05, + "loss": 2.7631, "step": 3216 }, { - "epoch": 0.97, - "grad_norm": 29.028945922851562, - "learning_rate": 1.355317229628145e-05, - "loss": 1.2596, + "epoch": 0.4, + "grad_norm": 18.63129425048828, + "learning_rate": 1.730912437769318e-05, + "loss": 3.2587, "step": 3217 }, { - "epoch": 0.97, - "grad_norm": 14.80164623260498, - "learning_rate": 1.3551167685677058e-05, - "loss": 2.1869, + "epoch": 0.4, + "grad_norm": 12.023300170898438, + "learning_rate": 1.7308287662636493e-05, + "loss": 1.2662, "step": 3218 }, { - "epoch": 0.97, - "grad_norm": 35.633094787597656, - "learning_rate": 1.3549163075072668e-05, - "loss": 3.2196, + "epoch": 0.4, + "grad_norm": 8.753905296325684, + "learning_rate": 1.7307450947579803e-05, + "loss": 1.474, "step": 3219 }, { - "epoch": 0.97, - "grad_norm": 17.463098526000977, - "learning_rate": 1.3547158464468277e-05, - "loss": 2.077, + "epoch": 0.4, + "grad_norm": 26.231277465820312, + "learning_rate": 1.7306614232523117e-05, + "loss": 2.2326, "step": 3220 }, { - "epoch": 0.97, - "grad_norm": 15.604183197021484, - "learning_rate": 1.3545153853863889e-05, - "loss": 1.6204, + "epoch": 0.4, + "grad_norm": 19.625652313232422, + "learning_rate": 1.7305777517466427e-05, + "loss": 2.0372, "step": 3221 }, { - "epoch": 0.97, - "grad_norm": 28.1143798828125, - "learning_rate": 1.3543149243259497e-05, - "loss": 2.9363, + "epoch": 0.4, + "grad_norm": 23.23263931274414, + "learning_rate": 1.730494080240974e-05, + "loss": 3.344, "step": 3222 }, { - "epoch": 0.97, - "grad_norm": 18.119604110717773, - "learning_rate": 1.3541144632655107e-05, - "loss": 1.3793, + "epoch": 0.4, + "grad_norm": 14.629230499267578, + "learning_rate": 1.7304104087353055e-05, + "loss": 2.0983, "step": 3223 }, { - "epoch": 0.97, - "grad_norm": 16.877845764160156, - "learning_rate": 1.3539140022050719e-05, - "loss": 2.0204, + "epoch": 0.4, + "grad_norm": 16.69709014892578, + "learning_rate": 1.7303267372296365e-05, + "loss": 1.9195, "step": 3224 }, { - "epoch": 0.97, - "grad_norm": 20.121807098388672, - "learning_rate": 1.3537135411446327e-05, - "loss": 2.4456, + "epoch": 0.4, + "grad_norm": 14.589938163757324, + "learning_rate": 1.730243065723968e-05, + "loss": 2.1431, "step": 3225 }, { - "epoch": 0.97, - "grad_norm": 15.562324523925781, - "learning_rate": 1.3535130800841937e-05, - "loss": 1.8663, + "epoch": 0.4, + "grad_norm": 25.879173278808594, + "learning_rate": 1.7301593942182992e-05, + "loss": 3.4365, "step": 3226 }, { - "epoch": 0.97, - "grad_norm": 16.265233993530273, - "learning_rate": 1.3533126190237547e-05, - "loss": 1.7686, + "epoch": 0.4, + "grad_norm": 16.38176727294922, + "learning_rate": 1.7300757227126302e-05, + "loss": 2.1582, "step": 3227 }, { - "epoch": 0.97, - "grad_norm": 24.303163528442383, - "learning_rate": 1.3531121579633158e-05, - "loss": 2.5792, + "epoch": 0.41, + "grad_norm": 8.85407829284668, + "learning_rate": 1.7299920512069616e-05, + "loss": 1.8366, "step": 3228 }, { - "epoch": 0.97, - "grad_norm": 19.46893310546875, - "learning_rate": 1.3529116969028768e-05, - "loss": 1.8132, + "epoch": 0.41, + "grad_norm": 21.186267852783203, + "learning_rate": 1.7299083797012926e-05, + "loss": 0.9108, "step": 3229 }, { - "epoch": 0.97, - "grad_norm": 39.980438232421875, - "learning_rate": 1.3527112358424378e-05, - "loss": 2.3918, + "epoch": 0.41, + "grad_norm": 38.015869140625, + "learning_rate": 1.729824708195624e-05, + "loss": 3.4078, "step": 3230 }, { - "epoch": 0.97, - "grad_norm": 18.492206573486328, - "learning_rate": 1.3525107747819988e-05, - "loss": 1.6302, + "epoch": 0.41, + "grad_norm": 10.098426818847656, + "learning_rate": 1.7297410366899554e-05, + "loss": 1.6497, "step": 3231 }, { - "epoch": 0.97, - "grad_norm": 11.828949928283691, - "learning_rate": 1.3523103137215596e-05, - "loss": 1.7631, + "epoch": 0.41, + "grad_norm": 5.503562927246094, + "learning_rate": 1.7296573651842867e-05, + "loss": 0.1918, "step": 3232 }, { - "epoch": 0.97, - "grad_norm": 13.994562149047852, - "learning_rate": 1.3521098526611208e-05, - "loss": 1.9207, + "epoch": 0.41, + "grad_norm": 16.33560562133789, + "learning_rate": 1.7295736936786177e-05, + "loss": 1.874, "step": 3233 }, { - "epoch": 0.97, - "grad_norm": 22.90715789794922, - "learning_rate": 1.3519093916006816e-05, - "loss": 3.6763, + "epoch": 0.41, + "grad_norm": 12.88305377960205, + "learning_rate": 1.729490022172949e-05, + "loss": 2.6811, "step": 3234 }, { - "epoch": 0.97, - "grad_norm": 45.03823471069336, - "learning_rate": 1.3517089305402426e-05, - "loss": 2.1425, + "epoch": 0.41, + "grad_norm": 14.501304626464844, + "learning_rate": 1.7294063506672805e-05, + "loss": 0.6626, "step": 3235 }, { - "epoch": 0.97, - "grad_norm": 12.193424224853516, - "learning_rate": 1.3515084694798038e-05, - "loss": 1.4887, + "epoch": 0.41, + "grad_norm": 23.03480339050293, + "learning_rate": 1.7293226791616115e-05, + "loss": 2.6398, "step": 3236 }, { - "epoch": 0.97, - "grad_norm": 21.530868530273438, - "learning_rate": 1.3513080084193647e-05, - "loss": 2.9298, + "epoch": 0.41, + "grad_norm": 20.49584197998047, + "learning_rate": 1.729239007655943e-05, + "loss": 1.5989, "step": 3237 }, { - "epoch": 0.97, - "grad_norm": 30.308269500732422, - "learning_rate": 1.3511075473589257e-05, - "loss": 2.4928, + "epoch": 0.41, + "grad_norm": 38.74869918823242, + "learning_rate": 1.7291553361502742e-05, + "loss": 2.7613, "step": 3238 }, { - "epoch": 0.97, - "grad_norm": 11.13912582397461, - "learning_rate": 1.3509070862984865e-05, - "loss": 1.7033, + "epoch": 0.41, + "grad_norm": 19.7424259185791, + "learning_rate": 1.7290716646446056e-05, + "loss": 3.1999, "step": 3239 }, { - "epoch": 0.97, - "grad_norm": 11.577181816101074, - "learning_rate": 1.3507066252380477e-05, - "loss": 1.4173, - "step": 3240 - }, - { - "epoch": 0.97, - "eval_loss": 0.22535113990306854, - "eval_runtime": 43.4738, - "eval_samples_per_second": 34.021, - "eval_steps_per_second": 34.021, + "epoch": 0.41, + "grad_norm": 13.532513618469238, + "learning_rate": 1.7289879931389366e-05, + "loss": 2.4183, "step": 3240 }, { - "epoch": 0.97, - "grad_norm": 27.40611457824707, - "learning_rate": 1.3505061641776087e-05, - "loss": 1.9135, + "epoch": 0.41, + "grad_norm": 19.050241470336914, + "learning_rate": 1.728904321633268e-05, + "loss": 3.016, "step": 3241 }, { - "epoch": 0.97, - "grad_norm": 19.149837493896484, - "learning_rate": 1.3503057031171695e-05, - "loss": 1.7921, + "epoch": 0.41, + "grad_norm": 13.003695487976074, + "learning_rate": 1.7288206501275994e-05, + "loss": 2.5176, "step": 3242 }, { - "epoch": 0.98, - "grad_norm": 15.974568367004395, - "learning_rate": 1.3501052420567307e-05, - "loss": 2.5266, + "epoch": 0.41, + "grad_norm": 7.656522750854492, + "learning_rate": 1.7287369786219304e-05, + "loss": 2.0928, "step": 3243 }, { - "epoch": 0.98, - "grad_norm": 19.187986373901367, - "learning_rate": 1.3499047809962915e-05, - "loss": 1.9754, + "epoch": 0.41, + "grad_norm": 39.75627517700195, + "learning_rate": 1.7286533071162617e-05, + "loss": 1.4883, "step": 3244 }, { - "epoch": 0.98, - "grad_norm": 10.218606948852539, - "learning_rate": 1.3497043199358526e-05, - "loss": 0.9435, + "epoch": 0.41, + "grad_norm": 11.915572166442871, + "learning_rate": 1.728569635610593e-05, + "loss": 1.5882, "step": 3245 }, { - "epoch": 0.98, - "grad_norm": 16.653217315673828, - "learning_rate": 1.3495038588754134e-05, - "loss": 1.8804, + "epoch": 0.41, + "grad_norm": 11.25812816619873, + "learning_rate": 1.7284859641049245e-05, + "loss": 1.5589, "step": 3246 }, { - "epoch": 0.98, - "grad_norm": 19.304424285888672, - "learning_rate": 1.3493033978149746e-05, - "loss": 2.038, + "epoch": 0.41, + "grad_norm": 12.67131519317627, + "learning_rate": 1.7284022925992555e-05, + "loss": 3.2036, "step": 3247 }, { - "epoch": 0.98, - "grad_norm": 17.04471206665039, - "learning_rate": 1.3491029367545356e-05, - "loss": 1.9324, + "epoch": 0.41, + "grad_norm": 22.16834259033203, + "learning_rate": 1.728318621093587e-05, + "loss": 3.1681, "step": 3248 }, { - "epoch": 0.98, - "grad_norm": 16.119550704956055, - "learning_rate": 1.3489024756940964e-05, - "loss": 2.1206, + "epoch": 0.41, + "grad_norm": 18.441844940185547, + "learning_rate": 1.728234949587918e-05, + "loss": 4.0314, "step": 3249 }, { - "epoch": 0.98, - "grad_norm": 16.321861267089844, - "learning_rate": 1.3487020146336576e-05, - "loss": 2.3402, + "epoch": 0.41, + "grad_norm": 8.221848487854004, + "learning_rate": 1.7281512780822493e-05, + "loss": 1.0007, "step": 3250 }, { - "epoch": 0.98, - "grad_norm": 30.2368221282959, - "learning_rate": 1.3485015535732184e-05, - "loss": 1.683, + "epoch": 0.41, + "grad_norm": 13.110788345336914, + "learning_rate": 1.7280676065765806e-05, + "loss": 1.006, "step": 3251 }, { - "epoch": 0.98, - "grad_norm": 18.461002349853516, - "learning_rate": 1.3483010925127794e-05, - "loss": 2.1962, + "epoch": 0.41, + "grad_norm": 21.545730590820312, + "learning_rate": 1.7279839350709116e-05, + "loss": 2.5467, "step": 3252 }, { - "epoch": 0.98, - "grad_norm": 20.721721649169922, - "learning_rate": 1.3481006314523406e-05, - "loss": 2.1013, + "epoch": 0.41, + "grad_norm": 11.939325332641602, + "learning_rate": 1.727900263565243e-05, + "loss": 0.92, "step": 3253 }, { - "epoch": 0.98, - "grad_norm": 31.358562469482422, - "learning_rate": 1.3479001703919015e-05, - "loss": 2.7242, + "epoch": 0.41, + "grad_norm": 8.76098918914795, + "learning_rate": 1.727816592059574e-05, + "loss": 3.1085, "step": 3254 }, { - "epoch": 0.98, - "grad_norm": 20.12774085998535, - "learning_rate": 1.3476997093314625e-05, - "loss": 1.5558, + "epoch": 0.41, + "grad_norm": 19.767866134643555, + "learning_rate": 1.7277329205539054e-05, + "loss": 2.4461, "step": 3255 }, { - "epoch": 0.98, - "grad_norm": 28.31780242919922, - "learning_rate": 1.3474992482710235e-05, - "loss": 2.1186, + "epoch": 0.41, + "grad_norm": 12.018153190612793, + "learning_rate": 1.7276492490482368e-05, + "loss": 1.4399, "step": 3256 }, { - "epoch": 0.98, - "grad_norm": 11.361303329467773, - "learning_rate": 1.3472987872105845e-05, - "loss": 1.4945, + "epoch": 0.41, + "grad_norm": 5.61497163772583, + "learning_rate": 1.7275655775425678e-05, + "loss": 0.6077, "step": 3257 }, { - "epoch": 0.98, - "grad_norm": 112.40892028808594, - "learning_rate": 1.3470983261501453e-05, - "loss": 1.87, + "epoch": 0.41, + "grad_norm": 20.77367401123047, + "learning_rate": 1.727481906036899e-05, + "loss": 2.672, "step": 3258 }, { - "epoch": 0.98, - "grad_norm": 48.792320251464844, - "learning_rate": 1.3468978650897065e-05, - "loss": 2.829, + "epoch": 0.41, + "grad_norm": 21.33584976196289, + "learning_rate": 1.7273982345312305e-05, + "loss": 0.8965, "step": 3259 }, { - "epoch": 0.98, - "grad_norm": 15.994609832763672, - "learning_rate": 1.3466974040292675e-05, - "loss": 2.33, + "epoch": 0.41, + "grad_norm": 11.513010025024414, + "learning_rate": 1.727314563025562e-05, + "loss": 1.8187, "step": 3260 }, { - "epoch": 0.98, - "grad_norm": 14.680255889892578, - "learning_rate": 1.3464969429688284e-05, - "loss": 1.5227, + "epoch": 0.41, + "grad_norm": 19.68485450744629, + "learning_rate": 1.727230891519893e-05, + "loss": 1.6764, "step": 3261 }, { - "epoch": 0.98, - "grad_norm": 20.413484573364258, - "learning_rate": 1.3462964819083895e-05, - "loss": 1.3934, + "epoch": 0.41, + "grad_norm": 4.071218013763428, + "learning_rate": 1.7271472200142243e-05, + "loss": 0.5521, "step": 3262 }, { - "epoch": 0.98, - "grad_norm": 10.785961151123047, - "learning_rate": 1.3460960208479504e-05, - "loss": 1.6991, + "epoch": 0.41, + "grad_norm": 11.459487915039062, + "learning_rate": 1.7270635485085556e-05, + "loss": 2.8706, "step": 3263 }, { - "epoch": 0.98, - "grad_norm": 23.43621063232422, - "learning_rate": 1.3458955597875114e-05, - "loss": 2.1189, + "epoch": 0.41, + "grad_norm": 16.798728942871094, + "learning_rate": 1.7269798770028867e-05, + "loss": 2.188, "step": 3264 }, { - "epoch": 0.98, - "grad_norm": 18.55811882019043, - "learning_rate": 1.3456950987270722e-05, - "loss": 1.6837, + "epoch": 0.41, + "grad_norm": 38.427467346191406, + "learning_rate": 1.726896205497218e-05, + "loss": 3.6072, "step": 3265 }, { - "epoch": 0.98, - "grad_norm": 25.324520111083984, - "learning_rate": 1.3454946376666334e-05, - "loss": 2.5444, + "epoch": 0.41, + "grad_norm": 11.476984977722168, + "learning_rate": 1.7268125339915494e-05, + "loss": 1.1716, "step": 3266 }, { - "epoch": 0.98, - "grad_norm": 29.4528751373291, - "learning_rate": 1.3452941766061944e-05, - "loss": 1.7111, + "epoch": 0.41, + "grad_norm": 10.207488059997559, + "learning_rate": 1.7267288624858808e-05, + "loss": 1.4146, "step": 3267 }, { - "epoch": 0.98, - "grad_norm": 10.598390579223633, - "learning_rate": 1.3450937155457552e-05, - "loss": 1.1832, + "epoch": 0.41, + "grad_norm": 8.6456298828125, + "learning_rate": 1.7266451909802118e-05, + "loss": 2.5119, "step": 3268 }, { - "epoch": 0.98, - "grad_norm": 16.23030662536621, - "learning_rate": 1.3448932544853164e-05, - "loss": 1.845, + "epoch": 0.41, + "grad_norm": 9.207403182983398, + "learning_rate": 1.726561519474543e-05, + "loss": 1.3832, "step": 3269 }, { - "epoch": 0.98, - "grad_norm": 16.992780685424805, - "learning_rate": 1.3446927934248773e-05, - "loss": 1.5488, + "epoch": 0.41, + "grad_norm": 12.22926139831543, + "learning_rate": 1.7264778479688745e-05, + "loss": 2.1751, "step": 3270 }, { - "epoch": 0.98, - "grad_norm": 16.57977294921875, - "learning_rate": 1.3444923323644383e-05, - "loss": 2.4538, + "epoch": 0.41, + "grad_norm": 15.51880931854248, + "learning_rate": 1.7263941764632055e-05, + "loss": 3.7102, "step": 3271 }, { - "epoch": 0.98, - "grad_norm": 17.394916534423828, - "learning_rate": 1.3442918713039994e-05, - "loss": 2.4952, + "epoch": 0.41, + "grad_norm": 15.530855178833008, + "learning_rate": 1.726310504957537e-05, + "loss": 1.5472, "step": 3272 }, { - "epoch": 0.98, - "grad_norm": 10.907772064208984, - "learning_rate": 1.3440914102435603e-05, - "loss": 2.2693, + "epoch": 0.41, + "grad_norm": 9.8801908493042, + "learning_rate": 1.7262268334518683e-05, + "loss": 0.8076, "step": 3273 }, { - "epoch": 0.98, - "grad_norm": 15.381023406982422, - "learning_rate": 1.3438909491831213e-05, - "loss": 1.9541, + "epoch": 0.41, + "grad_norm": 16.780431747436523, + "learning_rate": 1.7261431619461993e-05, + "loss": 2.3568, "step": 3274 }, { - "epoch": 0.98, - "grad_norm": 13.303698539733887, - "learning_rate": 1.3436904881226821e-05, - "loss": 1.7353, + "epoch": 0.41, + "grad_norm": 22.34392547607422, + "learning_rate": 1.7260594904405307e-05, + "loss": 2.1313, "step": 3275 }, { - "epoch": 0.98, - "grad_norm": 22.471403121948242, - "learning_rate": 1.3434900270622433e-05, - "loss": 1.8707, + "epoch": 0.41, + "grad_norm": 10.950980186462402, + "learning_rate": 1.725975818934862e-05, + "loss": 1.5318, "step": 3276 }, { - "epoch": 0.99, - "grad_norm": 12.483773231506348, - "learning_rate": 1.3432895660018041e-05, - "loss": 1.862, + "epoch": 0.41, + "grad_norm": 15.691773414611816, + "learning_rate": 1.725892147429193e-05, + "loss": 4.148, "step": 3277 }, { - "epoch": 0.99, - "grad_norm": 24.261032104492188, - "learning_rate": 1.3430891049413652e-05, - "loss": 3.0355, + "epoch": 0.41, + "grad_norm": 19.82890510559082, + "learning_rate": 1.7258084759235244e-05, + "loss": 2.2056, "step": 3278 }, { - "epoch": 0.99, - "grad_norm": 27.064979553222656, - "learning_rate": 1.3428886438809263e-05, - "loss": 2.6927, + "epoch": 0.41, + "grad_norm": 13.338221549987793, + "learning_rate": 1.7257248044178554e-05, + "loss": 1.574, "step": 3279 }, { - "epoch": 0.99, - "grad_norm": 23.447118759155273, - "learning_rate": 1.3426881828204872e-05, - "loss": 2.2596, + "epoch": 0.41, + "grad_norm": 12.845236778259277, + "learning_rate": 1.7256411329121868e-05, + "loss": 1.8639, "step": 3280 }, { - "epoch": 0.99, - "grad_norm": 19.78642463684082, - "learning_rate": 1.3424877217600484e-05, - "loss": 2.1217, + "epoch": 0.41, + "grad_norm": 21.712717056274414, + "learning_rate": 1.7255574614065182e-05, + "loss": 1.6217, "step": 3281 }, { - "epoch": 0.99, - "grad_norm": 11.758055686950684, - "learning_rate": 1.3422872606996092e-05, - "loss": 1.0749, + "epoch": 0.41, + "grad_norm": 7.393682956695557, + "learning_rate": 1.7254737899008492e-05, + "loss": 0.6024, "step": 3282 }, { - "epoch": 0.99, - "grad_norm": 15.738198280334473, - "learning_rate": 1.3420867996391702e-05, - "loss": 2.25, + "epoch": 0.41, + "grad_norm": 8.766348838806152, + "learning_rate": 1.7253901183951806e-05, + "loss": 0.8492, "step": 3283 }, { - "epoch": 0.99, - "grad_norm": 21.461877822875977, - "learning_rate": 1.3418863385787314e-05, - "loss": 2.1378, + "epoch": 0.41, + "grad_norm": 17.923397064208984, + "learning_rate": 1.725306446889512e-05, + "loss": 3.0523, "step": 3284 }, { - "epoch": 0.99, - "grad_norm": 39.54304504394531, - "learning_rate": 1.3416858775182922e-05, - "loss": 3.1118, + "epoch": 0.41, + "grad_norm": 17.999114990234375, + "learning_rate": 1.725222775383843e-05, + "loss": 2.9979, "step": 3285 }, { - "epoch": 0.99, - "grad_norm": 14.904977798461914, - "learning_rate": 1.3414854164578532e-05, - "loss": 1.2001, + "epoch": 0.41, + "grad_norm": 7.008960247039795, + "learning_rate": 1.7251391038781743e-05, + "loss": 1.8619, "step": 3286 }, { - "epoch": 0.99, - "grad_norm": 19.5516414642334, - "learning_rate": 1.341284955397414e-05, - "loss": 2.2941, + "epoch": 0.41, + "grad_norm": 22.732624053955078, + "learning_rate": 1.7250554323725057e-05, + "loss": 1.7138, "step": 3287 }, { - "epoch": 0.99, - "grad_norm": 18.030540466308594, - "learning_rate": 1.3410844943369752e-05, - "loss": 2.1409, + "epoch": 0.41, + "grad_norm": 15.985183715820312, + "learning_rate": 1.724971760866837e-05, + "loss": 2.6133, "step": 3288 }, { - "epoch": 0.99, - "grad_norm": 17.5920352935791, - "learning_rate": 1.340884033276536e-05, - "loss": 3.0416, + "epoch": 0.41, + "grad_norm": 8.475821495056152, + "learning_rate": 1.724888089361168e-05, + "loss": 1.7767, "step": 3289 }, { - "epoch": 0.99, - "grad_norm": 18.530750274658203, - "learning_rate": 1.3406835722160971e-05, - "loss": 1.1762, + "epoch": 0.41, + "grad_norm": 15.1087064743042, + "learning_rate": 1.7248044178554994e-05, + "loss": 1.368, "step": 3290 }, { - "epoch": 0.99, - "grad_norm": 15.765362739562988, - "learning_rate": 1.3404831111556583e-05, - "loss": 2.0459, + "epoch": 0.41, + "grad_norm": 13.120559692382812, + "learning_rate": 1.7247207463498308e-05, + "loss": 2.2243, "step": 3291 }, { - "epoch": 0.99, - "grad_norm": 27.75279998779297, - "learning_rate": 1.3402826500952191e-05, - "loss": 2.1783, + "epoch": 0.41, + "grad_norm": 12.287973403930664, + "learning_rate": 1.724637074844162e-05, + "loss": 2.1131, "step": 3292 }, { - "epoch": 0.99, - "grad_norm": 16.64308738708496, - "learning_rate": 1.3400821890347801e-05, - "loss": 2.4156, + "epoch": 0.41, + "grad_norm": 17.94985580444336, + "learning_rate": 1.7245534033384932e-05, + "loss": 3.8318, "step": 3293 }, { - "epoch": 0.99, - "grad_norm": 19.890602111816406, - "learning_rate": 1.339881727974341e-05, - "loss": 1.0157, + "epoch": 0.41, + "grad_norm": 22.18570327758789, + "learning_rate": 1.7244697318328246e-05, + "loss": 1.9355, "step": 3294 }, { - "epoch": 0.99, - "grad_norm": 15.657976150512695, - "learning_rate": 1.3396812669139021e-05, - "loss": 1.3915, + "epoch": 0.41, + "grad_norm": 10.512239456176758, + "learning_rate": 1.724386060327156e-05, + "loss": 1.9186, "step": 3295 }, { - "epoch": 0.99, - "grad_norm": 14.942137718200684, - "learning_rate": 1.3394808058534631e-05, - "loss": 2.0334, + "epoch": 0.41, + "grad_norm": 47.57612991333008, + "learning_rate": 1.724302388821487e-05, + "loss": 1.8597, "step": 3296 }, { - "epoch": 0.99, - "grad_norm": 10.705955505371094, - "learning_rate": 1.339280344793024e-05, - "loss": 1.7803, + "epoch": 0.41, + "grad_norm": 14.717118263244629, + "learning_rate": 1.7242187173158183e-05, + "loss": 3.2491, "step": 3297 }, { - "epoch": 0.99, - "grad_norm": 11.542771339416504, - "learning_rate": 1.3390798837325852e-05, - "loss": 1.2149, + "epoch": 0.41, + "grad_norm": 12.336624145507812, + "learning_rate": 1.7241350458101497e-05, + "loss": 1.408, "step": 3298 }, { - "epoch": 0.99, - "grad_norm": 16.316530227661133, - "learning_rate": 1.338879422672146e-05, - "loss": 2.4369, + "epoch": 0.41, + "grad_norm": 22.74224853515625, + "learning_rate": 1.7240513743044807e-05, + "loss": 1.4211, "step": 3299 }, { - "epoch": 0.99, - "grad_norm": 23.6247615814209, - "learning_rate": 1.338678961611707e-05, - "loss": 2.1207, + "epoch": 0.41, + "grad_norm": 18.7371883392334, + "learning_rate": 1.723967702798812e-05, + "loss": 4.5971, "step": 3300 }, { - "epoch": 0.99, - "grad_norm": 17.557518005371094, - "learning_rate": 1.338478500551268e-05, - "loss": 1.5633, + "epoch": 0.41, + "grad_norm": 19.930030822753906, + "learning_rate": 1.7238840312931434e-05, + "loss": 1.3958, "step": 3301 }, { - "epoch": 0.99, - "grad_norm": 13.458683967590332, - "learning_rate": 1.338278039490829e-05, - "loss": 2.0549, + "epoch": 0.41, + "grad_norm": 17.725854873657227, + "learning_rate": 1.7238003597874745e-05, + "loss": 1.8913, "step": 3302 }, { - "epoch": 0.99, - "grad_norm": 10.493450164794922, - "learning_rate": 1.33807757843039e-05, - "loss": 2.0356, + "epoch": 0.41, + "grad_norm": 9.200337409973145, + "learning_rate": 1.723716688281806e-05, + "loss": 2.1126, "step": 3303 }, { - "epoch": 0.99, - "grad_norm": 16.16809844970703, - "learning_rate": 1.337877117369951e-05, - "loss": 1.918, + "epoch": 0.41, + "grad_norm": 19.342815399169922, + "learning_rate": 1.7236330167761372e-05, + "loss": 2.3606, "step": 3304 }, { - "epoch": 0.99, - "grad_norm": 15.41076374053955, - "learning_rate": 1.337676656309512e-05, - "loss": 2.8471, + "epoch": 0.41, + "grad_norm": 11.29781436920166, + "learning_rate": 1.7235493452704682e-05, + "loss": 2.2782, "step": 3305 }, { - "epoch": 0.99, - "grad_norm": 18.323535919189453, - "learning_rate": 1.3374761952490729e-05, - "loss": 1.6805, + "epoch": 0.41, + "grad_norm": 40.30754089355469, + "learning_rate": 1.7234656737647996e-05, + "loss": 2.1958, "step": 3306 }, { - "epoch": 0.99, - "grad_norm": 10.888114929199219, - "learning_rate": 1.337275734188634e-05, - "loss": 1.4534, + "epoch": 0.42, + "grad_norm": 10.85988712310791, + "learning_rate": 1.7233820022591306e-05, + "loss": 3.3253, "step": 3307 }, { - "epoch": 0.99, - "grad_norm": 23.824207305908203, - "learning_rate": 1.3370752731281949e-05, - "loss": 2.1954, + "epoch": 0.42, + "grad_norm": 8.996675491333008, + "learning_rate": 1.723298330753462e-05, + "loss": 2.2486, "step": 3308 }, { - "epoch": 0.99, - "grad_norm": 12.056889533996582, - "learning_rate": 1.3368748120677559e-05, - "loss": 1.9769, + "epoch": 0.42, + "grad_norm": 9.021422386169434, + "learning_rate": 1.7232146592477933e-05, + "loss": 1.5477, "step": 3309 }, { - "epoch": 1.0, - "grad_norm": 20.176801681518555, - "learning_rate": 1.3366743510073171e-05, - "loss": 2.3137, + "epoch": 0.42, + "grad_norm": 12.186882972717285, + "learning_rate": 1.7231309877421244e-05, + "loss": 1.0307, "step": 3310 }, { - "epoch": 1.0, - "grad_norm": 12.740705490112305, - "learning_rate": 1.336473889946878e-05, - "loss": 2.1326, + "epoch": 0.42, + "grad_norm": 16.202449798583984, + "learning_rate": 1.7230473162364557e-05, + "loss": 2.3638, "step": 3311 }, { - "epoch": 1.0, - "grad_norm": 13.5363130569458, - "learning_rate": 1.336273428886439e-05, - "loss": 1.5278, + "epoch": 0.42, + "grad_norm": 9.610941886901855, + "learning_rate": 1.722963644730787e-05, + "loss": 2.0287, "step": 3312 }, { - "epoch": 1.0, - "grad_norm": 11.068243026733398, - "learning_rate": 1.3360729678259998e-05, - "loss": 1.186, + "epoch": 0.42, + "grad_norm": 20.206573486328125, + "learning_rate": 1.722879973225118e-05, + "loss": 2.0326, "step": 3313 }, { - "epoch": 1.0, - "grad_norm": 16.938138961791992, - "learning_rate": 1.335872506765561e-05, - "loss": 1.0064, + "epoch": 0.42, + "grad_norm": 8.457555770874023, + "learning_rate": 1.7227963017194495e-05, + "loss": 3.5017, "step": 3314 }, { - "epoch": 1.0, - "grad_norm": 13.62263298034668, - "learning_rate": 1.335672045705122e-05, - "loss": 1.909, + "epoch": 0.42, + "grad_norm": 45.251060485839844, + "learning_rate": 1.722712630213781e-05, + "loss": 2.96, "step": 3315 }, { - "epoch": 1.0, - "grad_norm": 11.283188819885254, - "learning_rate": 1.3354715846446828e-05, - "loss": 1.6886, + "epoch": 0.42, + "grad_norm": 8.923137664794922, + "learning_rate": 1.7226289587081122e-05, + "loss": 0.499, "step": 3316 }, { - "epoch": 1.0, - "grad_norm": 14.703868865966797, - "learning_rate": 1.335271123584244e-05, - "loss": 1.9791, + "epoch": 0.42, + "grad_norm": 15.219002723693848, + "learning_rate": 1.7225452872024432e-05, + "loss": 1.5465, "step": 3317 }, { - "epoch": 1.0, - "grad_norm": 13.806490898132324, - "learning_rate": 1.3350706625238048e-05, - "loss": 1.516, + "epoch": 0.42, + "grad_norm": 92.25977325439453, + "learning_rate": 1.7224616156967746e-05, + "loss": 1.0988, "step": 3318 }, { - "epoch": 1.0, - "grad_norm": 14.787043571472168, - "learning_rate": 1.3348702014633658e-05, - "loss": 1.8667, + "epoch": 0.42, + "grad_norm": 15.196372985839844, + "learning_rate": 1.722377944191106e-05, + "loss": 2.8061, "step": 3319 }, { - "epoch": 1.0, - "grad_norm": 17.8718318939209, - "learning_rate": 1.3346697404029267e-05, - "loss": 2.0014, + "epoch": 0.42, + "grad_norm": 10.119935989379883, + "learning_rate": 1.722294272685437e-05, + "loss": 0.7383, "step": 3320 }, { - "epoch": 1.0, - "grad_norm": 23.68500518798828, - "learning_rate": 1.3344692793424878e-05, - "loss": 1.7045, + "epoch": 0.42, + "grad_norm": 18.1783390045166, + "learning_rate": 1.7222106011797684e-05, + "loss": 0.8793, "step": 3321 }, { - "epoch": 1.0, - "grad_norm": 10.535503387451172, - "learning_rate": 1.3342688182820488e-05, - "loss": 1.183, + "epoch": 0.42, + "grad_norm": 24.50297737121582, + "learning_rate": 1.7221269296740997e-05, + "loss": 3.5501, "step": 3322 }, { - "epoch": 1.0, - "grad_norm": 25.851089477539062, - "learning_rate": 1.3340683572216097e-05, - "loss": 2.7104, + "epoch": 0.42, + "grad_norm": 9.117147445678711, + "learning_rate": 1.722043258168431e-05, + "loss": 1.0697, "step": 3323 }, { - "epoch": 1.0, - "grad_norm": 10.395030975341797, - "learning_rate": 1.3338678961611709e-05, - "loss": 1.7227, + "epoch": 0.42, + "grad_norm": 12.340163230895996, + "learning_rate": 1.721959586662762e-05, + "loss": 1.7272, "step": 3324 }, { - "epoch": 1.0, - "grad_norm": 14.099095344543457, - "learning_rate": 1.3336674351007317e-05, - "loss": 1.7941, + "epoch": 0.42, + "grad_norm": 35.53330612182617, + "learning_rate": 1.7218759151570935e-05, + "loss": 2.7344, "step": 3325 }, { - "epoch": 1.0, - "grad_norm": 15.447168350219727, - "learning_rate": 1.3334669740402927e-05, - "loss": 2.0854, + "epoch": 0.42, + "grad_norm": 14.975069046020508, + "learning_rate": 1.721792243651425e-05, + "loss": 2.8875, "step": 3326 }, { - "epoch": 1.0, - "grad_norm": 20.244766235351562, - "learning_rate": 1.3332665129798539e-05, - "loss": 3.0187, + "epoch": 0.42, + "grad_norm": 10.984192848205566, + "learning_rate": 1.721708572145756e-05, + "loss": 2.2025, "step": 3327 }, { - "epoch": 1.0, - "grad_norm": 56.30267333984375, - "learning_rate": 1.3330660519194147e-05, - "loss": 2.1849, + "epoch": 0.42, + "grad_norm": 8.138824462890625, + "learning_rate": 1.7216249006400872e-05, + "loss": 1.3284, "step": 3328 }, { - "epoch": 1.0, - "grad_norm": 28.362627029418945, - "learning_rate": 1.3328655908589757e-05, - "loss": 1.7998, + "epoch": 0.42, + "grad_norm": 8.697530746459961, + "learning_rate": 1.7215412291344186e-05, + "loss": 3.7476, "step": 3329 }, { - "epoch": 1.0, - "grad_norm": 12.778199195861816, - "learning_rate": 1.3326651297985367e-05, - "loss": 1.5023, + "epoch": 0.42, + "grad_norm": 16.129322052001953, + "learning_rate": 1.7214575576287496e-05, + "loss": 2.1301, "step": 3330 }, { - "epoch": 1.0, - "grad_norm": 9.352411270141602, - "learning_rate": 1.3324646687380978e-05, - "loss": 1.3491, + "epoch": 0.42, + "grad_norm": 16.356788635253906, + "learning_rate": 1.721373886123081e-05, + "loss": 3.1456, "step": 3331 }, { - "epoch": 1.0, - "grad_norm": 32.7597770690918, - "learning_rate": 1.3322642076776586e-05, - "loss": 2.2412, + "epoch": 0.42, + "grad_norm": 11.671011924743652, + "learning_rate": 1.721290214617412e-05, + "loss": 2.0738, "step": 3332 }, { - "epoch": 1.0, - "grad_norm": 14.371981620788574, - "learning_rate": 1.3320637466172198e-05, - "loss": 2.2198, + "epoch": 0.42, + "grad_norm": 12.250069618225098, + "learning_rate": 1.7212065431117434e-05, + "loss": 3.1575, "step": 3333 }, { - "epoch": 1.0, - "grad_norm": 15.051918029785156, - "learning_rate": 1.3318632855567808e-05, - "loss": 1.5761, + "epoch": 0.42, + "grad_norm": 15.652741432189941, + "learning_rate": 1.7211228716060748e-05, + "loss": 3.4465, "step": 3334 }, { - "epoch": 1.0, - "grad_norm": 18.553064346313477, - "learning_rate": 1.3316628244963416e-05, - "loss": 2.2955, + "epoch": 0.42, + "grad_norm": 8.946577072143555, + "learning_rate": 1.7210392001004058e-05, + "loss": 1.8496, "step": 3335 }, { - "epoch": 1.0, - "grad_norm": 13.871806144714355, - "learning_rate": 1.3314623634359028e-05, - "loss": 1.7798, + "epoch": 0.42, + "grad_norm": 20.72052764892578, + "learning_rate": 1.720955528594737e-05, + "loss": 1.8472, "step": 3336 }, { - "epoch": 1.0, - "grad_norm": 24.853288650512695, - "learning_rate": 1.3312619023754636e-05, - "loss": 2.5074, + "epoch": 0.42, + "grad_norm": 11.61407470703125, + "learning_rate": 1.7208718570890685e-05, + "loss": 1.032, "step": 3337 }, { - "epoch": 1.0, - "grad_norm": 7.72559928894043, - "learning_rate": 1.3310614413150246e-05, - "loss": 2.0241, + "epoch": 0.42, + "grad_norm": 16.900367736816406, + "learning_rate": 1.7207881855833995e-05, + "loss": 1.6605, "step": 3338 }, { - "epoch": 1.0, - "grad_norm": 67.96630096435547, - "learning_rate": 1.3308609802545858e-05, - "loss": 2.9062, + "epoch": 0.42, + "grad_norm": 10.358806610107422, + "learning_rate": 1.720704514077731e-05, + "loss": 0.8799, "step": 3339 }, { - "epoch": 1.0, - "grad_norm": 15.404458045959473, - "learning_rate": 1.3306605191941467e-05, - "loss": 1.2167, + "epoch": 0.42, + "grad_norm": 13.098993301391602, + "learning_rate": 1.7206208425720623e-05, + "loss": 1.7514, "step": 3340 }, { - "epoch": 1.0, - "grad_norm": 20.57508087158203, - "learning_rate": 1.3304600581337077e-05, - "loss": 2.3393, + "epoch": 0.42, + "grad_norm": 14.889695167541504, + "learning_rate": 1.7205371710663933e-05, + "loss": 2.7566, "step": 3341 }, { - "epoch": 1.0, - "grad_norm": 13.644542694091797, - "learning_rate": 1.3302595970732685e-05, - "loss": 2.7504, + "epoch": 0.42, + "grad_norm": 17.534114837646484, + "learning_rate": 1.7204534995607247e-05, + "loss": 1.4067, "step": 3342 }, { - "epoch": 1.01, - "grad_norm": 27.4990291595459, - "learning_rate": 1.3300591360128297e-05, - "loss": 2.4179, + "epoch": 0.42, + "grad_norm": 20.281309127807617, + "learning_rate": 1.720369828055056e-05, + "loss": 2.097, "step": 3343 }, { - "epoch": 1.01, - "grad_norm": 18.17327308654785, - "learning_rate": 1.3298586749523905e-05, - "loss": 1.8711, + "epoch": 0.42, + "grad_norm": 22.249191284179688, + "learning_rate": 1.7202861565493874e-05, + "loss": 1.0598, "step": 3344 }, { - "epoch": 1.01, - "grad_norm": 11.454172134399414, - "learning_rate": 1.3296582138919515e-05, - "loss": 2.2247, + "epoch": 0.42, + "grad_norm": 4.688612461090088, + "learning_rate": 1.7202024850437184e-05, + "loss": 1.986, "step": 3345 }, { - "epoch": 1.01, - "grad_norm": 14.537096977233887, - "learning_rate": 1.3294577528315127e-05, - "loss": 1.3509, + "epoch": 0.42, + "grad_norm": 11.584707260131836, + "learning_rate": 1.7201188135380498e-05, + "loss": 1.064, "step": 3346 }, { - "epoch": 1.01, - "grad_norm": 17.769039154052734, - "learning_rate": 1.3292572917710736e-05, - "loss": 1.6675, + "epoch": 0.42, + "grad_norm": 13.541443824768066, + "learning_rate": 1.720035142032381e-05, + "loss": 2.173, "step": 3347 }, { - "epoch": 1.01, - "grad_norm": 20.219247817993164, - "learning_rate": 1.3290568307106346e-05, - "loss": 2.1718, + "epoch": 0.42, + "grad_norm": 10.384590148925781, + "learning_rate": 1.7199514705267122e-05, + "loss": 2.0759, "step": 3348 }, { - "epoch": 1.01, - "grad_norm": 12.704767227172852, - "learning_rate": 1.3288563696501956e-05, - "loss": 1.2209, + "epoch": 0.42, + "grad_norm": 10.545215606689453, + "learning_rate": 1.7198677990210435e-05, + "loss": 1.4586, "step": 3349 }, { - "epoch": 1.01, - "grad_norm": 15.621160507202148, - "learning_rate": 1.3286559085897566e-05, - "loss": 1.5298, + "epoch": 0.42, + "grad_norm": 19.95136260986328, + "learning_rate": 1.719784127515375e-05, + "loss": 2.455, "step": 3350 }, { - "epoch": 1.01, - "grad_norm": 17.41335678100586, - "learning_rate": 1.3284554475293174e-05, - "loss": 2.7119, + "epoch": 0.42, + "grad_norm": 22.57049560546875, + "learning_rate": 1.7197004560097063e-05, + "loss": 2.7227, "step": 3351 }, { - "epoch": 1.01, - "grad_norm": 27.469499588012695, - "learning_rate": 1.3282549864688786e-05, - "loss": 2.821, + "epoch": 0.42, + "grad_norm": 26.080463409423828, + "learning_rate": 1.7196167845040373e-05, + "loss": 1.8036, "step": 3352 }, { - "epoch": 1.01, - "grad_norm": 47.88648986816406, - "learning_rate": 1.3280545254084396e-05, - "loss": 2.4446, + "epoch": 0.42, + "grad_norm": 7.422852039337158, + "learning_rate": 1.7195331129983687e-05, + "loss": 1.4551, "step": 3353 }, { - "epoch": 1.01, - "grad_norm": 9.525798797607422, - "learning_rate": 1.3278540643480004e-05, - "loss": 0.9091, + "epoch": 0.42, + "grad_norm": 5.2106733322143555, + "learning_rate": 1.7194494414927e-05, + "loss": 2.3569, "step": 3354 }, { - "epoch": 1.01, - "grad_norm": 22.738584518432617, - "learning_rate": 1.3276536032875616e-05, - "loss": 2.3079, + "epoch": 0.42, + "grad_norm": 12.15693187713623, + "learning_rate": 1.719365769987031e-05, + "loss": 3.4812, "step": 3355 }, { - "epoch": 1.01, - "grad_norm": 11.072502136230469, - "learning_rate": 1.3274531422271225e-05, - "loss": 2.7678, + "epoch": 0.42, + "grad_norm": 12.908676147460938, + "learning_rate": 1.7192820984813624e-05, + "loss": 0.6298, "step": 3356 }, { - "epoch": 1.01, - "grad_norm": 12.42871379852295, - "learning_rate": 1.3272526811666835e-05, - "loss": 1.1835, + "epoch": 0.42, + "grad_norm": 11.007267951965332, + "learning_rate": 1.7191984269756938e-05, + "loss": 2.6467, "step": 3357 }, { - "epoch": 1.01, - "grad_norm": 10.291549682617188, - "learning_rate": 1.3270522201062446e-05, - "loss": 1.3435, + "epoch": 0.42, + "grad_norm": 12.836434364318848, + "learning_rate": 1.7191147554700248e-05, + "loss": 2.1427, "step": 3358 }, { - "epoch": 1.01, - "grad_norm": 18.563772201538086, - "learning_rate": 1.3268517590458055e-05, - "loss": 1.2068, + "epoch": 0.42, + "grad_norm": 5.820464134216309, + "learning_rate": 1.719031083964356e-05, + "loss": 0.6328, "step": 3359 }, { - "epoch": 1.01, - "grad_norm": 16.957143783569336, - "learning_rate": 1.3266512979853665e-05, - "loss": 2.197, - "step": 3360 - }, - { - "epoch": 1.01, - "eval_loss": 0.22327160835266113, - "eval_runtime": 43.5349, - "eval_samples_per_second": 33.973, - "eval_steps_per_second": 33.973, + "epoch": 0.42, + "grad_norm": 13.101532936096191, + "learning_rate": 1.7189474124586872e-05, + "loss": 1.8192, "step": 3360 }, { - "epoch": 1.01, - "grad_norm": 12.542174339294434, - "learning_rate": 1.3264508369249273e-05, - "loss": 1.7032, + "epoch": 0.42, + "grad_norm": 8.012151718139648, + "learning_rate": 1.7188637409530186e-05, + "loss": 3.1705, "step": 3361 }, { - "epoch": 1.01, - "grad_norm": 12.516151428222656, - "learning_rate": 1.3262503758644885e-05, - "loss": 1.4823, + "epoch": 0.42, + "grad_norm": 14.383709907531738, + "learning_rate": 1.71878006944735e-05, + "loss": 2.7463, "step": 3362 }, { - "epoch": 1.01, - "grad_norm": 13.449585914611816, - "learning_rate": 1.3260499148040493e-05, - "loss": 1.9671, + "epoch": 0.42, + "grad_norm": 12.688018798828125, + "learning_rate": 1.718696397941681e-05, + "loss": 2.3427, "step": 3363 }, { - "epoch": 1.01, - "grad_norm": 20.43147850036621, - "learning_rate": 1.3258494537436104e-05, - "loss": 1.5756, + "epoch": 0.42, + "grad_norm": 28.22138214111328, + "learning_rate": 1.7186127264360123e-05, + "loss": 2.986, "step": 3364 }, { - "epoch": 1.01, - "grad_norm": 9.373948097229004, - "learning_rate": 1.3256489926831715e-05, - "loss": 2.1166, + "epoch": 0.42, + "grad_norm": 7.574335098266602, + "learning_rate": 1.7185290549303437e-05, + "loss": 2.1322, "step": 3365 }, { - "epoch": 1.01, - "grad_norm": 28.55181312561035, - "learning_rate": 1.3254485316227324e-05, - "loss": 2.7417, + "epoch": 0.42, + "grad_norm": 8.840447425842285, + "learning_rate": 1.7184453834246747e-05, + "loss": 1.3887, "step": 3366 }, { - "epoch": 1.01, - "grad_norm": 23.936809539794922, - "learning_rate": 1.3252480705622934e-05, - "loss": 1.5005, + "epoch": 0.42, + "grad_norm": 23.318866729736328, + "learning_rate": 1.718361711919006e-05, + "loss": 3.8719, "step": 3367 }, { - "epoch": 1.01, - "grad_norm": 14.939528465270996, - "learning_rate": 1.3250476095018542e-05, - "loss": 2.3812, + "epoch": 0.42, + "grad_norm": 33.8458137512207, + "learning_rate": 1.7182780404133374e-05, + "loss": 2.5061, "step": 3368 }, { - "epoch": 1.01, - "grad_norm": 23.387401580810547, - "learning_rate": 1.3248471484414154e-05, - "loss": 2.5629, + "epoch": 0.42, + "grad_norm": 41.34958267211914, + "learning_rate": 1.7181943689076685e-05, + "loss": 2.972, "step": 3369 }, { - "epoch": 1.01, - "grad_norm": 17.187917709350586, - "learning_rate": 1.3246466873809764e-05, - "loss": 1.8393, + "epoch": 0.42, + "grad_norm": 12.544574737548828, + "learning_rate": 1.7181106974019998e-05, + "loss": 0.7907, "step": 3370 }, { - "epoch": 1.01, - "grad_norm": 15.523499488830566, - "learning_rate": 1.3244462263205372e-05, - "loss": 2.066, + "epoch": 0.42, + "grad_norm": 21.2841796875, + "learning_rate": 1.7180270258963312e-05, + "loss": 3.0655, "step": 3371 }, { - "epoch": 1.01, - "grad_norm": 8.489646911621094, - "learning_rate": 1.3242457652600984e-05, - "loss": 1.5775, + "epoch": 0.42, + "grad_norm": 11.424372673034668, + "learning_rate": 1.7179433543906626e-05, + "loss": 2.7168, "step": 3372 }, { - "epoch": 1.01, - "grad_norm": 12.514199256896973, - "learning_rate": 1.3240453041996593e-05, - "loss": 1.3532, + "epoch": 0.42, + "grad_norm": 24.937965393066406, + "learning_rate": 1.7178596828849936e-05, + "loss": 2.7555, "step": 3373 }, { - "epoch": 1.01, - "grad_norm": 39.9799919128418, - "learning_rate": 1.3238448431392203e-05, - "loss": 1.7734, + "epoch": 0.42, + "grad_norm": 20.2894229888916, + "learning_rate": 1.717776011379325e-05, + "loss": 2.5918, "step": 3374 }, { - "epoch": 1.01, - "grad_norm": 12.430253028869629, - "learning_rate": 1.3236443820787813e-05, - "loss": 2.0699, + "epoch": 0.42, + "grad_norm": 16.203311920166016, + "learning_rate": 1.7176923398736563e-05, + "loss": 1.8023, "step": 3375 }, { - "epoch": 1.02, - "grad_norm": 32.21778106689453, - "learning_rate": 1.3234439210183423e-05, - "loss": 2.4161, + "epoch": 0.42, + "grad_norm": 15.349499702453613, + "learning_rate": 1.7176086683679873e-05, + "loss": 2.4695, "step": 3376 }, { - "epoch": 1.02, - "grad_norm": 10.491954803466797, - "learning_rate": 1.3232434599579033e-05, - "loss": 2.0041, + "epoch": 0.42, + "grad_norm": 37.25876998901367, + "learning_rate": 1.7175249968623187e-05, + "loss": 2.3071, "step": 3377 }, { - "epoch": 1.02, - "grad_norm": 24.133899688720703, - "learning_rate": 1.3230429988974643e-05, - "loss": 2.884, + "epoch": 0.42, + "grad_norm": 9.62696647644043, + "learning_rate": 1.71744132535665e-05, + "loss": 1.7059, "step": 3378 }, { - "epoch": 1.02, - "grad_norm": 13.717731475830078, - "learning_rate": 1.3228425378370253e-05, - "loss": 2.4387, + "epoch": 0.42, + "grad_norm": 13.929464340209961, + "learning_rate": 1.7173576538509814e-05, + "loss": 2.8409, "step": 3379 }, { - "epoch": 1.02, - "grad_norm": 20.04202651977539, - "learning_rate": 1.3226420767765862e-05, - "loss": 1.764, + "epoch": 0.42, + "grad_norm": 10.758846282958984, + "learning_rate": 1.7172739823453125e-05, + "loss": 1.1154, "step": 3380 }, { - "epoch": 1.02, - "grad_norm": 16.394018173217773, - "learning_rate": 1.3224416157161473e-05, - "loss": 1.6656, + "epoch": 0.42, + "grad_norm": 6.401970863342285, + "learning_rate": 1.7171903108396438e-05, + "loss": 1.3731, "step": 3381 }, { - "epoch": 1.02, - "grad_norm": 6.424349308013916, - "learning_rate": 1.3222411546557083e-05, - "loss": 1.2659, + "epoch": 0.42, + "grad_norm": 15.83933162689209, + "learning_rate": 1.7171066393339752e-05, + "loss": 3.2017, "step": 3382 }, { - "epoch": 1.02, - "grad_norm": 12.825157165527344, - "learning_rate": 1.3220406935952692e-05, - "loss": 1.5172, + "epoch": 0.42, + "grad_norm": 8.87679386138916, + "learning_rate": 1.7170229678283062e-05, + "loss": 0.891, "step": 3383 }, { - "epoch": 1.02, - "grad_norm": 32.225006103515625, - "learning_rate": 1.3218402325348304e-05, - "loss": 2.4318, + "epoch": 0.42, + "grad_norm": 15.874035835266113, + "learning_rate": 1.7169392963226376e-05, + "loss": 3.5279, "step": 3384 }, { - "epoch": 1.02, - "grad_norm": 10.880355834960938, - "learning_rate": 1.3216397714743912e-05, - "loss": 1.5311, + "epoch": 0.42, + "grad_norm": 9.200181007385254, + "learning_rate": 1.7168556248169686e-05, + "loss": 1.3639, "step": 3385 }, { - "epoch": 1.02, - "grad_norm": 23.808944702148438, - "learning_rate": 1.3214393104139522e-05, - "loss": 2.809, + "epoch": 0.42, + "grad_norm": 12.776347160339355, + "learning_rate": 1.7167719533113e-05, + "loss": 1.8228, "step": 3386 }, { - "epoch": 1.02, - "grad_norm": 18.621950149536133, - "learning_rate": 1.321238849353513e-05, - "loss": 1.5185, + "epoch": 0.43, + "grad_norm": 19.877140045166016, + "learning_rate": 1.7166882818056313e-05, + "loss": 2.9879, "step": 3387 }, { - "epoch": 1.02, - "grad_norm": 13.235946655273438, - "learning_rate": 1.3210383882930742e-05, - "loss": 1.5264, + "epoch": 0.43, + "grad_norm": 6.658298969268799, + "learning_rate": 1.7166046102999624e-05, + "loss": 0.5819, "step": 3388 }, { - "epoch": 1.02, - "grad_norm": 12.991135597229004, - "learning_rate": 1.3208379272326352e-05, - "loss": 1.8659, + "epoch": 0.43, + "grad_norm": 51.25400161743164, + "learning_rate": 1.7165209387942937e-05, + "loss": 4.3137, "step": 3389 }, { - "epoch": 1.02, - "grad_norm": 38.41598129272461, - "learning_rate": 1.320637466172196e-05, - "loss": 2.2308, + "epoch": 0.43, + "grad_norm": 12.011516571044922, + "learning_rate": 1.7164372672886248e-05, + "loss": 2.2849, "step": 3390 }, { - "epoch": 1.02, - "grad_norm": 16.32402801513672, - "learning_rate": 1.3204370051117572e-05, - "loss": 1.7613, + "epoch": 0.43, + "grad_norm": 9.610902786254883, + "learning_rate": 1.716353595782956e-05, + "loss": 3.3038, "step": 3391 }, { - "epoch": 1.02, - "grad_norm": 18.9952335357666, - "learning_rate": 1.320236544051318e-05, - "loss": 1.6083, + "epoch": 0.43, + "grad_norm": 16.154939651489258, + "learning_rate": 1.7162699242772875e-05, + "loss": 1.5948, "step": 3392 }, { - "epoch": 1.02, - "grad_norm": 20.658859252929688, - "learning_rate": 1.3200360829908791e-05, - "loss": 1.7974, + "epoch": 0.43, + "grad_norm": 21.398250579833984, + "learning_rate": 1.7161862527716185e-05, + "loss": 2.7455, "step": 3393 }, { - "epoch": 1.02, - "grad_norm": 15.612691879272461, - "learning_rate": 1.31983562193044e-05, - "loss": 2.5692, + "epoch": 0.43, + "grad_norm": 10.393528938293457, + "learning_rate": 1.71610258126595e-05, + "loss": 1.631, "step": 3394 }, { - "epoch": 1.02, - "grad_norm": 49.652915954589844, - "learning_rate": 1.3196351608700011e-05, - "loss": 2.6861, + "epoch": 0.43, + "grad_norm": 16.182029724121094, + "learning_rate": 1.7160189097602812e-05, + "loss": 1.3885, "step": 3395 }, { - "epoch": 1.02, - "grad_norm": 10.916786193847656, - "learning_rate": 1.3194346998095621e-05, - "loss": 1.4085, + "epoch": 0.43, + "grad_norm": 13.672966003417969, + "learning_rate": 1.7159352382546126e-05, + "loss": 1.5113, "step": 3396 }, { - "epoch": 1.02, - "grad_norm": 21.12449073791504, - "learning_rate": 1.3192342387491231e-05, - "loss": 2.6789, + "epoch": 0.43, + "grad_norm": 16.824934005737305, + "learning_rate": 1.7158515667489436e-05, + "loss": 2.0353, "step": 3397 }, { - "epoch": 1.02, - "grad_norm": 18.018606185913086, - "learning_rate": 1.3190337776886841e-05, - "loss": 1.6874, + "epoch": 0.43, + "grad_norm": 24.244441986083984, + "learning_rate": 1.715767895243275e-05, + "loss": 3.1288, "step": 3398 }, { - "epoch": 1.02, - "grad_norm": 22.249736785888672, - "learning_rate": 1.318833316628245e-05, - "loss": 2.0898, + "epoch": 0.43, + "grad_norm": 13.222444534301758, + "learning_rate": 1.7156842237376064e-05, + "loss": 1.6778, "step": 3399 }, { - "epoch": 1.02, - "grad_norm": 14.536703109741211, - "learning_rate": 1.3186328555678062e-05, - "loss": 2.2679, + "epoch": 0.43, + "grad_norm": 19.14320182800293, + "learning_rate": 1.7156005522319374e-05, + "loss": 3.5595, "step": 3400 }, { - "epoch": 1.02, - "grad_norm": 16.38414764404297, - "learning_rate": 1.3184323945073672e-05, - "loss": 1.3831, + "epoch": 0.43, + "grad_norm": 9.551873207092285, + "learning_rate": 1.7155168807262687e-05, + "loss": 1.6328, "step": 3401 }, { - "epoch": 1.02, - "grad_norm": 16.283510208129883, - "learning_rate": 1.318231933446928e-05, - "loss": 1.622, + "epoch": 0.43, + "grad_norm": 15.074509620666504, + "learning_rate": 1.7154332092206e-05, + "loss": 3.466, "step": 3402 }, { - "epoch": 1.02, - "grad_norm": 12.426440238952637, - "learning_rate": 1.3180314723864892e-05, - "loss": 1.574, + "epoch": 0.43, + "grad_norm": 16.020980834960938, + "learning_rate": 1.7153495377149315e-05, + "loss": 1.7498, "step": 3403 }, { - "epoch": 1.02, - "grad_norm": 10.712419509887695, - "learning_rate": 1.31783101132605e-05, - "loss": 1.8414, + "epoch": 0.43, + "grad_norm": 13.791266441345215, + "learning_rate": 1.7152658662092625e-05, + "loss": 1.2065, "step": 3404 }, { - "epoch": 1.02, - "grad_norm": 12.0217924118042, - "learning_rate": 1.317630550265611e-05, - "loss": 1.817, + "epoch": 0.43, + "grad_norm": 14.800347328186035, + "learning_rate": 1.715182194703594e-05, + "loss": 1.5002, "step": 3405 }, { - "epoch": 1.02, - "grad_norm": 10.142142295837402, - "learning_rate": 1.3174300892051719e-05, - "loss": 1.7129, + "epoch": 0.43, + "grad_norm": 15.264331817626953, + "learning_rate": 1.7150985231979252e-05, + "loss": 1.3831, "step": 3406 }, { - "epoch": 1.02, - "grad_norm": 17.107044219970703, - "learning_rate": 1.317229628144733e-05, - "loss": 2.2242, + "epoch": 0.43, + "grad_norm": 5.368287086486816, + "learning_rate": 1.7150148516922563e-05, + "loss": 0.4597, "step": 3407 }, { - "epoch": 1.02, - "grad_norm": 30.633485794067383, - "learning_rate": 1.317029167084294e-05, - "loss": 2.3054, + "epoch": 0.43, + "grad_norm": 15.589975357055664, + "learning_rate": 1.7149311801865876e-05, + "loss": 2.8628, "step": 3408 }, { - "epoch": 1.02, - "grad_norm": 10.093416213989258, - "learning_rate": 1.3168287060238549e-05, - "loss": 1.6337, + "epoch": 0.43, + "grad_norm": 7.814126968383789, + "learning_rate": 1.714847508680919e-05, + "loss": 0.5112, "step": 3409 }, { - "epoch": 1.03, - "grad_norm": 16.132856369018555, - "learning_rate": 1.316628244963416e-05, - "loss": 1.8572, + "epoch": 0.43, + "grad_norm": 8.66985034942627, + "learning_rate": 1.7147638371752504e-05, + "loss": 0.4126, "step": 3410 }, { - "epoch": 1.03, - "grad_norm": 26.83838653564453, - "learning_rate": 1.3164277839029769e-05, - "loss": 1.6212, + "epoch": 0.43, + "grad_norm": 10.87762451171875, + "learning_rate": 1.7146801656695814e-05, + "loss": 0.5436, "step": 3411 }, { - "epoch": 1.03, - "grad_norm": 18.7099609375, - "learning_rate": 1.3162273228425379e-05, - "loss": 2.4694, + "epoch": 0.43, + "grad_norm": 10.160980224609375, + "learning_rate": 1.7145964941639127e-05, + "loss": 2.5114, "step": 3412 }, { - "epoch": 1.03, - "grad_norm": 15.113785743713379, - "learning_rate": 1.3160268617820991e-05, - "loss": 1.8307, + "epoch": 0.43, + "grad_norm": 20.5477294921875, + "learning_rate": 1.7145128226582438e-05, + "loss": 4.0677, "step": 3413 }, { - "epoch": 1.03, - "grad_norm": 13.659159660339355, - "learning_rate": 1.31582640072166e-05, - "loss": 2.0593, + "epoch": 0.43, + "grad_norm": 15.280623435974121, + "learning_rate": 1.714429151152575e-05, + "loss": 4.0942, "step": 3414 }, { - "epoch": 1.03, - "grad_norm": 25.388036727905273, - "learning_rate": 1.315625939661221e-05, - "loss": 2.2341, + "epoch": 0.43, + "grad_norm": 17.571733474731445, + "learning_rate": 1.7143454796469065e-05, + "loss": 2.6576, "step": 3415 }, { - "epoch": 1.03, - "grad_norm": 7.588253021240234, - "learning_rate": 1.3154254786007818e-05, - "loss": 1.8049, + "epoch": 0.43, + "grad_norm": 11.830584526062012, + "learning_rate": 1.7142618081412375e-05, + "loss": 2.3105, "step": 3416 }, { - "epoch": 1.03, - "grad_norm": 18.153966903686523, - "learning_rate": 1.315225017540343e-05, - "loss": 2.1183, + "epoch": 0.43, + "grad_norm": 15.482280731201172, + "learning_rate": 1.714178136635569e-05, + "loss": 3.3672, "step": 3417 }, { - "epoch": 1.03, - "grad_norm": 12.177376747131348, - "learning_rate": 1.3150245564799038e-05, - "loss": 1.611, + "epoch": 0.43, + "grad_norm": 12.5259370803833, + "learning_rate": 1.7140944651299e-05, + "loss": 1.351, "step": 3418 }, { - "epoch": 1.03, - "grad_norm": 15.74305534362793, - "learning_rate": 1.3148240954194648e-05, - "loss": 2.5822, + "epoch": 0.43, + "grad_norm": 5.563350677490234, + "learning_rate": 1.7140107936242313e-05, + "loss": 0.5526, "step": 3419 }, { - "epoch": 1.03, - "grad_norm": 13.683473587036133, - "learning_rate": 1.314623634359026e-05, - "loss": 1.4373, + "epoch": 0.43, + "grad_norm": 8.950671195983887, + "learning_rate": 1.7139271221185626e-05, + "loss": 2.2944, "step": 3420 }, { - "epoch": 1.03, - "grad_norm": 17.658405303955078, - "learning_rate": 1.3144231732985868e-05, - "loss": 1.9642, + "epoch": 0.43, + "grad_norm": 19.60506248474121, + "learning_rate": 1.7138434506128937e-05, + "loss": 4.1311, "step": 3421 }, { - "epoch": 1.03, - "grad_norm": 10.713934898376465, - "learning_rate": 1.3142227122381478e-05, - "loss": 1.7652, + "epoch": 0.43, + "grad_norm": 9.252056121826172, + "learning_rate": 1.713759779107225e-05, + "loss": 0.5094, "step": 3422 }, { - "epoch": 1.03, - "grad_norm": 10.191012382507324, - "learning_rate": 1.3140222511777088e-05, - "loss": 1.7921, + "epoch": 0.43, + "grad_norm": 19.239261627197266, + "learning_rate": 1.7136761076015564e-05, + "loss": 2.1528, "step": 3423 }, { - "epoch": 1.03, - "grad_norm": 9.874408721923828, - "learning_rate": 1.3138217901172698e-05, - "loss": 0.9375, + "epoch": 0.43, + "grad_norm": 6.057199478149414, + "learning_rate": 1.7135924360958878e-05, + "loss": 0.1486, "step": 3424 }, { - "epoch": 1.03, - "grad_norm": 18.47734260559082, - "learning_rate": 1.3136213290568307e-05, - "loss": 2.3192, + "epoch": 0.43, + "grad_norm": 9.986576080322266, + "learning_rate": 1.7135087645902188e-05, + "loss": 0.3078, "step": 3425 }, { - "epoch": 1.03, - "grad_norm": 16.030399322509766, - "learning_rate": 1.3134208679963919e-05, - "loss": 1.957, + "epoch": 0.43, + "grad_norm": 13.740504264831543, + "learning_rate": 1.71342509308455e-05, + "loss": 2.2573, "step": 3426 }, { - "epoch": 1.03, - "grad_norm": 8.997057914733887, - "learning_rate": 1.3132204069359529e-05, - "loss": 1.5734, + "epoch": 0.43, + "grad_norm": 11.266700744628906, + "learning_rate": 1.7133414215788815e-05, + "loss": 1.1035, "step": 3427 }, { - "epoch": 1.03, - "grad_norm": 174.50979614257812, - "learning_rate": 1.3130199458755137e-05, - "loss": 1.8404, + "epoch": 0.43, + "grad_norm": 24.04230499267578, + "learning_rate": 1.7132577500732126e-05, + "loss": 2.5449, "step": 3428 }, { - "epoch": 1.03, - "grad_norm": 15.005576133728027, - "learning_rate": 1.3128194848150749e-05, - "loss": 1.4347, + "epoch": 0.43, + "grad_norm": 14.336504936218262, + "learning_rate": 1.713174078567544e-05, + "loss": 2.1929, "step": 3429 }, { - "epoch": 1.03, - "grad_norm": 13.528023719787598, - "learning_rate": 1.3126190237546357e-05, - "loss": 2.0983, + "epoch": 0.43, + "grad_norm": 4.596050262451172, + "learning_rate": 1.7130904070618753e-05, + "loss": 1.9653, "step": 3430 }, { - "epoch": 1.03, - "grad_norm": 61.08424377441406, - "learning_rate": 1.3124185626941967e-05, - "loss": 1.3849, + "epoch": 0.43, + "grad_norm": 19.6068115234375, + "learning_rate": 1.7130067355562066e-05, + "loss": 3.055, "step": 3431 }, { - "epoch": 1.03, - "grad_norm": 19.432321548461914, - "learning_rate": 1.3122181016337579e-05, - "loss": 2.5755, + "epoch": 0.43, + "grad_norm": 22.83165740966797, + "learning_rate": 1.7129230640505377e-05, + "loss": 1.9638, "step": 3432 }, { - "epoch": 1.03, - "grad_norm": 10.8148775100708, - "learning_rate": 1.3120176405733188e-05, - "loss": 2.1681, + "epoch": 0.43, + "grad_norm": 13.930540084838867, + "learning_rate": 1.712839392544869e-05, + "loss": 3.5265, "step": 3433 }, { - "epoch": 1.03, - "grad_norm": 19.197267532348633, - "learning_rate": 1.3118171795128798e-05, - "loss": 1.7698, + "epoch": 0.43, + "grad_norm": 12.648981094360352, + "learning_rate": 1.7127557210392004e-05, + "loss": 0.4018, "step": 3434 }, { - "epoch": 1.03, - "grad_norm": 16.004776000976562, - "learning_rate": 1.3116167184524406e-05, - "loss": 2.1169, + "epoch": 0.43, + "grad_norm": 30.49363136291504, + "learning_rate": 1.7126720495335314e-05, + "loss": 2.0054, "step": 3435 }, { - "epoch": 1.03, - "grad_norm": 36.55406188964844, - "learning_rate": 1.3114162573920018e-05, - "loss": 2.0371, + "epoch": 0.43, + "grad_norm": 18.023563385009766, + "learning_rate": 1.7125883780278628e-05, + "loss": 3.2501, "step": 3436 }, { - "epoch": 1.03, - "grad_norm": 20.354867935180664, - "learning_rate": 1.3112157963315626e-05, - "loss": 1.5596, + "epoch": 0.43, + "grad_norm": 11.947152137756348, + "learning_rate": 1.712504706522194e-05, + "loss": 3.3163, "step": 3437 }, { - "epoch": 1.03, - "grad_norm": 13.395331382751465, - "learning_rate": 1.3110153352711236e-05, - "loss": 1.3172, + "epoch": 0.43, + "grad_norm": 22.248554229736328, + "learning_rate": 1.7124210350165252e-05, + "loss": 3.5797, "step": 3438 }, { - "epoch": 1.03, - "grad_norm": 17.100326538085938, - "learning_rate": 1.3108148742106848e-05, - "loss": 1.9774, + "epoch": 0.43, + "grad_norm": 22.091278076171875, + "learning_rate": 1.7123373635108565e-05, + "loss": 1.3709, "step": 3439 }, { - "epoch": 1.03, - "grad_norm": 13.835416793823242, - "learning_rate": 1.3106144131502456e-05, - "loss": 1.8643, + "epoch": 0.43, + "grad_norm": 15.646050453186035, + "learning_rate": 1.712253692005188e-05, + "loss": 2.442, "step": 3440 }, { - "epoch": 1.03, - "grad_norm": 15.755050659179688, - "learning_rate": 1.3104139520898067e-05, - "loss": 3.0466, + "epoch": 0.43, + "grad_norm": 11.564681053161621, + "learning_rate": 1.712170020499519e-05, + "loss": 2.0503, "step": 3441 }, { - "epoch": 1.03, - "grad_norm": 19.994091033935547, - "learning_rate": 1.3102134910293675e-05, - "loss": 1.9489, + "epoch": 0.43, + "grad_norm": 10.883163452148438, + "learning_rate": 1.7120863489938503e-05, + "loss": 3.0713, "step": 3442 }, { - "epoch": 1.04, - "grad_norm": 10.072383880615234, - "learning_rate": 1.3100130299689287e-05, - "loss": 2.0764, + "epoch": 0.43, + "grad_norm": 17.00910758972168, + "learning_rate": 1.7120026774881813e-05, + "loss": 2.0418, "step": 3443 }, { - "epoch": 1.04, - "grad_norm": 10.777514457702637, - "learning_rate": 1.3098125689084897e-05, - "loss": 1.2584, + "epoch": 0.43, + "grad_norm": 6.5379958152771, + "learning_rate": 1.7119190059825127e-05, + "loss": 0.9993, "step": 3444 }, { - "epoch": 1.04, - "grad_norm": 24.992101669311523, - "learning_rate": 1.3096121078480505e-05, - "loss": 2.3589, + "epoch": 0.43, + "grad_norm": 25.576587677001953, + "learning_rate": 1.711835334476844e-05, + "loss": 4.4599, "step": 3445 }, { - "epoch": 1.04, - "grad_norm": 13.744710922241211, - "learning_rate": 1.3094116467876117e-05, - "loss": 1.6164, + "epoch": 0.43, + "grad_norm": 14.60910415649414, + "learning_rate": 1.711751662971175e-05, + "loss": 2.8837, "step": 3446 }, { - "epoch": 1.04, - "grad_norm": 9.5302095413208, - "learning_rate": 1.3092111857271725e-05, - "loss": 1.4418, + "epoch": 0.43, + "grad_norm": 15.904330253601074, + "learning_rate": 1.7116679914655065e-05, + "loss": 2.9878, "step": 3447 }, { - "epoch": 1.04, - "grad_norm": 16.148855209350586, - "learning_rate": 1.3090107246667335e-05, - "loss": 1.4431, + "epoch": 0.43, + "grad_norm": 13.964025497436523, + "learning_rate": 1.7115843199598378e-05, + "loss": 2.119, "step": 3448 }, { - "epoch": 1.04, - "grad_norm": 31.971755981445312, - "learning_rate": 1.3088102636062945e-05, - "loss": 1.506, + "epoch": 0.43, + "grad_norm": 11.501665115356445, + "learning_rate": 1.711500648454169e-05, + "loss": 1.7467, "step": 3449 }, { - "epoch": 1.04, - "grad_norm": 18.346967697143555, - "learning_rate": 1.3086098025458556e-05, - "loss": 1.6131, + "epoch": 0.43, + "grad_norm": 13.98098087310791, + "learning_rate": 1.7114169769485002e-05, + "loss": 2.5961, "step": 3450 }, { - "epoch": 1.04, - "grad_norm": 29.8282527923584, - "learning_rate": 1.3084093414854167e-05, - "loss": 2.3438, + "epoch": 0.43, + "grad_norm": 32.3702278137207, + "learning_rate": 1.7113333054428316e-05, + "loss": 1.9346, "step": 3451 }, { - "epoch": 1.04, - "grad_norm": 19.195152282714844, - "learning_rate": 1.3082088804249776e-05, - "loss": 1.9727, + "epoch": 0.43, + "grad_norm": 21.77657127380371, + "learning_rate": 1.711249633937163e-05, + "loss": 2.1284, "step": 3452 }, { - "epoch": 1.04, - "grad_norm": 17.298931121826172, - "learning_rate": 1.3080084193645386e-05, - "loss": 1.5604, + "epoch": 0.43, + "grad_norm": 14.308854103088379, + "learning_rate": 1.711165962431494e-05, + "loss": 1.0683, "step": 3453 }, { - "epoch": 1.04, - "grad_norm": 10.650527954101562, - "learning_rate": 1.3078079583040994e-05, - "loss": 1.0794, + "epoch": 0.43, + "grad_norm": 15.169837951660156, + "learning_rate": 1.7110822909258253e-05, + "loss": 1.1417, "step": 3454 }, { - "epoch": 1.04, - "grad_norm": 16.008625030517578, - "learning_rate": 1.3076074972436606e-05, - "loss": 1.7114, + "epoch": 0.43, + "grad_norm": 8.071978569030762, + "learning_rate": 1.7109986194201567e-05, + "loss": 1.4408, "step": 3455 }, { - "epoch": 1.04, - "grad_norm": 15.615730285644531, - "learning_rate": 1.3074070361832216e-05, - "loss": 2.0135, + "epoch": 0.43, + "grad_norm": 17.008346557617188, + "learning_rate": 1.7109149479144877e-05, + "loss": 1.1823, "step": 3456 }, { - "epoch": 1.04, - "grad_norm": 17.91425132751465, - "learning_rate": 1.3072065751227824e-05, - "loss": 1.8132, + "epoch": 0.43, + "grad_norm": 29.824209213256836, + "learning_rate": 1.710831276408819e-05, + "loss": 2.3075, "step": 3457 }, { - "epoch": 1.04, - "grad_norm": 16.860435485839844, - "learning_rate": 1.3070061140623436e-05, - "loss": 1.9834, + "epoch": 0.43, + "grad_norm": 16.685344696044922, + "learning_rate": 1.7107476049031504e-05, + "loss": 3.901, "step": 3458 }, { - "epoch": 1.04, - "grad_norm": 12.117476463317871, - "learning_rate": 1.3068056530019045e-05, - "loss": 1.62, + "epoch": 0.43, + "grad_norm": 6.225277423858643, + "learning_rate": 1.7106639333974818e-05, + "loss": 0.1791, "step": 3459 }, { - "epoch": 1.04, - "grad_norm": 18.71920394897461, - "learning_rate": 1.3066051919414655e-05, - "loss": 2.2109, + "epoch": 0.43, + "grad_norm": 25.91197967529297, + "learning_rate": 1.710580261891813e-05, + "loss": 1.9825, "step": 3460 }, { - "epoch": 1.04, - "grad_norm": 18.306224822998047, - "learning_rate": 1.3064047308810263e-05, - "loss": 2.2506, + "epoch": 0.43, + "grad_norm": 19.51018714904785, + "learning_rate": 1.7104965903861442e-05, + "loss": 2.7875, "step": 3461 }, { - "epoch": 1.04, - "grad_norm": 27.221975326538086, - "learning_rate": 1.3062042698205875e-05, - "loss": 2.4873, + "epoch": 0.43, + "grad_norm": 14.964982986450195, + "learning_rate": 1.7104129188804756e-05, + "loss": 1.3888, "step": 3462 }, { - "epoch": 1.04, - "grad_norm": 31.973318099975586, - "learning_rate": 1.3060038087601485e-05, - "loss": 2.6493, + "epoch": 0.43, + "grad_norm": 12.790801048278809, + "learning_rate": 1.7103292473748066e-05, + "loss": 3.0152, "step": 3463 }, { - "epoch": 1.04, - "grad_norm": 10.039262771606445, - "learning_rate": 1.3058033476997093e-05, - "loss": 1.6217, + "epoch": 0.43, + "grad_norm": 22.963237762451172, + "learning_rate": 1.710245575869138e-05, + "loss": 2.8846, "step": 3464 }, { - "epoch": 1.04, - "grad_norm": 17.775131225585938, - "learning_rate": 1.3056028866392705e-05, - "loss": 1.6111, + "epoch": 0.43, + "grad_norm": 5.108229160308838, + "learning_rate": 1.7101619043634693e-05, + "loss": 2.2731, "step": 3465 }, { - "epoch": 1.04, - "grad_norm": 41.91878890991211, - "learning_rate": 1.3054024255788314e-05, - "loss": 2.4916, + "epoch": 0.43, + "grad_norm": 15.958724021911621, + "learning_rate": 1.7100782328578004e-05, + "loss": 0.1816, "step": 3466 }, { - "epoch": 1.04, - "grad_norm": 19.899276733398438, - "learning_rate": 1.3052019645183924e-05, - "loss": 2.2337, + "epoch": 0.44, + "grad_norm": 4.365947246551514, + "learning_rate": 1.7099945613521317e-05, + "loss": 1.2604, "step": 3467 }, { - "epoch": 1.04, - "grad_norm": 12.127092361450195, - "learning_rate": 1.3050015034579534e-05, - "loss": 1.9265, + "epoch": 0.44, + "grad_norm": 10.918221473693848, + "learning_rate": 1.709910889846463e-05, + "loss": 1.8209, "step": 3468 }, { - "epoch": 1.04, - "grad_norm": 28.322265625, - "learning_rate": 1.3048010423975144e-05, - "loss": 1.8237, + "epoch": 0.44, + "grad_norm": 26.80780601501465, + "learning_rate": 1.709827218340794e-05, + "loss": 2.0377, "step": 3469 }, { - "epoch": 1.04, - "grad_norm": 11.918156623840332, - "learning_rate": 1.3046005813370754e-05, - "loss": 2.1019, + "epoch": 0.44, + "grad_norm": 11.148392677307129, + "learning_rate": 1.7097435468351255e-05, + "loss": 1.6302, "step": 3470 }, { - "epoch": 1.04, - "grad_norm": 10.887943267822266, - "learning_rate": 1.3044001202766364e-05, - "loss": 2.118, + "epoch": 0.44, + "grad_norm": 11.951037406921387, + "learning_rate": 1.7096598753294565e-05, + "loss": 1.1111, "step": 3471 }, { - "epoch": 1.04, - "grad_norm": 14.735898971557617, - "learning_rate": 1.3041996592161974e-05, - "loss": 1.5392, + "epoch": 0.44, + "grad_norm": 5.3847975730896, + "learning_rate": 1.709576203823788e-05, + "loss": 0.1638, "step": 3472 }, { - "epoch": 1.04, - "grad_norm": 22.728878021240234, - "learning_rate": 1.3039991981557582e-05, - "loss": 1.9459, + "epoch": 0.44, + "grad_norm": 28.452150344848633, + "learning_rate": 1.7094925323181192e-05, + "loss": 5.5813, "step": 3473 }, { - "epoch": 1.04, - "grad_norm": 9.619507789611816, - "learning_rate": 1.3037987370953194e-05, - "loss": 1.1765, + "epoch": 0.44, + "grad_norm": 16.50628662109375, + "learning_rate": 1.7094088608124503e-05, + "loss": 2.004, "step": 3474 }, { - "epoch": 1.04, - "grad_norm": 13.704413414001465, - "learning_rate": 1.3035982760348804e-05, - "loss": 1.5395, + "epoch": 0.44, + "grad_norm": 27.793264389038086, + "learning_rate": 1.7093251893067816e-05, + "loss": 3.2009, "step": 3475 }, { - "epoch": 1.05, - "grad_norm": 13.936915397644043, - "learning_rate": 1.3033978149744413e-05, - "loss": 1.2055, + "epoch": 0.44, + "grad_norm": 16.817726135253906, + "learning_rate": 1.709241517801113e-05, + "loss": 1.409, "step": 3476 }, { - "epoch": 1.05, - "grad_norm": 23.567424774169922, - "learning_rate": 1.3031973539140024e-05, - "loss": 1.7821, + "epoch": 0.44, + "grad_norm": 17.985727310180664, + "learning_rate": 1.709157846295444e-05, + "loss": 1.8906, "step": 3477 }, { - "epoch": 1.05, - "grad_norm": 17.09320640563965, - "learning_rate": 1.3029968928535633e-05, - "loss": 2.4582, + "epoch": 0.44, + "grad_norm": 13.15288257598877, + "learning_rate": 1.7090741747897754e-05, + "loss": 2.555, "step": 3478 }, { - "epoch": 1.05, - "grad_norm": 12.296806335449219, - "learning_rate": 1.3027964317931243e-05, - "loss": 1.4382, + "epoch": 0.44, + "grad_norm": 24.595491409301758, + "learning_rate": 1.7089905032841067e-05, + "loss": 1.6678, "step": 3479 }, { - "epoch": 1.05, - "grad_norm": 12.535650253295898, - "learning_rate": 1.3025959707326851e-05, - "loss": 1.2141, - "step": 3480 - }, - { - "epoch": 1.05, - "eval_loss": 0.22333772480487823, - "eval_runtime": 43.6637, - "eval_samples_per_second": 33.873, - "eval_steps_per_second": 33.873, + "epoch": 0.44, + "grad_norm": 12.645052909851074, + "learning_rate": 1.708906831778438e-05, + "loss": 2.2637, "step": 3480 }, { - "epoch": 1.05, - "grad_norm": 8.423393249511719, - "learning_rate": 1.3023955096722463e-05, - "loss": 1.2467, + "epoch": 0.44, + "grad_norm": 9.597436904907227, + "learning_rate": 1.708823160272769e-05, + "loss": 2.2271, "step": 3481 }, { - "epoch": 1.05, - "grad_norm": 13.054819107055664, - "learning_rate": 1.3021950486118073e-05, - "loss": 1.2692, + "epoch": 0.44, + "grad_norm": 10.511176109313965, + "learning_rate": 1.7087394887671005e-05, + "loss": 1.7628, "step": 3482 }, { - "epoch": 1.05, - "grad_norm": 67.5218734741211, - "learning_rate": 1.3019945875513682e-05, - "loss": 2.0009, + "epoch": 0.44, + "grad_norm": 7.529833793640137, + "learning_rate": 1.708655817261432e-05, + "loss": 1.633, "step": 3483 }, { - "epoch": 1.05, - "grad_norm": 51.58919143676758, - "learning_rate": 1.3017941264909293e-05, - "loss": 3.2618, + "epoch": 0.44, + "grad_norm": 24.71387481689453, + "learning_rate": 1.708572145755763e-05, + "loss": 0.5247, "step": 3484 }, { - "epoch": 1.05, - "grad_norm": 24.077619552612305, - "learning_rate": 1.3015936654304902e-05, - "loss": 2.4753, + "epoch": 0.44, + "grad_norm": 28.796314239501953, + "learning_rate": 1.7084884742500943e-05, + "loss": 2.1186, "step": 3485 }, { - "epoch": 1.05, - "grad_norm": 25.09623146057129, - "learning_rate": 1.3013932043700512e-05, - "loss": 1.9661, + "epoch": 0.44, + "grad_norm": 17.669151306152344, + "learning_rate": 1.7084048027444256e-05, + "loss": 3.1068, "step": 3486 }, { - "epoch": 1.05, - "grad_norm": 33.02434539794922, - "learning_rate": 1.3011927433096124e-05, - "loss": 2.3949, + "epoch": 0.44, + "grad_norm": 4.942477703094482, + "learning_rate": 1.708321131238757e-05, + "loss": 0.2292, "step": 3487 }, { - "epoch": 1.05, - "grad_norm": 16.488039016723633, - "learning_rate": 1.3009922822491732e-05, - "loss": 2.4064, + "epoch": 0.44, + "grad_norm": 37.090293884277344, + "learning_rate": 1.708237459733088e-05, + "loss": 3.1592, "step": 3488 }, { - "epoch": 1.05, - "grad_norm": 24.7977237701416, - "learning_rate": 1.3007918211887342e-05, - "loss": 1.2102, + "epoch": 0.44, + "grad_norm": 12.799302101135254, + "learning_rate": 1.7081537882274194e-05, + "loss": 2.6949, "step": 3489 }, { - "epoch": 1.05, - "grad_norm": 16.36675262451172, - "learning_rate": 1.300591360128295e-05, - "loss": 1.4976, + "epoch": 0.44, + "grad_norm": 11.44233512878418, + "learning_rate": 1.7080701167217507e-05, + "loss": 1.5554, "step": 3490 }, { - "epoch": 1.05, - "grad_norm": 16.940837860107422, - "learning_rate": 1.3003908990678562e-05, - "loss": 1.9573, + "epoch": 0.44, + "grad_norm": 16.598114013671875, + "learning_rate": 1.7079864452160818e-05, + "loss": 2.6653, "step": 3491 }, { - "epoch": 1.05, - "grad_norm": 17.5327091217041, - "learning_rate": 1.300190438007417e-05, - "loss": 1.9867, + "epoch": 0.44, + "grad_norm": 19.33988380432129, + "learning_rate": 1.707902773710413e-05, + "loss": 3.1783, "step": 3492 }, { - "epoch": 1.05, - "grad_norm": 14.204058647155762, - "learning_rate": 1.299989976946978e-05, - "loss": 1.8624, + "epoch": 0.44, + "grad_norm": 12.424285888671875, + "learning_rate": 1.7078191022047445e-05, + "loss": 0.5065, "step": 3493 }, { - "epoch": 1.05, - "grad_norm": 41.586021423339844, - "learning_rate": 1.2997895158865392e-05, - "loss": 3.3605, + "epoch": 0.44, + "grad_norm": 26.837005615234375, + "learning_rate": 1.7077354306990755e-05, + "loss": 2.4939, "step": 3494 }, { - "epoch": 1.05, - "grad_norm": 18.94367218017578, - "learning_rate": 1.2995890548261001e-05, - "loss": 1.3739, + "epoch": 0.44, + "grad_norm": 28.414806365966797, + "learning_rate": 1.707651759193407e-05, + "loss": 2.4368, "step": 3495 }, { - "epoch": 1.05, - "grad_norm": 23.99241828918457, - "learning_rate": 1.2993885937656611e-05, - "loss": 2.0589, + "epoch": 0.44, + "grad_norm": 23.595781326293945, + "learning_rate": 1.707568087687738e-05, + "loss": 1.5891, "step": 3496 }, { - "epoch": 1.05, - "grad_norm": 11.752195358276367, - "learning_rate": 1.2991881327052221e-05, - "loss": 1.3307, + "epoch": 0.44, + "grad_norm": 11.007417678833008, + "learning_rate": 1.7074844161820693e-05, + "loss": 2.7005, "step": 3497 }, { - "epoch": 1.05, - "grad_norm": 16.153244018554688, - "learning_rate": 1.2989876716447831e-05, - "loss": 1.6504, + "epoch": 0.44, + "grad_norm": 14.353242874145508, + "learning_rate": 1.7074007446764006e-05, + "loss": 2.0073, "step": 3498 }, { - "epoch": 1.05, - "grad_norm": 15.57225513458252, - "learning_rate": 1.2987872105843441e-05, - "loss": 2.247, + "epoch": 0.44, + "grad_norm": 15.713659286499023, + "learning_rate": 1.7073170731707317e-05, + "loss": 1.9655, "step": 3499 }, { - "epoch": 1.05, - "grad_norm": 14.485071182250977, - "learning_rate": 1.2985867495239051e-05, - "loss": 1.7557, + "epoch": 0.44, + "grad_norm": 40.209938049316406, + "learning_rate": 1.707233401665063e-05, + "loss": 2.9224, "step": 3500 }, { - "epoch": 1.05, - "grad_norm": 8.054557800292969, - "learning_rate": 1.2983862884634661e-05, - "loss": 1.3367, + "epoch": 0.44, + "grad_norm": 11.3770170211792, + "learning_rate": 1.7071497301593944e-05, + "loss": 2.0867, "step": 3501 }, { - "epoch": 1.05, - "grad_norm": 19.42696189880371, - "learning_rate": 1.298185827403027e-05, - "loss": 1.7359, + "epoch": 0.44, + "grad_norm": 7.715713977813721, + "learning_rate": 1.7070660586537254e-05, + "loss": 0.7953, "step": 3502 }, { - "epoch": 1.05, - "grad_norm": 21.340444564819336, - "learning_rate": 1.2979853663425882e-05, - "loss": 2.8674, + "epoch": 0.44, + "grad_norm": 21.278911590576172, + "learning_rate": 1.7069823871480568e-05, + "loss": 1.9252, "step": 3503 }, { - "epoch": 1.05, - "grad_norm": 16.97898292541504, - "learning_rate": 1.297784905282149e-05, - "loss": 2.4501, + "epoch": 0.44, + "grad_norm": 20.147659301757812, + "learning_rate": 1.706898715642388e-05, + "loss": 2.7377, "step": 3504 }, { - "epoch": 1.05, - "grad_norm": 17.556812286376953, - "learning_rate": 1.29758444422171e-05, - "loss": 1.421, + "epoch": 0.44, + "grad_norm": 42.69921112060547, + "learning_rate": 1.7068150441367192e-05, + "loss": 2.561, "step": 3505 }, { - "epoch": 1.05, - "grad_norm": 12.702077865600586, - "learning_rate": 1.2973839831612712e-05, - "loss": 2.1779, + "epoch": 0.44, + "grad_norm": 8.338047981262207, + "learning_rate": 1.7067313726310505e-05, + "loss": 1.3805, "step": 3506 }, { - "epoch": 1.05, - "grad_norm": 17.422500610351562, - "learning_rate": 1.297183522100832e-05, - "loss": 2.1388, + "epoch": 0.44, + "grad_norm": 7.393989562988281, + "learning_rate": 1.706647701125382e-05, + "loss": 0.876, "step": 3507 }, { - "epoch": 1.05, - "grad_norm": 27.59862518310547, - "learning_rate": 1.296983061040393e-05, - "loss": 3.23, + "epoch": 0.44, + "grad_norm": 11.099953651428223, + "learning_rate": 1.7065640296197133e-05, + "loss": 1.3944, "step": 3508 }, { - "epoch": 1.06, - "grad_norm": 11.814319610595703, - "learning_rate": 1.2967825999799539e-05, - "loss": 1.5021, + "epoch": 0.44, + "grad_norm": 12.176742553710938, + "learning_rate": 1.7064803581140443e-05, + "loss": 2.0733, "step": 3509 }, { - "epoch": 1.06, - "grad_norm": 10.947737693786621, - "learning_rate": 1.296582138919515e-05, - "loss": 1.298, + "epoch": 0.44, + "grad_norm": 15.4348783493042, + "learning_rate": 1.7063966866083757e-05, + "loss": 2.6984, "step": 3510 }, { - "epoch": 1.06, - "grad_norm": 14.834168434143066, - "learning_rate": 1.2963816778590759e-05, - "loss": 1.6569, + "epoch": 0.44, + "grad_norm": 6.117305278778076, + "learning_rate": 1.706313015102707e-05, + "loss": 1.8807, "step": 3511 }, { - "epoch": 1.06, - "grad_norm": 27.23177719116211, - "learning_rate": 1.2961812167986369e-05, - "loss": 1.6755, + "epoch": 0.44, + "grad_norm": 11.953407287597656, + "learning_rate": 1.706229343597038e-05, + "loss": 1.2245, "step": 3512 }, { - "epoch": 1.06, - "grad_norm": 16.08486557006836, - "learning_rate": 1.295980755738198e-05, - "loss": 1.8572, + "epoch": 0.44, + "grad_norm": 29.934341430664062, + "learning_rate": 1.7061456720913694e-05, + "loss": 2.3307, "step": 3513 }, { - "epoch": 1.06, - "grad_norm": 23.723173141479492, - "learning_rate": 1.2957802946777589e-05, - "loss": 2.7123, + "epoch": 0.44, + "grad_norm": 29.04018783569336, + "learning_rate": 1.7060620005857008e-05, + "loss": 2.1927, "step": 3514 }, { - "epoch": 1.06, - "grad_norm": 15.125565528869629, - "learning_rate": 1.29557983361732e-05, - "loss": 2.5594, + "epoch": 0.44, + "grad_norm": 16.454509735107422, + "learning_rate": 1.705978329080032e-05, + "loss": 2.3173, "step": 3515 }, { - "epoch": 1.06, - "grad_norm": 25.76398468017578, - "learning_rate": 1.295379372556881e-05, - "loss": 1.7984, + "epoch": 0.44, + "grad_norm": 16.549463272094727, + "learning_rate": 1.7058946575743632e-05, + "loss": 1.8457, "step": 3516 }, { - "epoch": 1.06, - "grad_norm": 18.840059280395508, - "learning_rate": 1.295178911496442e-05, - "loss": 2.232, + "epoch": 0.44, + "grad_norm": 9.819560050964355, + "learning_rate": 1.7058109860686945e-05, + "loss": 2.0945, "step": 3517 }, { - "epoch": 1.06, - "grad_norm": 35.28987503051758, - "learning_rate": 1.294978450436003e-05, - "loss": 2.267, + "epoch": 0.44, + "grad_norm": 14.210806846618652, + "learning_rate": 1.705727314563026e-05, + "loss": 2.162, "step": 3518 }, { - "epoch": 1.06, - "grad_norm": 7.588108062744141, - "learning_rate": 1.294777989375564e-05, - "loss": 1.2528, + "epoch": 0.44, + "grad_norm": 37.674293518066406, + "learning_rate": 1.705643643057357e-05, + "loss": 1.5869, "step": 3519 }, { - "epoch": 1.06, - "grad_norm": 13.926599502563477, - "learning_rate": 1.294577528315125e-05, - "loss": 1.7358, + "epoch": 0.44, + "grad_norm": 11.009414672851562, + "learning_rate": 1.7055599715516883e-05, + "loss": 1.1545, "step": 3520 }, { - "epoch": 1.06, - "grad_norm": 16.25490951538086, - "learning_rate": 1.2943770672546858e-05, - "loss": 2.2304, + "epoch": 0.44, + "grad_norm": 21.234272003173828, + "learning_rate": 1.7054763000460197e-05, + "loss": 2.4492, "step": 3521 }, { - "epoch": 1.06, - "grad_norm": 30.820024490356445, - "learning_rate": 1.294176606194247e-05, - "loss": 2.0343, + "epoch": 0.44, + "grad_norm": 19.242658615112305, + "learning_rate": 1.7053926285403507e-05, + "loss": 2.7148, "step": 3522 }, { - "epoch": 1.06, - "grad_norm": 15.902755737304688, - "learning_rate": 1.2939761451338078e-05, - "loss": 1.5815, + "epoch": 0.44, + "grad_norm": 16.382221221923828, + "learning_rate": 1.705308957034682e-05, + "loss": 2.6571, "step": 3523 }, { - "epoch": 1.06, - "grad_norm": 22.500484466552734, - "learning_rate": 1.2937756840733688e-05, - "loss": 2.0932, + "epoch": 0.44, + "grad_norm": 5.982744216918945, + "learning_rate": 1.705225285529013e-05, + "loss": 2.6794, "step": 3524 }, { - "epoch": 1.06, - "grad_norm": 11.80160903930664, - "learning_rate": 1.29357522301293e-05, - "loss": 1.8902, + "epoch": 0.44, + "grad_norm": 15.731637001037598, + "learning_rate": 1.7051416140233444e-05, + "loss": 1.7883, "step": 3525 }, { - "epoch": 1.06, - "grad_norm": 24.187225341796875, - "learning_rate": 1.2933747619524908e-05, - "loss": 1.9731, + "epoch": 0.44, + "grad_norm": 15.043095588684082, + "learning_rate": 1.7050579425176758e-05, + "loss": 1.3555, "step": 3526 }, { - "epoch": 1.06, - "grad_norm": 13.519028663635254, - "learning_rate": 1.2931743008920519e-05, - "loss": 1.9801, + "epoch": 0.44, + "grad_norm": 15.139749526977539, + "learning_rate": 1.704974271012007e-05, + "loss": 4.0904, "step": 3527 }, { - "epoch": 1.06, - "grad_norm": 54.62118911743164, - "learning_rate": 1.2929738398316127e-05, - "loss": 3.4219, + "epoch": 0.44, + "grad_norm": 13.007729530334473, + "learning_rate": 1.7048905995063382e-05, + "loss": 1.2369, "step": 3528 }, { - "epoch": 1.06, - "grad_norm": 12.734724044799805, - "learning_rate": 1.2927733787711739e-05, - "loss": 2.3175, + "epoch": 0.44, + "grad_norm": 9.70779037475586, + "learning_rate": 1.7048069280006696e-05, + "loss": 1.2312, "step": 3529 }, { - "epoch": 1.06, - "grad_norm": 15.261761665344238, - "learning_rate": 1.2925729177107349e-05, - "loss": 2.6037, + "epoch": 0.44, + "grad_norm": 12.419896125793457, + "learning_rate": 1.7047232564950006e-05, + "loss": 2.3041, "step": 3530 }, { - "epoch": 1.06, - "grad_norm": 19.809280395507812, - "learning_rate": 1.2923724566502957e-05, - "loss": 1.9901, + "epoch": 0.44, + "grad_norm": 6.6420698165893555, + "learning_rate": 1.704639584989332e-05, + "loss": 0.5756, "step": 3531 }, { - "epoch": 1.06, - "grad_norm": 18.573335647583008, - "learning_rate": 1.2921719955898569e-05, - "loss": 2.4825, + "epoch": 0.44, + "grad_norm": 9.08359146118164, + "learning_rate": 1.7045559134836633e-05, + "loss": 1.7639, "step": 3532 }, { - "epoch": 1.06, - "grad_norm": 17.13604736328125, - "learning_rate": 1.2919715345294177e-05, - "loss": 1.3552, + "epoch": 0.44, + "grad_norm": 11.831747055053711, + "learning_rate": 1.7044722419779943e-05, + "loss": 0.6694, "step": 3533 }, { - "epoch": 1.06, - "grad_norm": 14.92817497253418, - "learning_rate": 1.2917710734689787e-05, - "loss": 2.1547, + "epoch": 0.44, + "grad_norm": 29.910619735717773, + "learning_rate": 1.7043885704723257e-05, + "loss": 2.3868, "step": 3534 }, { - "epoch": 1.06, - "grad_norm": 26.906400680541992, - "learning_rate": 1.2915706124085396e-05, - "loss": 1.741, + "epoch": 0.44, + "grad_norm": 11.793986320495605, + "learning_rate": 1.704304898966657e-05, + "loss": 2.742, "step": 3535 }, { - "epoch": 1.06, - "grad_norm": 17.950481414794922, - "learning_rate": 1.2913701513481008e-05, - "loss": 1.7533, + "epoch": 0.44, + "grad_norm": 15.16977310180664, + "learning_rate": 1.7042212274609884e-05, + "loss": 3.2701, "step": 3536 }, { - "epoch": 1.06, - "grad_norm": 18.598140716552734, - "learning_rate": 1.2911696902876618e-05, - "loss": 1.9817, + "epoch": 0.44, + "grad_norm": 25.637859344482422, + "learning_rate": 1.7041375559553195e-05, + "loss": 2.5424, "step": 3537 }, { - "epoch": 1.06, - "grad_norm": 20.901409149169922, - "learning_rate": 1.2909692292272226e-05, - "loss": 1.8785, + "epoch": 0.44, + "grad_norm": 15.422876358032227, + "learning_rate": 1.7040538844496508e-05, + "loss": 2.5337, "step": 3538 }, { - "epoch": 1.06, - "grad_norm": 13.569160461425781, - "learning_rate": 1.2907687681667838e-05, - "loss": 2.0731, + "epoch": 0.44, + "grad_norm": 24.96222496032715, + "learning_rate": 1.7039702129439822e-05, + "loss": 1.9194, "step": 3539 }, { - "epoch": 1.06, - "grad_norm": 16.909326553344727, - "learning_rate": 1.2905683071063446e-05, - "loss": 1.912, + "epoch": 0.44, + "grad_norm": 26.966215133666992, + "learning_rate": 1.7038865414383132e-05, + "loss": 1.4356, "step": 3540 }, { - "epoch": 1.06, - "grad_norm": 15.05936050415039, - "learning_rate": 1.2903678460459056e-05, - "loss": 3.6215, + "epoch": 0.44, + "grad_norm": 9.480104446411133, + "learning_rate": 1.7038028699326446e-05, + "loss": 1.5081, "step": 3541 }, { - "epoch": 1.06, - "grad_norm": 12.6626615524292, - "learning_rate": 1.2901673849854668e-05, - "loss": 1.6078, + "epoch": 0.44, + "grad_norm": 6.7902445793151855, + "learning_rate": 1.703719198426976e-05, + "loss": 0.9859, "step": 3542 }, { - "epoch": 1.07, - "grad_norm": 12.250699043273926, - "learning_rate": 1.2899669239250276e-05, - "loss": 1.9456, + "epoch": 0.44, + "grad_norm": 7.768956184387207, + "learning_rate": 1.7036355269213073e-05, + "loss": 1.11, "step": 3543 }, { - "epoch": 1.07, - "grad_norm": 28.121313095092773, - "learning_rate": 1.2897664628645887e-05, - "loss": 2.6878, + "epoch": 0.44, + "grad_norm": 11.356424331665039, + "learning_rate": 1.7035518554156383e-05, + "loss": 2.138, "step": 3544 }, { - "epoch": 1.07, - "grad_norm": 21.93025779724121, - "learning_rate": 1.2895660018041497e-05, - "loss": 2.364, + "epoch": 0.44, + "grad_norm": 16.411840438842773, + "learning_rate": 1.7034681839099697e-05, + "loss": 1.1534, "step": 3545 }, { - "epoch": 1.07, - "grad_norm": 11.087515830993652, - "learning_rate": 1.2893655407437107e-05, - "loss": 2.0794, + "epoch": 0.45, + "grad_norm": 17.7725830078125, + "learning_rate": 1.703384512404301e-05, + "loss": 2.2293, "step": 3546 }, { - "epoch": 1.07, - "grad_norm": 38.5289421081543, - "learning_rate": 1.2891650796832715e-05, - "loss": 2.4577, + "epoch": 0.45, + "grad_norm": 12.674851417541504, + "learning_rate": 1.703300840898632e-05, + "loss": 1.0665, "step": 3547 }, { - "epoch": 1.07, - "grad_norm": 29.22125816345215, - "learning_rate": 1.2889646186228327e-05, - "loss": 2.4881, + "epoch": 0.45, + "grad_norm": 6.166348457336426, + "learning_rate": 1.7032171693929635e-05, + "loss": 1.0158, "step": 3548 }, { - "epoch": 1.07, - "grad_norm": 25.916982650756836, - "learning_rate": 1.2887641575623937e-05, - "loss": 1.4721, + "epoch": 0.45, + "grad_norm": 4.511837482452393, + "learning_rate": 1.7031334978872945e-05, + "loss": 0.0695, "step": 3549 }, { - "epoch": 1.07, - "grad_norm": 20.641904830932617, - "learning_rate": 1.2885636965019545e-05, - "loss": 1.6015, + "epoch": 0.45, + "grad_norm": 3.8872060775756836, + "learning_rate": 1.703049826381626e-05, + "loss": 0.1033, "step": 3550 }, { - "epoch": 1.07, - "grad_norm": 14.579924583435059, - "learning_rate": 1.2883632354415157e-05, - "loss": 1.638, + "epoch": 0.45, + "grad_norm": 19.17310905456543, + "learning_rate": 1.7029661548759572e-05, + "loss": 3.2545, "step": 3551 }, { - "epoch": 1.07, - "grad_norm": 36.379661560058594, - "learning_rate": 1.2881627743810766e-05, - "loss": 2.2669, + "epoch": 0.45, + "grad_norm": 3.6682071685791016, + "learning_rate": 1.7028824833702882e-05, + "loss": 0.087, "step": 3552 }, { - "epoch": 1.07, - "grad_norm": 16.51966094970703, - "learning_rate": 1.2879623133206376e-05, - "loss": 1.7893, + "epoch": 0.45, + "grad_norm": 5.926275730133057, + "learning_rate": 1.7027988118646196e-05, + "loss": 0.7951, "step": 3553 }, { - "epoch": 1.07, - "grad_norm": 23.127784729003906, - "learning_rate": 1.2877618522601984e-05, - "loss": 2.266, + "epoch": 0.45, + "grad_norm": 6.377835273742676, + "learning_rate": 1.7027151403589506e-05, + "loss": 1.3846, "step": 3554 }, { - "epoch": 1.07, - "grad_norm": 17.028398513793945, - "learning_rate": 1.2875613911997596e-05, - "loss": 1.6085, + "epoch": 0.45, + "grad_norm": 6.7998127937316895, + "learning_rate": 1.702631468853282e-05, + "loss": 1.9433, "step": 3555 }, { - "epoch": 1.07, - "grad_norm": 9.327510833740234, - "learning_rate": 1.2873609301393206e-05, - "loss": 1.155, + "epoch": 0.45, + "grad_norm": 10.768909454345703, + "learning_rate": 1.7025477973476134e-05, + "loss": 1.9161, "step": 3556 }, { - "epoch": 1.07, - "grad_norm": 9.496240615844727, - "learning_rate": 1.2871604690788814e-05, - "loss": 1.3495, + "epoch": 0.45, + "grad_norm": 19.759462356567383, + "learning_rate": 1.7024641258419447e-05, + "loss": 2.5175, "step": 3557 }, { - "epoch": 1.07, - "grad_norm": 10.282184600830078, - "learning_rate": 1.2869600080184426e-05, - "loss": 1.4117, + "epoch": 0.45, + "grad_norm": 15.046670913696289, + "learning_rate": 1.7023804543362758e-05, + "loss": 3.0608, "step": 3558 }, { - "epoch": 1.07, - "grad_norm": 15.490508079528809, - "learning_rate": 1.2867595469580034e-05, - "loss": 1.5863, + "epoch": 0.45, + "grad_norm": 14.481589317321777, + "learning_rate": 1.702296782830607e-05, + "loss": 1.6803, "step": 3559 }, { - "epoch": 1.07, - "grad_norm": 17.8187198638916, - "learning_rate": 1.2865590858975645e-05, - "loss": 1.2819, + "epoch": 0.45, + "grad_norm": 12.190134048461914, + "learning_rate": 1.7022131113249385e-05, + "loss": 1.4591, "step": 3560 }, { - "epoch": 1.07, - "grad_norm": 17.21719741821289, - "learning_rate": 1.2863586248371256e-05, - "loss": 2.0228, + "epoch": 0.45, + "grad_norm": 3.3954739570617676, + "learning_rate": 1.7021294398192695e-05, + "loss": 1.704, "step": 3561 }, { - "epoch": 1.07, - "grad_norm": 23.23194694519043, - "learning_rate": 1.2861581637766865e-05, - "loss": 1.858, + "epoch": 0.45, + "grad_norm": 10.886055946350098, + "learning_rate": 1.702045768313601e-05, + "loss": 1.1718, "step": 3562 }, { - "epoch": 1.07, - "grad_norm": 25.972341537475586, - "learning_rate": 1.2859577027162475e-05, - "loss": 1.3268, + "epoch": 0.45, + "grad_norm": 14.209492683410645, + "learning_rate": 1.7019620968079322e-05, + "loss": 2.3727, "step": 3563 }, { - "epoch": 1.07, - "grad_norm": 27.290193557739258, - "learning_rate": 1.2857572416558083e-05, - "loss": 2.174, + "epoch": 0.45, + "grad_norm": 15.119983673095703, + "learning_rate": 1.7018784253022636e-05, + "loss": 1.0797, "step": 3564 }, { - "epoch": 1.07, - "grad_norm": 14.838099479675293, - "learning_rate": 1.2855567805953695e-05, - "loss": 1.9506, + "epoch": 0.45, + "grad_norm": 20.525146484375, + "learning_rate": 1.7017947537965946e-05, + "loss": 2.7043, "step": 3565 }, { - "epoch": 1.07, - "grad_norm": 8.05460262298584, - "learning_rate": 1.2853563195349303e-05, - "loss": 1.2423, + "epoch": 0.45, + "grad_norm": 20.406822204589844, + "learning_rate": 1.701711082290926e-05, + "loss": 2.5528, "step": 3566 }, { - "epoch": 1.07, - "grad_norm": 10.019596099853516, - "learning_rate": 1.2851558584744913e-05, - "loss": 1.405, + "epoch": 0.45, + "grad_norm": 12.409043312072754, + "learning_rate": 1.7016274107852574e-05, + "loss": 1.8563, "step": 3567 }, { - "epoch": 1.07, - "grad_norm": 16.532649993896484, - "learning_rate": 1.2849553974140525e-05, - "loss": 1.5799, + "epoch": 0.45, + "grad_norm": 8.90954303741455, + "learning_rate": 1.7015437392795884e-05, + "loss": 1.0523, "step": 3568 }, { - "epoch": 1.07, - "grad_norm": 14.281306266784668, - "learning_rate": 1.2847549363536134e-05, - "loss": 1.1582, + "epoch": 0.45, + "grad_norm": 25.932857513427734, + "learning_rate": 1.7014600677739198e-05, + "loss": 1.5185, "step": 3569 }, { - "epoch": 1.07, - "grad_norm": 9.972299575805664, - "learning_rate": 1.2845544752931745e-05, - "loss": 1.2031, + "epoch": 0.45, + "grad_norm": 21.852935791015625, + "learning_rate": 1.701376396268251e-05, + "loss": 2.45, "step": 3570 }, { - "epoch": 1.07, - "grad_norm": 14.451814651489258, - "learning_rate": 1.2843540142327354e-05, - "loss": 1.5313, + "epoch": 0.45, + "grad_norm": 18.34440803527832, + "learning_rate": 1.7012927247625825e-05, + "loss": 2.2476, "step": 3571 }, { - "epoch": 1.07, - "grad_norm": 15.139348030090332, - "learning_rate": 1.2841535531722964e-05, - "loss": 1.9977, + "epoch": 0.45, + "grad_norm": 8.53489875793457, + "learning_rate": 1.7012090532569135e-05, + "loss": 0.7178, "step": 3572 }, { - "epoch": 1.07, - "grad_norm": 25.56646156311035, - "learning_rate": 1.2839530921118576e-05, - "loss": 2.2934, + "epoch": 0.45, + "grad_norm": 14.003607749938965, + "learning_rate": 1.701125381751245e-05, + "loss": 1.3264, "step": 3573 }, { - "epoch": 1.07, - "grad_norm": 11.112027168273926, - "learning_rate": 1.2837526310514184e-05, - "loss": 1.5678, + "epoch": 0.45, + "grad_norm": 22.757652282714844, + "learning_rate": 1.7010417102455762e-05, + "loss": 2.5163, "step": 3574 }, { - "epoch": 1.07, - "grad_norm": 14.32687759399414, - "learning_rate": 1.2835521699909794e-05, - "loss": 1.1919, + "epoch": 0.45, + "grad_norm": 9.419745445251465, + "learning_rate": 1.7009580387399073e-05, + "loss": 0.6182, "step": 3575 }, { - "epoch": 1.08, - "grad_norm": 11.444255828857422, - "learning_rate": 1.2833517089305402e-05, - "loss": 2.2952, + "epoch": 0.45, + "grad_norm": 12.34254264831543, + "learning_rate": 1.7008743672342386e-05, + "loss": 2.8273, "step": 3576 }, { - "epoch": 1.08, - "grad_norm": 34.31683349609375, - "learning_rate": 1.2831512478701014e-05, - "loss": 1.8054, + "epoch": 0.45, + "grad_norm": 6.758653163909912, + "learning_rate": 1.7007906957285697e-05, + "loss": 0.4665, "step": 3577 }, { - "epoch": 1.08, - "grad_norm": 16.4061336517334, - "learning_rate": 1.2829507868096623e-05, - "loss": 1.1196, + "epoch": 0.45, + "grad_norm": 11.435264587402344, + "learning_rate": 1.700707024222901e-05, + "loss": 1.9646, "step": 3578 }, { - "epoch": 1.08, - "grad_norm": 12.305190086364746, - "learning_rate": 1.2827503257492233e-05, - "loss": 2.2021, + "epoch": 0.45, + "grad_norm": 8.873312950134277, + "learning_rate": 1.7006233527172324e-05, + "loss": 2.4812, "step": 3579 }, { - "epoch": 1.08, - "grad_norm": 16.25429916381836, - "learning_rate": 1.2825498646887845e-05, - "loss": 1.629, + "epoch": 0.45, + "grad_norm": 15.715291976928711, + "learning_rate": 1.7005396812115634e-05, + "loss": 1.6792, "step": 3580 }, { - "epoch": 1.08, - "grad_norm": 25.61264419555664, - "learning_rate": 1.2823494036283453e-05, - "loss": 2.7876, + "epoch": 0.45, + "grad_norm": 23.683542251586914, + "learning_rate": 1.7004560097058948e-05, + "loss": 1.7782, "step": 3581 }, { - "epoch": 1.08, - "grad_norm": 20.26491928100586, - "learning_rate": 1.2821489425679063e-05, - "loss": 1.1916, + "epoch": 0.45, + "grad_norm": 36.01570129394531, + "learning_rate": 1.7003723382002258e-05, + "loss": 1.6319, "step": 3582 }, { - "epoch": 1.08, - "grad_norm": 10.56248950958252, - "learning_rate": 1.2819484815074671e-05, - "loss": 1.302, + "epoch": 0.45, + "grad_norm": 13.715401649475098, + "learning_rate": 1.700288666694557e-05, + "loss": 3.0127, "step": 3583 }, { - "epoch": 1.08, - "grad_norm": 15.701934814453125, - "learning_rate": 1.2817480204470283e-05, - "loss": 1.5957, + "epoch": 0.45, + "grad_norm": 87.07559204101562, + "learning_rate": 1.7002049951888885e-05, + "loss": 3.2245, "step": 3584 }, { - "epoch": 1.08, - "grad_norm": 17.989017486572266, - "learning_rate": 1.2815475593865893e-05, - "loss": 1.2969, + "epoch": 0.45, + "grad_norm": 18.235490798950195, + "learning_rate": 1.70012132368322e-05, + "loss": 3.6731, "step": 3585 }, { - "epoch": 1.08, - "grad_norm": 22.148818969726562, - "learning_rate": 1.2813470983261502e-05, - "loss": 2.5261, + "epoch": 0.45, + "grad_norm": 16.872243881225586, + "learning_rate": 1.700037652177551e-05, + "loss": 3.3169, "step": 3586 }, { - "epoch": 1.08, - "grad_norm": 26.945255279541016, - "learning_rate": 1.2811466372657113e-05, - "loss": 1.8008, + "epoch": 0.45, + "grad_norm": 21.76947784423828, + "learning_rate": 1.6999539806718823e-05, + "loss": 1.0939, "step": 3587 }, { - "epoch": 1.08, - "grad_norm": 13.003181457519531, - "learning_rate": 1.2809461762052722e-05, - "loss": 1.4448, + "epoch": 0.45, + "grad_norm": 9.35506820678711, + "learning_rate": 1.6998703091662137e-05, + "loss": 1.6434, "step": 3588 }, { - "epoch": 1.08, - "grad_norm": 13.033709526062012, - "learning_rate": 1.2807457151448332e-05, - "loss": 2.1353, + "epoch": 0.45, + "grad_norm": 8.141793251037598, + "learning_rate": 1.6997866376605447e-05, + "loss": 2.0123, "step": 3589 }, { - "epoch": 1.08, - "grad_norm": 16.16973876953125, - "learning_rate": 1.2805452540843942e-05, - "loss": 1.847, + "epoch": 0.45, + "grad_norm": 14.612222671508789, + "learning_rate": 1.699702966154876e-05, + "loss": 2.2924, "step": 3590 }, { - "epoch": 1.08, - "grad_norm": 14.109622955322266, - "learning_rate": 1.2803447930239552e-05, - "loss": 2.0083, + "epoch": 0.45, + "grad_norm": 6.75926399230957, + "learning_rate": 1.6996192946492074e-05, + "loss": 1.8664, "step": 3591 }, { - "epoch": 1.08, - "grad_norm": 13.897767066955566, - "learning_rate": 1.2801443319635162e-05, - "loss": 1.2688, + "epoch": 0.45, + "grad_norm": 30.943069458007812, + "learning_rate": 1.6995356231435388e-05, + "loss": 2.8354, "step": 3592 }, { - "epoch": 1.08, - "grad_norm": 18.122596740722656, - "learning_rate": 1.2799438709030772e-05, - "loss": 1.9293, + "epoch": 0.45, + "grad_norm": 13.30562686920166, + "learning_rate": 1.6994519516378698e-05, + "loss": 1.5587, "step": 3593 }, { - "epoch": 1.08, - "grad_norm": 41.87525939941406, - "learning_rate": 1.2797434098426382e-05, - "loss": 3.1812, + "epoch": 0.45, + "grad_norm": 17.608930587768555, + "learning_rate": 1.699368280132201e-05, + "loss": 1.6954, "step": 3594 }, { - "epoch": 1.08, - "grad_norm": 22.537267684936523, - "learning_rate": 1.279542948782199e-05, - "loss": 1.9805, + "epoch": 0.45, + "grad_norm": 13.332588195800781, + "learning_rate": 1.6992846086265325e-05, + "loss": 2.9673, "step": 3595 }, { - "epoch": 1.08, - "grad_norm": 16.393918991088867, - "learning_rate": 1.2793424877217602e-05, - "loss": 1.6751, + "epoch": 0.45, + "grad_norm": 18.777090072631836, + "learning_rate": 1.6992009371208636e-05, + "loss": 2.5276, "step": 3596 }, { - "epoch": 1.08, - "grad_norm": 15.888949394226074, - "learning_rate": 1.2791420266613211e-05, - "loss": 2.2047, + "epoch": 0.45, + "grad_norm": 8.829140663146973, + "learning_rate": 1.699117265615195e-05, + "loss": 2.4567, "step": 3597 }, { - "epoch": 1.08, - "grad_norm": 13.120323181152344, - "learning_rate": 1.2789415656008821e-05, - "loss": 1.527, + "epoch": 0.45, + "grad_norm": 11.514774322509766, + "learning_rate": 1.6990335941095263e-05, + "loss": 0.7589, "step": 3598 }, { - "epoch": 1.08, - "grad_norm": 14.314374923706055, - "learning_rate": 1.2787411045404433e-05, - "loss": 2.3968, + "epoch": 0.45, + "grad_norm": 17.70444679260254, + "learning_rate": 1.6989499226038577e-05, + "loss": 1.0188, "step": 3599 }, { - "epoch": 1.08, - "grad_norm": 20.696285247802734, - "learning_rate": 1.2785406434800041e-05, - "loss": 1.8987, + "epoch": 0.45, + "grad_norm": 3.07735276222229, + "learning_rate": 1.6988662510981887e-05, + "loss": 0.0933, "step": 3600 }, { - "epoch": 1.08, - "eval_loss": 0.23791223764419556, - "eval_runtime": 43.6058, - "eval_samples_per_second": 33.917, - "eval_steps_per_second": 33.917, + "epoch": 0.45, + "eval_loss": 0.15236322581768036, + "eval_runtime": 93.9952, + "eval_samples_per_second": 37.683, + "eval_steps_per_second": 37.683, "step": 3600 }, { - "epoch": 1.08, - "grad_norm": 24.901500701904297, - "learning_rate": 1.2783401824195651e-05, - "loss": 1.9859, + "epoch": 0.45, + "grad_norm": 12.646539688110352, + "learning_rate": 1.69878257959252e-05, + "loss": 2.7868, "step": 3601 }, { - "epoch": 1.08, - "grad_norm": 15.03045654296875, - "learning_rate": 1.278139721359126e-05, - "loss": 1.6967, + "epoch": 0.45, + "grad_norm": 16.227340698242188, + "learning_rate": 1.698698908086851e-05, + "loss": 2.7049, "step": 3602 }, { - "epoch": 1.08, - "grad_norm": 16.57175064086914, - "learning_rate": 1.2779392602986871e-05, - "loss": 1.8149, + "epoch": 0.45, + "grad_norm": 12.179350852966309, + "learning_rate": 1.6986152365811824e-05, + "loss": 1.3154, "step": 3603 }, { - "epoch": 1.08, - "grad_norm": 15.594796180725098, - "learning_rate": 1.2777387992382481e-05, - "loss": 1.4157, + "epoch": 0.45, + "grad_norm": 28.343910217285156, + "learning_rate": 1.6985315650755138e-05, + "loss": 3.3812, "step": 3604 }, { - "epoch": 1.08, - "grad_norm": 14.430771827697754, - "learning_rate": 1.277538338177809e-05, - "loss": 2.3501, + "epoch": 0.45, + "grad_norm": 11.430498123168945, + "learning_rate": 1.6984478935698448e-05, + "loss": 4.2946, "step": 3605 }, { - "epoch": 1.08, - "grad_norm": 38.617862701416016, - "learning_rate": 1.2773378771173702e-05, - "loss": 1.98, + "epoch": 0.45, + "grad_norm": 23.60525131225586, + "learning_rate": 1.6983642220641762e-05, + "loss": 3.3857, "step": 3606 }, { - "epoch": 1.08, - "grad_norm": 19.550251007080078, - "learning_rate": 1.277137416056931e-05, - "loss": 1.2882, + "epoch": 0.45, + "grad_norm": 18.278339385986328, + "learning_rate": 1.6982805505585072e-05, + "loss": 1.5645, "step": 3607 }, { - "epoch": 1.08, - "grad_norm": 19.823959350585938, - "learning_rate": 1.276936954996492e-05, - "loss": 2.7407, + "epoch": 0.45, + "grad_norm": 26.03818130493164, + "learning_rate": 1.6981968790528386e-05, + "loss": 3.5829, "step": 3608 }, { - "epoch": 1.09, - "grad_norm": 46.88768005371094, - "learning_rate": 1.2767364939360528e-05, - "loss": 1.9532, + "epoch": 0.45, + "grad_norm": 24.34980583190918, + "learning_rate": 1.69811320754717e-05, + "loss": 2.2011, "step": 3609 }, { - "epoch": 1.09, - "grad_norm": 15.034628868103027, - "learning_rate": 1.276536032875614e-05, - "loss": 1.2125, + "epoch": 0.45, + "grad_norm": 30.7168025970459, + "learning_rate": 1.698029536041501e-05, + "loss": 0.6755, "step": 3610 }, { - "epoch": 1.09, - "grad_norm": 24.965927124023438, - "learning_rate": 1.276335571815175e-05, - "loss": 2.4233, + "epoch": 0.45, + "grad_norm": 12.209492683410645, + "learning_rate": 1.6979458645358323e-05, + "loss": 2.1438, "step": 3611 }, { - "epoch": 1.09, - "grad_norm": 26.463823318481445, - "learning_rate": 1.2761351107547359e-05, - "loss": 2.3175, + "epoch": 0.45, + "grad_norm": 8.702494621276855, + "learning_rate": 1.6978621930301637e-05, + "loss": 1.2021, "step": 3612 }, { - "epoch": 1.09, - "grad_norm": 15.110565185546875, - "learning_rate": 1.275934649694297e-05, - "loss": 2.4443, + "epoch": 0.45, + "grad_norm": 9.897283554077148, + "learning_rate": 1.697778521524495e-05, + "loss": 1.2587, "step": 3613 }, { - "epoch": 1.09, - "grad_norm": 18.711685180664062, - "learning_rate": 1.2757341886338579e-05, - "loss": 1.5244, + "epoch": 0.45, + "grad_norm": 22.805618286132812, + "learning_rate": 1.697694850018826e-05, + "loss": 0.8335, "step": 3614 }, { - "epoch": 1.09, - "grad_norm": 21.489322662353516, - "learning_rate": 1.2755337275734189e-05, - "loss": 1.9058, + "epoch": 0.45, + "grad_norm": 49.473880767822266, + "learning_rate": 1.6976111785131575e-05, + "loss": 1.9427, "step": 3615 }, { - "epoch": 1.09, - "grad_norm": 15.04526424407959, - "learning_rate": 1.27533326651298e-05, - "loss": 1.4516, + "epoch": 0.45, + "grad_norm": 55.079551696777344, + "learning_rate": 1.6975275070074888e-05, + "loss": 1.6532, "step": 3616 }, { - "epoch": 1.09, - "grad_norm": 12.51637077331543, - "learning_rate": 1.2751328054525409e-05, - "loss": 1.7847, + "epoch": 0.45, + "grad_norm": 12.680688858032227, + "learning_rate": 1.69744383550182e-05, + "loss": 2.6464, "step": 3617 }, { - "epoch": 1.09, - "grad_norm": 29.387882232666016, - "learning_rate": 1.274932344392102e-05, - "loss": 2.5068, + "epoch": 0.45, + "grad_norm": 27.3753662109375, + "learning_rate": 1.6973601639961512e-05, + "loss": 3.3583, "step": 3618 }, { - "epoch": 1.09, - "grad_norm": 18.071352005004883, - "learning_rate": 1.274731883331663e-05, - "loss": 2.6933, + "epoch": 0.45, + "grad_norm": 4.351609706878662, + "learning_rate": 1.6972764924904826e-05, + "loss": 0.6152, "step": 3619 }, { - "epoch": 1.09, - "grad_norm": 20.221973419189453, - "learning_rate": 1.274531422271224e-05, - "loss": 1.3898, + "epoch": 0.45, + "grad_norm": 14.723485946655273, + "learning_rate": 1.697192820984814e-05, + "loss": 2.1768, "step": 3620 }, { - "epoch": 1.09, - "grad_norm": 8.028789520263672, - "learning_rate": 1.2743309612107848e-05, - "loss": 1.2193, + "epoch": 0.45, + "grad_norm": 43.82529830932617, + "learning_rate": 1.697109149479145e-05, + "loss": 1.5029, "step": 3621 }, { - "epoch": 1.09, - "grad_norm": 14.530386924743652, - "learning_rate": 1.274130500150346e-05, - "loss": 2.0765, + "epoch": 0.45, + "grad_norm": 14.625968933105469, + "learning_rate": 1.6970254779734763e-05, + "loss": 2.3635, "step": 3622 }, { - "epoch": 1.09, - "grad_norm": 16.220090866088867, - "learning_rate": 1.273930039089907e-05, - "loss": 1.7681, + "epoch": 0.45, + "grad_norm": 18.74462890625, + "learning_rate": 1.6969418064678077e-05, + "loss": 2.8262, "step": 3623 }, { - "epoch": 1.09, - "grad_norm": 21.331134796142578, - "learning_rate": 1.2737295780294678e-05, - "loss": 2.7775, + "epoch": 0.45, + "grad_norm": 12.347823143005371, + "learning_rate": 1.6968581349621387e-05, + "loss": 2.0164, "step": 3624 }, { - "epoch": 1.09, - "grad_norm": 11.270216941833496, - "learning_rate": 1.273529116969029e-05, - "loss": 2.3103, + "epoch": 0.45, + "grad_norm": 12.963733673095703, + "learning_rate": 1.69677446345647e-05, + "loss": 1.8139, "step": 3625 }, { - "epoch": 1.09, - "grad_norm": 16.944839477539062, - "learning_rate": 1.2733286559085898e-05, - "loss": 1.7782, + "epoch": 0.46, + "grad_norm": 20.923049926757812, + "learning_rate": 1.6966907919508015e-05, + "loss": 2.7124, "step": 3626 }, { - "epoch": 1.09, - "grad_norm": 8.570146560668945, - "learning_rate": 1.2731281948481508e-05, - "loss": 1.6687, + "epoch": 0.46, + "grad_norm": 9.197908401489258, + "learning_rate": 1.6966071204451328e-05, + "loss": 0.5815, "step": 3627 }, { - "epoch": 1.09, - "grad_norm": 53.58586502075195, - "learning_rate": 1.272927733787712e-05, - "loss": 3.6285, + "epoch": 0.46, + "grad_norm": 25.95100975036621, + "learning_rate": 1.696523448939464e-05, + "loss": 2.5109, "step": 3628 }, { - "epoch": 1.09, - "grad_norm": 24.34233856201172, - "learning_rate": 1.2727272727272728e-05, - "loss": 2.7752, + "epoch": 0.46, + "grad_norm": 12.633675575256348, + "learning_rate": 1.6964397774337952e-05, + "loss": 0.919, "step": 3629 }, { - "epoch": 1.09, - "grad_norm": 14.564997673034668, - "learning_rate": 1.2725268116668339e-05, - "loss": 2.5383, + "epoch": 0.46, + "grad_norm": 12.344071388244629, + "learning_rate": 1.6963561059281262e-05, + "loss": 1.5926, "step": 3630 }, { - "epoch": 1.09, - "grad_norm": 6.960912227630615, - "learning_rate": 1.2723263506063947e-05, - "loss": 0.6939, + "epoch": 0.46, + "grad_norm": 7.051496505737305, + "learning_rate": 1.6962724344224576e-05, + "loss": 0.7164, "step": 3631 }, { - "epoch": 1.09, - "grad_norm": 17.01753044128418, - "learning_rate": 1.2721258895459559e-05, - "loss": 1.6664, + "epoch": 0.46, + "grad_norm": 11.98715877532959, + "learning_rate": 1.696188762916789e-05, + "loss": 2.2112, "step": 3632 }, { - "epoch": 1.09, - "grad_norm": 11.095540046691895, - "learning_rate": 1.2719254284855167e-05, - "loss": 1.6678, + "epoch": 0.46, + "grad_norm": 21.208057403564453, + "learning_rate": 1.69610509141112e-05, + "loss": 3.4837, "step": 3633 }, { - "epoch": 1.09, - "grad_norm": 11.197050094604492, - "learning_rate": 1.2717249674250777e-05, - "loss": 1.9901, + "epoch": 0.46, + "grad_norm": 8.355364799499512, + "learning_rate": 1.6960214199054514e-05, + "loss": 1.5355, "step": 3634 }, { - "epoch": 1.09, - "grad_norm": 11.85187816619873, - "learning_rate": 1.2715245063646389e-05, - "loss": 0.7637, + "epoch": 0.46, + "grad_norm": 20.413406372070312, + "learning_rate": 1.6959377483997824e-05, + "loss": 2.0411, "step": 3635 }, { - "epoch": 1.09, - "grad_norm": 46.356109619140625, - "learning_rate": 1.2713240453041997e-05, - "loss": 1.686, + "epoch": 0.46, + "grad_norm": 17.831262588500977, + "learning_rate": 1.6958540768941137e-05, + "loss": 3.666, "step": 3636 }, { - "epoch": 1.09, - "grad_norm": 72.00715637207031, - "learning_rate": 1.2711235842437607e-05, - "loss": 2.1146, + "epoch": 0.46, + "grad_norm": 28.424673080444336, + "learning_rate": 1.695770405388445e-05, + "loss": 3.6548, "step": 3637 }, { - "epoch": 1.09, - "grad_norm": 23.22759437561035, - "learning_rate": 1.2709231231833218e-05, - "loss": 1.1876, + "epoch": 0.46, + "grad_norm": 15.798102378845215, + "learning_rate": 1.695686733882776e-05, + "loss": 2.4799, "step": 3638 }, { - "epoch": 1.09, - "grad_norm": 16.499618530273438, - "learning_rate": 1.2707226621228828e-05, - "loss": 1.4633, + "epoch": 0.46, + "grad_norm": 8.27879810333252, + "learning_rate": 1.6956030623771075e-05, + "loss": 0.5847, "step": 3639 }, { - "epoch": 1.09, - "grad_norm": 35.80891418457031, - "learning_rate": 1.2705222010624436e-05, - "loss": 1.7426, + "epoch": 0.46, + "grad_norm": 9.31849479675293, + "learning_rate": 1.695519390871439e-05, + "loss": 2.0426, "step": 3640 }, { - "epoch": 1.09, - "grad_norm": 18.7283992767334, - "learning_rate": 1.2703217400020048e-05, - "loss": 1.8939, + "epoch": 0.46, + "grad_norm": 13.492223739624023, + "learning_rate": 1.6954357193657702e-05, + "loss": 1.4945, "step": 3641 }, { - "epoch": 1.1, - "grad_norm": 14.170825958251953, - "learning_rate": 1.2701212789415658e-05, - "loss": 1.992, + "epoch": 0.46, + "grad_norm": 12.793575286865234, + "learning_rate": 1.6953520478601013e-05, + "loss": 1.4826, "step": 3642 }, { - "epoch": 1.1, - "grad_norm": 11.59808349609375, - "learning_rate": 1.2699208178811266e-05, - "loss": 1.9479, + "epoch": 0.46, + "grad_norm": 17.997392654418945, + "learning_rate": 1.6952683763544326e-05, + "loss": 1.0301, "step": 3643 }, { - "epoch": 1.1, - "grad_norm": 27.232282638549805, - "learning_rate": 1.2697203568206878e-05, - "loss": 1.9665, + "epoch": 0.46, + "grad_norm": 15.711397171020508, + "learning_rate": 1.695184704848764e-05, + "loss": 1.4051, "step": 3644 }, { - "epoch": 1.1, - "grad_norm": 13.292805671691895, - "learning_rate": 1.2695198957602486e-05, - "loss": 1.4353, + "epoch": 0.46, + "grad_norm": 29.958099365234375, + "learning_rate": 1.695101033343095e-05, + "loss": 1.2986, "step": 3645 }, { - "epoch": 1.1, - "grad_norm": 8.74724292755127, - "learning_rate": 1.2693194346998097e-05, - "loss": 1.3016, + "epoch": 0.46, + "grad_norm": 11.872001647949219, + "learning_rate": 1.6950173618374264e-05, + "loss": 1.717, "step": 3646 }, { - "epoch": 1.1, - "grad_norm": 19.11276626586914, - "learning_rate": 1.2691189736393708e-05, - "loss": 2.3302, + "epoch": 0.46, + "grad_norm": 12.301345825195312, + "learning_rate": 1.6949336903317577e-05, + "loss": 1.7587, "step": 3647 }, { - "epoch": 1.1, - "grad_norm": 46.45100021362305, - "learning_rate": 1.2689185125789317e-05, - "loss": 2.1202, + "epoch": 0.46, + "grad_norm": 12.868897438049316, + "learning_rate": 1.694850018826089e-05, + "loss": 1.4267, "step": 3648 }, { - "epoch": 1.1, - "grad_norm": 15.36929702758789, - "learning_rate": 1.2687180515184927e-05, - "loss": 1.7896, + "epoch": 0.46, + "grad_norm": 16.243345260620117, + "learning_rate": 1.69476634732042e-05, + "loss": 2.4037, "step": 3649 }, { - "epoch": 1.1, - "grad_norm": 20.07041358947754, - "learning_rate": 1.2685175904580535e-05, - "loss": 1.826, + "epoch": 0.46, + "grad_norm": 14.367687225341797, + "learning_rate": 1.6946826758147515e-05, + "loss": 1.7668, "step": 3650 }, { - "epoch": 1.1, - "grad_norm": 20.192039489746094, - "learning_rate": 1.2683171293976147e-05, - "loss": 1.8944, + "epoch": 0.46, + "grad_norm": 16.331218719482422, + "learning_rate": 1.694599004309083e-05, + "loss": 2.1331, "step": 3651 }, { - "epoch": 1.1, - "grad_norm": 15.358658790588379, - "learning_rate": 1.2681166683371755e-05, - "loss": 2.1073, + "epoch": 0.46, + "grad_norm": 12.379274368286133, + "learning_rate": 1.694515332803414e-05, + "loss": 2.9811, "step": 3652 }, { - "epoch": 1.1, - "grad_norm": 11.35305404663086, - "learning_rate": 1.2679162072767365e-05, - "loss": 1.167, + "epoch": 0.46, + "grad_norm": 11.794244766235352, + "learning_rate": 1.6944316612977453e-05, + "loss": 3.0356, "step": 3653 }, { - "epoch": 1.1, - "grad_norm": 14.477778434753418, - "learning_rate": 1.2677157462162977e-05, - "loss": 2.1406, + "epoch": 0.46, + "grad_norm": 5.233652591705322, + "learning_rate": 1.6943479897920766e-05, + "loss": 1.1373, "step": 3654 }, { - "epoch": 1.1, - "grad_norm": 25.864349365234375, - "learning_rate": 1.2675152851558586e-05, - "loss": 1.9479, + "epoch": 0.46, + "grad_norm": 9.71679973602295, + "learning_rate": 1.6942643182864076e-05, + "loss": 1.6338, "step": 3655 }, { - "epoch": 1.1, - "grad_norm": 14.707035064697266, - "learning_rate": 1.2673148240954196e-05, - "loss": 2.2159, + "epoch": 0.46, + "grad_norm": 5.850435256958008, + "learning_rate": 1.694180646780739e-05, + "loss": 2.8447, "step": 3656 }, { - "epoch": 1.1, - "grad_norm": 9.004791259765625, - "learning_rate": 1.2671143630349804e-05, - "loss": 0.9894, + "epoch": 0.46, + "grad_norm": 9.849655151367188, + "learning_rate": 1.6940969752750704e-05, + "loss": 1.5747, "step": 3657 }, { - "epoch": 1.1, - "grad_norm": 16.240209579467773, - "learning_rate": 1.2669139019745416e-05, - "loss": 1.6444, - "step": 3658 + "epoch": 0.46, + "grad_norm": 15.464215278625488, + "learning_rate": 1.6940133037694014e-05, + "loss": 1.9339, + "step": 3658 }, { - "epoch": 1.1, - "grad_norm": 15.315890312194824, - "learning_rate": 1.2667134409141026e-05, - "loss": 2.046, + "epoch": 0.46, + "grad_norm": 6.046443939208984, + "learning_rate": 1.6939296322637328e-05, + "loss": 1.5449, "step": 3659 }, { - "epoch": 1.1, - "grad_norm": 14.05276870727539, - "learning_rate": 1.2665129798536634e-05, - "loss": 2.0308, + "epoch": 0.46, + "grad_norm": 22.741769790649414, + "learning_rate": 1.6938459607580638e-05, + "loss": 0.8067, "step": 3660 }, { - "epoch": 1.1, - "grad_norm": 12.463837623596191, - "learning_rate": 1.2663125187932246e-05, - "loss": 1.3266, + "epoch": 0.46, + "grad_norm": 15.346259117126465, + "learning_rate": 1.693762289252395e-05, + "loss": 2.7109, "step": 3661 }, { - "epoch": 1.1, - "grad_norm": 10.216002464294434, - "learning_rate": 1.2661120577327854e-05, - "loss": 1.4467, + "epoch": 0.46, + "grad_norm": 11.457945823669434, + "learning_rate": 1.6936786177467265e-05, + "loss": 2.1529, "step": 3662 }, { - "epoch": 1.1, - "grad_norm": 52.47303771972656, - "learning_rate": 1.2659115966723465e-05, - "loss": 3.601, + "epoch": 0.46, + "grad_norm": 20.453115463256836, + "learning_rate": 1.6935949462410575e-05, + "loss": 3.8016, "step": 3663 }, { - "epoch": 1.1, - "grad_norm": 19.366987228393555, - "learning_rate": 1.2657111356119075e-05, - "loss": 1.9766, + "epoch": 0.46, + "grad_norm": 24.550575256347656, + "learning_rate": 1.693511274735389e-05, + "loss": 3.59, "step": 3664 }, { - "epoch": 1.1, - "grad_norm": 11.65963077545166, - "learning_rate": 1.2655106745514685e-05, - "loss": 1.6192, + "epoch": 0.46, + "grad_norm": 23.053932189941406, + "learning_rate": 1.6934276032297203e-05, + "loss": 2.6321, "step": 3665 }, { - "epoch": 1.1, - "grad_norm": 14.95705509185791, - "learning_rate": 1.2653102134910295e-05, - "loss": 1.8748, + "epoch": 0.46, + "grad_norm": 9.720477104187012, + "learning_rate": 1.6933439317240513e-05, + "loss": 2.066, "step": 3666 }, { - "epoch": 1.1, - "grad_norm": 21.516704559326172, - "learning_rate": 1.2651097524305905e-05, - "loss": 2.2989, + "epoch": 0.46, + "grad_norm": 8.113615036010742, + "learning_rate": 1.6932602602183827e-05, + "loss": 1.7078, "step": 3667 }, { - "epoch": 1.1, - "grad_norm": 12.6599760055542, - "learning_rate": 1.2649092913701515e-05, - "loss": 1.9577, + "epoch": 0.46, + "grad_norm": 11.102977752685547, + "learning_rate": 1.693176588712714e-05, + "loss": 2.9131, "step": 3668 }, { - "epoch": 1.1, - "grad_norm": 20.421907424926758, - "learning_rate": 1.2647088303097123e-05, - "loss": 1.8006, + "epoch": 0.46, + "grad_norm": 9.469147682189941, + "learning_rate": 1.6930929172070454e-05, + "loss": 1.2561, "step": 3669 }, { - "epoch": 1.1, - "grad_norm": 21.49991226196289, - "learning_rate": 1.2645083692492735e-05, - "loss": 1.5208, + "epoch": 0.46, + "grad_norm": 39.29673767089844, + "learning_rate": 1.6930092457013764e-05, + "loss": 2.2055, "step": 3670 }, { - "epoch": 1.1, - "grad_norm": 16.71851348876953, - "learning_rate": 1.2643079081888345e-05, - "loss": 1.3844, + "epoch": 0.46, + "grad_norm": 22.410480499267578, + "learning_rate": 1.6929255741957078e-05, + "loss": 2.6519, "step": 3671 }, { - "epoch": 1.1, - "grad_norm": 12.691519737243652, - "learning_rate": 1.2641074471283954e-05, - "loss": 1.6805, + "epoch": 0.46, + "grad_norm": 6.620682716369629, + "learning_rate": 1.692841902690039e-05, + "loss": 0.4279, "step": 3672 }, { - "epoch": 1.1, - "grad_norm": 13.60329532623291, - "learning_rate": 1.2639069860679565e-05, - "loss": 1.3261, + "epoch": 0.46, + "grad_norm": 20.994232177734375, + "learning_rate": 1.6927582311843702e-05, + "loss": 1.4075, "step": 3673 }, { - "epoch": 1.1, - "grad_norm": 12.163605690002441, - "learning_rate": 1.2637065250075174e-05, - "loss": 1.2984, + "epoch": 0.46, + "grad_norm": 14.912308692932129, + "learning_rate": 1.6926745596787015e-05, + "loss": 2.9889, "step": 3674 }, { - "epoch": 1.1, - "grad_norm": 11.735462188720703, - "learning_rate": 1.2635060639470784e-05, - "loss": 1.3691, + "epoch": 0.46, + "grad_norm": 23.213647842407227, + "learning_rate": 1.692590888173033e-05, + "loss": 2.9207, "step": 3675 }, { - "epoch": 1.11, - "grad_norm": 10.237204551696777, - "learning_rate": 1.2633056028866392e-05, - "loss": 1.4796, + "epoch": 0.46, + "grad_norm": 16.951650619506836, + "learning_rate": 1.6925072166673643e-05, + "loss": 1.9713, "step": 3676 }, { - "epoch": 1.11, - "grad_norm": 16.992961883544922, - "learning_rate": 1.2631051418262004e-05, - "loss": 1.5819, + "epoch": 0.46, + "grad_norm": 11.632166862487793, + "learning_rate": 1.6924235451616953e-05, + "loss": 1.0241, "step": 3677 }, { - "epoch": 1.11, - "grad_norm": 25.052305221557617, - "learning_rate": 1.2629046807657614e-05, - "loss": 2.6452, + "epoch": 0.46, + "grad_norm": 17.42290687561035, + "learning_rate": 1.6923398736560267e-05, + "loss": 2.1315, "step": 3678 }, { - "epoch": 1.11, - "grad_norm": 34.612735748291016, - "learning_rate": 1.2627042197053223e-05, - "loss": 1.888, + "epoch": 0.46, + "grad_norm": 3.987685203552246, + "learning_rate": 1.692256202150358e-05, + "loss": 0.2983, "step": 3679 }, { - "epoch": 1.11, - "grad_norm": 12.312444686889648, - "learning_rate": 1.2625037586448834e-05, - "loss": 1.6545, + "epoch": 0.46, + "grad_norm": 12.946403503417969, + "learning_rate": 1.692172530644689e-05, + "loss": 1.0987, "step": 3680 }, { - "epoch": 1.11, - "grad_norm": 17.487689971923828, - "learning_rate": 1.2623032975844443e-05, - "loss": 2.225, + "epoch": 0.46, + "grad_norm": 13.340381622314453, + "learning_rate": 1.6920888591390204e-05, + "loss": 2.515, "step": 3681 }, { - "epoch": 1.11, - "grad_norm": 12.337897300720215, - "learning_rate": 1.2621028365240053e-05, - "loss": 1.285, + "epoch": 0.46, + "grad_norm": 9.504284858703613, + "learning_rate": 1.6920051876333518e-05, + "loss": 1.8395, "step": 3682 }, { - "epoch": 1.11, - "grad_norm": 12.9970121383667, - "learning_rate": 1.2619023754635661e-05, - "loss": 1.5476, + "epoch": 0.46, + "grad_norm": 15.068203926086426, + "learning_rate": 1.6919215161276828e-05, + "loss": 2.3602, "step": 3683 }, { - "epoch": 1.11, - "grad_norm": 36.430641174316406, - "learning_rate": 1.2617019144031273e-05, - "loss": 2.409, + "epoch": 0.46, + "grad_norm": 14.113088607788086, + "learning_rate": 1.6918378446220142e-05, + "loss": 2.6057, "step": 3684 }, { - "epoch": 1.11, - "grad_norm": 11.384819030761719, - "learning_rate": 1.2615014533426883e-05, - "loss": 1.2839, + "epoch": 0.46, + "grad_norm": 37.74652862548828, + "learning_rate": 1.6917541731163455e-05, + "loss": 1.9992, "step": 3685 }, { - "epoch": 1.11, - "grad_norm": 14.997862815856934, - "learning_rate": 1.2613009922822491e-05, - "loss": 2.1861, + "epoch": 0.46, + "grad_norm": 20.58168601989746, + "learning_rate": 1.6916705016106766e-05, + "loss": 2.5807, "step": 3686 }, { - "epoch": 1.11, - "grad_norm": 9.791463851928711, - "learning_rate": 1.2611005312218103e-05, - "loss": 1.6862, + "epoch": 0.46, + "grad_norm": 24.944747924804688, + "learning_rate": 1.691586830105008e-05, + "loss": 1.9458, "step": 3687 }, { - "epoch": 1.11, - "grad_norm": 20.38786506652832, - "learning_rate": 1.2609000701613712e-05, - "loss": 1.8915, + "epoch": 0.46, + "grad_norm": 8.984786033630371, + "learning_rate": 1.691503158599339e-05, + "loss": 1.8054, "step": 3688 }, { - "epoch": 1.11, - "grad_norm": 56.49119567871094, - "learning_rate": 1.2606996091009323e-05, - "loss": 1.4701, + "epoch": 0.46, + "grad_norm": 12.315727233886719, + "learning_rate": 1.6914194870936703e-05, + "loss": 4.6691, "step": 3689 }, { - "epoch": 1.11, - "grad_norm": 9.169239044189453, - "learning_rate": 1.2604991480404933e-05, - "loss": 1.0245, + "epoch": 0.46, + "grad_norm": 17.678552627563477, + "learning_rate": 1.6913358155880017e-05, + "loss": 1.7385, "step": 3690 }, { - "epoch": 1.11, - "grad_norm": 18.84854507446289, - "learning_rate": 1.2602986869800542e-05, - "loss": 1.6371, + "epoch": 0.46, + "grad_norm": 14.618388175964355, + "learning_rate": 1.6912521440823327e-05, + "loss": 2.2065, "step": 3691 }, { - "epoch": 1.11, - "grad_norm": 15.238136291503906, - "learning_rate": 1.2600982259196154e-05, - "loss": 1.4109, + "epoch": 0.46, + "grad_norm": 14.206609725952148, + "learning_rate": 1.691168472576664e-05, + "loss": 2.552, "step": 3692 }, { - "epoch": 1.11, - "grad_norm": 24.6544189453125, - "learning_rate": 1.2598977648591762e-05, - "loss": 1.8289, + "epoch": 0.46, + "grad_norm": 13.141923904418945, + "learning_rate": 1.6910848010709954e-05, + "loss": 2.2018, "step": 3693 }, { - "epoch": 1.11, - "grad_norm": 15.802862167358398, - "learning_rate": 1.2596973037987372e-05, - "loss": 2.074, + "epoch": 0.46, + "grad_norm": 10.221967697143555, + "learning_rate": 1.6910011295653265e-05, + "loss": 0.7906, "step": 3694 }, { - "epoch": 1.11, - "grad_norm": 21.196699142456055, - "learning_rate": 1.259496842738298e-05, - "loss": 2.4331, + "epoch": 0.46, + "grad_norm": 7.5393147468566895, + "learning_rate": 1.690917458059658e-05, + "loss": 1.7885, "step": 3695 }, { - "epoch": 1.11, - "grad_norm": 14.442291259765625, - "learning_rate": 1.2592963816778592e-05, - "loss": 1.1208, + "epoch": 0.46, + "grad_norm": 10.141867637634277, + "learning_rate": 1.6908337865539892e-05, + "loss": 1.7606, "step": 3696 }, { - "epoch": 1.11, - "grad_norm": 16.67438316345215, - "learning_rate": 1.2590959206174202e-05, - "loss": 1.9479, + "epoch": 0.46, + "grad_norm": 30.746566772460938, + "learning_rate": 1.6907501150483206e-05, + "loss": 3.0198, "step": 3697 }, { - "epoch": 1.11, - "grad_norm": 30.792251586914062, - "learning_rate": 1.258895459556981e-05, - "loss": 1.0711, + "epoch": 0.46, + "grad_norm": 198.8036346435547, + "learning_rate": 1.6906664435426516e-05, + "loss": 1.4779, "step": 3698 }, { - "epoch": 1.11, - "grad_norm": 21.835054397583008, - "learning_rate": 1.2586949984965423e-05, - "loss": 1.4139, + "epoch": 0.46, + "grad_norm": 20.587520599365234, + "learning_rate": 1.690582772036983e-05, + "loss": 1.9991, "step": 3699 }, { - "epoch": 1.11, - "grad_norm": 23.054439544677734, - "learning_rate": 1.2584945374361031e-05, - "loss": 2.3756, + "epoch": 0.46, + "grad_norm": 20.063173294067383, + "learning_rate": 1.6904991005313143e-05, + "loss": 2.9646, "step": 3700 }, { - "epoch": 1.11, - "grad_norm": 15.562328338623047, - "learning_rate": 1.2582940763756641e-05, - "loss": 1.6298, + "epoch": 0.46, + "grad_norm": 12.660489082336426, + "learning_rate": 1.6904154290256453e-05, + "loss": 1.038, "step": 3701 }, { - "epoch": 1.11, - "grad_norm": 19.600475311279297, - "learning_rate": 1.2580936153152253e-05, - "loss": 2.0049, + "epoch": 0.46, + "grad_norm": 14.9254150390625, + "learning_rate": 1.6903317575199767e-05, + "loss": 1.8169, "step": 3702 }, { - "epoch": 1.11, - "grad_norm": 15.934944152832031, - "learning_rate": 1.2578931542547861e-05, - "loss": 2.3699, + "epoch": 0.46, + "grad_norm": 9.880672454833984, + "learning_rate": 1.690248086014308e-05, + "loss": 1.564, "step": 3703 }, { - "epoch": 1.11, - "grad_norm": 13.748173713684082, - "learning_rate": 1.2576926931943471e-05, - "loss": 2.802, + "epoch": 0.46, + "grad_norm": 13.512040138244629, + "learning_rate": 1.6901644145086394e-05, + "loss": 3.2549, "step": 3704 }, { - "epoch": 1.11, - "grad_norm": 10.191062927246094, - "learning_rate": 1.257492232133908e-05, - "loss": 0.9145, + "epoch": 0.46, + "grad_norm": 8.003437042236328, + "learning_rate": 1.6900807430029705e-05, + "loss": 0.9277, "step": 3705 }, { - "epoch": 1.11, - "grad_norm": 14.744169235229492, - "learning_rate": 1.2572917710734691e-05, - "loss": 1.7588, + "epoch": 0.47, + "grad_norm": 15.903909683227539, + "learning_rate": 1.689997071497302e-05, + "loss": 3.1238, "step": 3706 }, { - "epoch": 1.11, - "grad_norm": 17.905502319335938, - "learning_rate": 1.25709131001303e-05, - "loss": 1.4409, + "epoch": 0.47, + "grad_norm": 9.854524612426758, + "learning_rate": 1.6899133999916332e-05, + "loss": 0.7731, "step": 3707 }, { - "epoch": 1.11, - "grad_norm": 22.17319679260254, - "learning_rate": 1.256890848952591e-05, - "loss": 1.6653, + "epoch": 0.47, + "grad_norm": 21.493879318237305, + "learning_rate": 1.6898297284859642e-05, + "loss": 2.0419, "step": 3708 }, { - "epoch": 1.12, - "grad_norm": 14.994551658630371, - "learning_rate": 1.2566903878921522e-05, - "loss": 2.6141, + "epoch": 0.47, + "grad_norm": 15.608744621276855, + "learning_rate": 1.6897460569802956e-05, + "loss": 2.7725, "step": 3709 }, { - "epoch": 1.12, - "grad_norm": 27.965063095092773, - "learning_rate": 1.256489926831713e-05, - "loss": 1.4174, + "epoch": 0.47, + "grad_norm": 15.87423324584961, + "learning_rate": 1.689662385474627e-05, + "loss": 2.4522, "step": 3710 }, { - "epoch": 1.12, - "grad_norm": 12.438477516174316, - "learning_rate": 1.256289465771274e-05, - "loss": 2.034, + "epoch": 0.47, + "grad_norm": 12.656645774841309, + "learning_rate": 1.689578713968958e-05, + "loss": 2.3673, "step": 3711 }, { - "epoch": 1.12, - "grad_norm": 18.60542106628418, - "learning_rate": 1.256089004710835e-05, - "loss": 2.1583, + "epoch": 0.47, + "grad_norm": 8.62553882598877, + "learning_rate": 1.6894950424632893e-05, + "loss": 0.6039, "step": 3712 }, { - "epoch": 1.12, - "grad_norm": 11.205132484436035, - "learning_rate": 1.255888543650396e-05, - "loss": 2.2042, + "epoch": 0.47, + "grad_norm": 17.59114646911621, + "learning_rate": 1.6894113709576204e-05, + "loss": 1.466, "step": 3713 }, { - "epoch": 1.12, - "grad_norm": 21.031993865966797, - "learning_rate": 1.255688082589957e-05, - "loss": 1.3351, + "epoch": 0.47, + "grad_norm": 14.852526664733887, + "learning_rate": 1.6893276994519517e-05, + "loss": 2.2656, "step": 3714 }, { - "epoch": 1.12, - "grad_norm": 22.70185661315918, - "learning_rate": 1.255487621529518e-05, - "loss": 2.8246, + "epoch": 0.47, + "grad_norm": 10.289872169494629, + "learning_rate": 1.689244027946283e-05, + "loss": 2.5505, "step": 3715 }, { - "epoch": 1.12, - "grad_norm": 19.754451751708984, - "learning_rate": 1.255287160469079e-05, - "loss": 1.9262, + "epoch": 0.47, + "grad_norm": 8.868825912475586, + "learning_rate": 1.689160356440614e-05, + "loss": 1.4586, "step": 3716 }, { - "epoch": 1.12, - "grad_norm": 14.8783597946167, - "learning_rate": 1.2550866994086399e-05, - "loss": 1.958, + "epoch": 0.47, + "grad_norm": 15.438570976257324, + "learning_rate": 1.6890766849349455e-05, + "loss": 1.4362, "step": 3717 }, { - "epoch": 1.12, - "grad_norm": 13.51906681060791, - "learning_rate": 1.254886238348201e-05, - "loss": 2.0377, + "epoch": 0.47, + "grad_norm": 12.702199935913086, + "learning_rate": 1.688993013429277e-05, + "loss": 2.9112, "step": 3718 }, { - "epoch": 1.12, - "grad_norm": 13.766884803771973, - "learning_rate": 1.2546857772877619e-05, - "loss": 1.8277, + "epoch": 0.47, + "grad_norm": 13.10950756072998, + "learning_rate": 1.688909341923608e-05, + "loss": 0.9233, "step": 3719 }, { - "epoch": 1.12, - "grad_norm": 11.940971374511719, - "learning_rate": 1.254485316227323e-05, - "loss": 2.0253, - "step": 3720 - }, - { - "epoch": 1.12, - "eval_loss": 0.280333936214447, - "eval_runtime": 43.5973, - "eval_samples_per_second": 33.924, - "eval_steps_per_second": 33.924, + "epoch": 0.47, + "grad_norm": 9.561256408691406, + "learning_rate": 1.6888256704179392e-05, + "loss": 0.7627, "step": 3720 }, { - "epoch": 1.12, - "grad_norm": 13.200506210327148, - "learning_rate": 1.2542848551668841e-05, - "loss": 1.8569, + "epoch": 0.47, + "grad_norm": 10.910165786743164, + "learning_rate": 1.6887419989122706e-05, + "loss": 2.432, "step": 3721 }, { - "epoch": 1.12, - "grad_norm": 11.850561141967773, - "learning_rate": 1.254084394106445e-05, - "loss": 1.2431, + "epoch": 0.47, + "grad_norm": 25.648988723754883, + "learning_rate": 1.6886583274066016e-05, + "loss": 1.7628, "step": 3722 }, { - "epoch": 1.12, - "grad_norm": 14.307988166809082, - "learning_rate": 1.253883933046006e-05, - "loss": 1.2535, + "epoch": 0.47, + "grad_norm": 14.003104209899902, + "learning_rate": 1.688574655900933e-05, + "loss": 3.3926, "step": 3723 }, { - "epoch": 1.12, - "grad_norm": 33.0361328125, - "learning_rate": 1.2536834719855668e-05, - "loss": 2.3736, + "epoch": 0.47, + "grad_norm": 11.824846267700195, + "learning_rate": 1.6884909843952644e-05, + "loss": 2.1877, "step": 3724 }, { - "epoch": 1.12, - "grad_norm": 14.496783256530762, - "learning_rate": 1.253483010925128e-05, - "loss": 1.5165, + "epoch": 0.47, + "grad_norm": 8.314187049865723, + "learning_rate": 1.6884073128895957e-05, + "loss": 0.9777, "step": 3725 }, { - "epoch": 1.12, - "grad_norm": 20.50676155090332, - "learning_rate": 1.2532825498646888e-05, - "loss": 1.7852, + "epoch": 0.47, + "grad_norm": 10.272077560424805, + "learning_rate": 1.6883236413839268e-05, + "loss": 1.4694, "step": 3726 }, { - "epoch": 1.12, - "grad_norm": 16.414701461791992, - "learning_rate": 1.2530820888042498e-05, - "loss": 1.5958, + "epoch": 0.47, + "grad_norm": 12.01590633392334, + "learning_rate": 1.688239969878258e-05, + "loss": 0.8113, "step": 3727 }, { - "epoch": 1.12, - "grad_norm": 9.845596313476562, - "learning_rate": 1.252881627743811e-05, - "loss": 1.4953, + "epoch": 0.47, + "grad_norm": 14.180240631103516, + "learning_rate": 1.6881562983725895e-05, + "loss": 3.053, "step": 3728 }, { - "epoch": 1.12, - "grad_norm": 34.321861267089844, - "learning_rate": 1.2526811666833718e-05, - "loss": 1.9955, + "epoch": 0.47, + "grad_norm": 6.562134742736816, + "learning_rate": 1.6880726268669205e-05, + "loss": 0.6537, "step": 3729 }, { - "epoch": 1.12, - "grad_norm": 11.001937866210938, - "learning_rate": 1.2524807056229328e-05, - "loss": 1.7248, + "epoch": 0.47, + "grad_norm": 8.070725440979004, + "learning_rate": 1.687988955361252e-05, + "loss": 0.89, "step": 3730 }, { - "epoch": 1.12, - "grad_norm": 14.16842269897461, - "learning_rate": 1.2522802445624937e-05, - "loss": 0.9204, + "epoch": 0.47, + "grad_norm": 26.202648162841797, + "learning_rate": 1.6879052838555832e-05, + "loss": 2.0353, "step": 3731 }, { - "epoch": 1.12, - "grad_norm": 44.32683181762695, - "learning_rate": 1.2520797835020549e-05, - "loss": 1.5575, + "epoch": 0.47, + "grad_norm": 11.653743743896484, + "learning_rate": 1.6878216123499146e-05, + "loss": 2.4323, "step": 3732 }, { - "epoch": 1.12, - "grad_norm": 21.35757064819336, - "learning_rate": 1.2518793224416159e-05, - "loss": 2.7646, + "epoch": 0.47, + "grad_norm": 10.062137603759766, + "learning_rate": 1.6877379408442456e-05, + "loss": 3.2083, "step": 3733 }, { - "epoch": 1.12, - "grad_norm": 10.033564567565918, - "learning_rate": 1.2516788613811767e-05, - "loss": 1.054, + "epoch": 0.47, + "grad_norm": 13.19880485534668, + "learning_rate": 1.687654269338577e-05, + "loss": 1.9633, "step": 3734 }, { - "epoch": 1.12, - "grad_norm": 11.14671802520752, - "learning_rate": 1.2514784003207379e-05, - "loss": 1.4729, + "epoch": 0.47, + "grad_norm": 10.756131172180176, + "learning_rate": 1.6875705978329084e-05, + "loss": 1.5122, "step": 3735 }, { - "epoch": 1.12, - "grad_norm": 57.583152770996094, - "learning_rate": 1.2512779392602987e-05, - "loss": 4.8937, + "epoch": 0.47, + "grad_norm": 7.860194683074951, + "learning_rate": 1.6874869263272394e-05, + "loss": 1.4545, "step": 3736 }, { - "epoch": 1.12, - "grad_norm": 10.513443946838379, - "learning_rate": 1.2510774781998597e-05, - "loss": 1.1766, + "epoch": 0.47, + "grad_norm": 11.343194007873535, + "learning_rate": 1.6874032548215708e-05, + "loss": 1.8502, "step": 3737 }, { - "epoch": 1.12, - "grad_norm": 20.811086654663086, - "learning_rate": 1.2508770171394207e-05, - "loss": 1.86, + "epoch": 0.47, + "grad_norm": 9.31357192993164, + "learning_rate": 1.687319583315902e-05, + "loss": 2.8584, "step": 3738 }, { - "epoch": 1.12, - "grad_norm": 15.33969497680664, - "learning_rate": 1.2506765560789817e-05, - "loss": 1.9234, + "epoch": 0.47, + "grad_norm": 43.624847412109375, + "learning_rate": 1.687235911810233e-05, + "loss": 2.5072, "step": 3739 }, { - "epoch": 1.12, - "grad_norm": 16.466957092285156, - "learning_rate": 1.250476095018543e-05, - "loss": 1.0944, + "epoch": 0.47, + "grad_norm": 8.754593849182129, + "learning_rate": 1.6871522403045645e-05, + "loss": 2.2842, "step": 3740 }, { - "epoch": 1.12, - "grad_norm": 11.51232624053955, - "learning_rate": 1.2502756339581038e-05, - "loss": 1.9455, + "epoch": 0.47, + "grad_norm": 8.47415542602539, + "learning_rate": 1.6870685687988955e-05, + "loss": 0.5375, "step": 3741 }, { - "epoch": 1.13, - "grad_norm": 16.893362045288086, - "learning_rate": 1.2500751728976648e-05, - "loss": 2.0934, + "epoch": 0.47, + "grad_norm": 10.735517501831055, + "learning_rate": 1.686984897293227e-05, + "loss": 1.5768, "step": 3742 }, { - "epoch": 1.13, - "grad_norm": 10.207427024841309, - "learning_rate": 1.2498747118372256e-05, - "loss": 1.5444, + "epoch": 0.47, + "grad_norm": 15.74870491027832, + "learning_rate": 1.6869012257875583e-05, + "loss": 2.2971, "step": 3743 }, { - "epoch": 1.13, - "grad_norm": 14.303349494934082, - "learning_rate": 1.2496742507767868e-05, - "loss": 2.0237, + "epoch": 0.47, + "grad_norm": 21.359169006347656, + "learning_rate": 1.6868175542818893e-05, + "loss": 1.9346, "step": 3744 }, { - "epoch": 1.13, - "grad_norm": 15.821151733398438, - "learning_rate": 1.2494737897163478e-05, - "loss": 1.2218, + "epoch": 0.47, + "grad_norm": 38.049407958984375, + "learning_rate": 1.6867338827762207e-05, + "loss": 2.1763, "step": 3745 }, { - "epoch": 1.13, - "grad_norm": 19.704866409301758, - "learning_rate": 1.2492733286559086e-05, - "loss": 1.8249, + "epoch": 0.47, + "grad_norm": 15.668584823608398, + "learning_rate": 1.686650211270552e-05, + "loss": 1.9172, "step": 3746 }, { - "epoch": 1.13, - "grad_norm": 15.844225883483887, - "learning_rate": 1.2490728675954698e-05, - "loss": 1.4068, + "epoch": 0.47, + "grad_norm": 18.220617294311523, + "learning_rate": 1.686566539764883e-05, + "loss": 1.6622, "step": 3747 }, { - "epoch": 1.13, - "grad_norm": 26.448633193969727, - "learning_rate": 1.2488724065350306e-05, - "loss": 2.5801, + "epoch": 0.47, + "grad_norm": 26.26845359802246, + "learning_rate": 1.6864828682592144e-05, + "loss": 1.8791, "step": 3748 }, { - "epoch": 1.13, - "grad_norm": 31.50472640991211, - "learning_rate": 1.2486719454745917e-05, - "loss": 2.1224, + "epoch": 0.47, + "grad_norm": 12.238680839538574, + "learning_rate": 1.6863991967535458e-05, + "loss": 2.2067, "step": 3749 }, { - "epoch": 1.13, - "grad_norm": 11.211418151855469, - "learning_rate": 1.2484714844141525e-05, - "loss": 1.3734, + "epoch": 0.47, + "grad_norm": 10.083091735839844, + "learning_rate": 1.6863155252478768e-05, + "loss": 1.3996, "step": 3750 }, { - "epoch": 1.13, - "grad_norm": 16.562360763549805, - "learning_rate": 1.2482710233537137e-05, - "loss": 2.8084, + "epoch": 0.47, + "grad_norm": 6.8707194328308105, + "learning_rate": 1.6862318537422082e-05, + "loss": 0.3253, "step": 3751 }, { - "epoch": 1.13, - "grad_norm": 14.236701965332031, - "learning_rate": 1.2480705622932747e-05, - "loss": 1.3424, + "epoch": 0.47, + "grad_norm": 50.154117584228516, + "learning_rate": 1.6861481822365395e-05, + "loss": 3.0243, "step": 3752 }, { - "epoch": 1.13, - "grad_norm": 17.27580451965332, - "learning_rate": 1.2478701012328355e-05, - "loss": 2.5758, + "epoch": 0.47, + "grad_norm": 9.788782119750977, + "learning_rate": 1.6860645107308706e-05, + "loss": 1.2306, "step": 3753 }, { - "epoch": 1.13, - "grad_norm": 12.679486274719238, - "learning_rate": 1.2476696401723967e-05, - "loss": 1.6972, + "epoch": 0.47, + "grad_norm": 10.830899238586426, + "learning_rate": 1.685980839225202e-05, + "loss": 1.7717, "step": 3754 }, { - "epoch": 1.13, - "grad_norm": 17.295305252075195, - "learning_rate": 1.2474691791119575e-05, - "loss": 1.4085, + "epoch": 0.47, + "grad_norm": 14.998096466064453, + "learning_rate": 1.6858971677195333e-05, + "loss": 3.7802, "step": 3755 }, { - "epoch": 1.13, - "grad_norm": 14.256155014038086, - "learning_rate": 1.2472687180515185e-05, - "loss": 1.5598, + "epoch": 0.47, + "grad_norm": 7.246328830718994, + "learning_rate": 1.6858134962138647e-05, + "loss": 0.4949, "step": 3756 }, { - "epoch": 1.13, - "grad_norm": 14.317242622375488, - "learning_rate": 1.2470682569910797e-05, - "loss": 1.98, + "epoch": 0.47, + "grad_norm": 7.857738494873047, + "learning_rate": 1.6857298247081957e-05, + "loss": 2.1915, "step": 3757 }, { - "epoch": 1.13, - "grad_norm": 14.427403450012207, - "learning_rate": 1.2468677959306406e-05, - "loss": 1.4377, + "epoch": 0.47, + "grad_norm": 12.347123146057129, + "learning_rate": 1.685646153202527e-05, + "loss": 2.5454, "step": 3758 }, { - "epoch": 1.13, - "grad_norm": 19.83202362060547, - "learning_rate": 1.2466673348702016e-05, - "loss": 1.7858, + "epoch": 0.47, + "grad_norm": 13.190940856933594, + "learning_rate": 1.6855624816968584e-05, + "loss": 0.8198, "step": 3759 }, { - "epoch": 1.13, - "grad_norm": 10.979972839355469, - "learning_rate": 1.2464668738097626e-05, - "loss": 0.8914, + "epoch": 0.47, + "grad_norm": 10.892491340637207, + "learning_rate": 1.6854788101911894e-05, + "loss": 1.7259, "step": 3760 }, { - "epoch": 1.13, - "grad_norm": 25.953853607177734, - "learning_rate": 1.2462664127493236e-05, - "loss": 1.768, + "epoch": 0.47, + "grad_norm": 14.431472778320312, + "learning_rate": 1.6853951386855208e-05, + "loss": 2.6695, "step": 3761 }, { - "epoch": 1.13, - "grad_norm": 17.417144775390625, - "learning_rate": 1.2460659516888844e-05, - "loss": 1.9067, + "epoch": 0.47, + "grad_norm": 5.824015140533447, + "learning_rate": 1.685311467179852e-05, + "loss": 1.4626, "step": 3762 }, { - "epoch": 1.13, - "grad_norm": 18.727876663208008, - "learning_rate": 1.2458654906284456e-05, - "loss": 1.8154, + "epoch": 0.47, + "grad_norm": 10.591657638549805, + "learning_rate": 1.6852277956741835e-05, + "loss": 1.7026, "step": 3763 }, { - "epoch": 1.13, - "grad_norm": 28.132505416870117, - "learning_rate": 1.2456650295680066e-05, - "loss": 2.6384, + "epoch": 0.47, + "grad_norm": 19.413419723510742, + "learning_rate": 1.6851441241685146e-05, + "loss": 1.3262, "step": 3764 }, { - "epoch": 1.13, - "grad_norm": 11.409829139709473, - "learning_rate": 1.2454645685075675e-05, - "loss": 1.3703, + "epoch": 0.47, + "grad_norm": 9.156866073608398, + "learning_rate": 1.685060452662846e-05, + "loss": 1.5683, "step": 3765 }, { - "epoch": 1.13, - "grad_norm": 12.39699935913086, - "learning_rate": 1.2452641074471286e-05, - "loss": 1.56, + "epoch": 0.47, + "grad_norm": 15.880305290222168, + "learning_rate": 1.684976781157177e-05, + "loss": 2.2642, "step": 3766 }, { - "epoch": 1.13, - "grad_norm": 12.608708381652832, - "learning_rate": 1.2450636463866895e-05, - "loss": 1.0913, + "epoch": 0.47, + "grad_norm": 13.627202033996582, + "learning_rate": 1.6848931096515083e-05, + "loss": 0.9258, "step": 3767 }, { - "epoch": 1.13, - "grad_norm": 9.895720481872559, - "learning_rate": 1.2448631853262505e-05, - "loss": 1.6166, + "epoch": 0.47, + "grad_norm": 201.3279571533203, + "learning_rate": 1.6848094381458397e-05, + "loss": 1.3532, "step": 3768 }, { - "epoch": 1.13, - "grad_norm": 15.616667747497559, - "learning_rate": 1.2446627242658113e-05, - "loss": 2.4802, + "epoch": 0.47, + "grad_norm": 7.930774688720703, + "learning_rate": 1.6847257666401707e-05, + "loss": 2.4595, "step": 3769 }, { - "epoch": 1.13, - "grad_norm": 28.047149658203125, - "learning_rate": 1.2444622632053725e-05, - "loss": 2.0519, + "epoch": 0.47, + "grad_norm": 22.180068969726562, + "learning_rate": 1.684642095134502e-05, + "loss": 2.2954, "step": 3770 }, { - "epoch": 1.13, - "grad_norm": 15.117725372314453, - "learning_rate": 1.2442618021449335e-05, - "loss": 2.0924, + "epoch": 0.47, + "grad_norm": 6.394443035125732, + "learning_rate": 1.684558423628833e-05, + "loss": 0.7102, "step": 3771 }, { - "epoch": 1.13, - "grad_norm": 11.850608825683594, - "learning_rate": 1.2440613410844943e-05, - "loss": 1.4544, + "epoch": 0.47, + "grad_norm": 37.96142578125, + "learning_rate": 1.6844747521231645e-05, + "loss": 2.581, "step": 3772 }, { - "epoch": 1.13, - "grad_norm": 13.68972396850586, - "learning_rate": 1.2438608800240555e-05, - "loss": 2.0332, + "epoch": 0.47, + "grad_norm": 19.41549301147461, + "learning_rate": 1.6843910806174958e-05, + "loss": 1.3952, "step": 3773 }, { - "epoch": 1.13, - "grad_norm": 12.880284309387207, - "learning_rate": 1.2436604189636164e-05, - "loss": 2.0553, + "epoch": 0.47, + "grad_norm": 9.572205543518066, + "learning_rate": 1.684307409111827e-05, + "loss": 2.2393, "step": 3774 }, { - "epoch": 1.13, - "grad_norm": 14.321048736572266, - "learning_rate": 1.2434599579031774e-05, - "loss": 1.2826, + "epoch": 0.47, + "grad_norm": 17.878549575805664, + "learning_rate": 1.6842237376061582e-05, + "loss": 1.6974, "step": 3775 }, { - "epoch": 1.14, - "grad_norm": 22.247779846191406, - "learning_rate": 1.2432594968427385e-05, - "loss": 1.7716, + "epoch": 0.47, + "grad_norm": 11.891609191894531, + "learning_rate": 1.6841400661004896e-05, + "loss": 2.4472, "step": 3776 }, { - "epoch": 1.14, - "grad_norm": 10.209996223449707, - "learning_rate": 1.2430590357822994e-05, - "loss": 1.5767, + "epoch": 0.47, + "grad_norm": 15.71055793762207, + "learning_rate": 1.684056394594821e-05, + "loss": 1.1206, "step": 3777 }, { - "epoch": 1.14, - "grad_norm": 12.013636589050293, - "learning_rate": 1.2428585747218604e-05, - "loss": 1.114, + "epoch": 0.47, + "grad_norm": 18.522485733032227, + "learning_rate": 1.683972723089152e-05, + "loss": 0.7849, "step": 3778 }, { - "epoch": 1.14, - "grad_norm": 51.70674514770508, - "learning_rate": 1.2426581136614212e-05, - "loss": 2.013, + "epoch": 0.47, + "grad_norm": 23.750144958496094, + "learning_rate": 1.6838890515834833e-05, + "loss": 2.3989, "step": 3779 }, { - "epoch": 1.14, - "grad_norm": 18.054279327392578, - "learning_rate": 1.2424576526009824e-05, - "loss": 1.4477, + "epoch": 0.47, + "grad_norm": 7.889320373535156, + "learning_rate": 1.6838053800778147e-05, + "loss": 0.5604, "step": 3780 }, { - "epoch": 1.14, - "grad_norm": 19.923124313354492, - "learning_rate": 1.2422571915405432e-05, - "loss": 1.3768, + "epoch": 0.47, + "grad_norm": 20.605247497558594, + "learning_rate": 1.6837217085721457e-05, + "loss": 2.7653, "step": 3781 }, { - "epoch": 1.14, - "grad_norm": 23.104625701904297, - "learning_rate": 1.2420567304801043e-05, - "loss": 2.1138, + "epoch": 0.47, + "grad_norm": 10.587319374084473, + "learning_rate": 1.683638037066477e-05, + "loss": 1.3875, "step": 3782 }, { - "epoch": 1.14, - "grad_norm": 8.69080638885498, - "learning_rate": 1.2418562694196654e-05, - "loss": 1.1123, + "epoch": 0.47, + "grad_norm": 7.350720405578613, + "learning_rate": 1.6835543655608085e-05, + "loss": 2.1851, "step": 3783 }, { - "epoch": 1.14, - "grad_norm": 15.912267684936523, - "learning_rate": 1.2416558083592263e-05, - "loss": 2.4516, + "epoch": 0.47, + "grad_norm": 16.164628982543945, + "learning_rate": 1.6834706940551398e-05, + "loss": 1.4781, "step": 3784 }, { - "epoch": 1.14, - "grad_norm": 12.12271785736084, - "learning_rate": 1.2414553472987873e-05, - "loss": 1.4647, + "epoch": 0.48, + "grad_norm": 11.476985931396484, + "learning_rate": 1.683387022549471e-05, + "loss": 0.9459, "step": 3785 }, { - "epoch": 1.14, - "grad_norm": 9.104388236999512, - "learning_rate": 1.2412548862383483e-05, - "loss": 1.8779, + "epoch": 0.48, + "grad_norm": 7.0805792808532715, + "learning_rate": 1.6833033510438022e-05, + "loss": 2.0153, "step": 3786 }, { - "epoch": 1.14, - "grad_norm": 17.152158737182617, - "learning_rate": 1.2410544251779093e-05, - "loss": 1.4711, + "epoch": 0.48, + "grad_norm": 19.46834373474121, + "learning_rate": 1.6832196795381336e-05, + "loss": 0.8265, "step": 3787 }, { - "epoch": 1.14, - "grad_norm": 9.907875061035156, - "learning_rate": 1.2408539641174703e-05, - "loss": 1.1329, + "epoch": 0.48, + "grad_norm": 7.584367752075195, + "learning_rate": 1.6831360080324646e-05, + "loss": 1.703, "step": 3788 }, { - "epoch": 1.14, - "grad_norm": 20.072593688964844, - "learning_rate": 1.2406535030570313e-05, - "loss": 1.9876, + "epoch": 0.48, + "grad_norm": 33.80003356933594, + "learning_rate": 1.683052336526796e-05, + "loss": 3.119, "step": 3789 }, { - "epoch": 1.14, - "grad_norm": 14.026973724365234, - "learning_rate": 1.2404530419965923e-05, - "loss": 0.7741, + "epoch": 0.48, + "grad_norm": 20.491531372070312, + "learning_rate": 1.6829686650211273e-05, + "loss": 1.7394, "step": 3790 }, { - "epoch": 1.14, - "grad_norm": 15.969365119934082, - "learning_rate": 1.2402525809361532e-05, - "loss": 1.6111, + "epoch": 0.48, + "grad_norm": 12.311060905456543, + "learning_rate": 1.6828849935154584e-05, + "loss": 1.6401, "step": 3791 }, { - "epoch": 1.14, - "grad_norm": 15.787557601928711, - "learning_rate": 1.2400521198757143e-05, - "loss": 1.5455, + "epoch": 0.48, + "grad_norm": 10.289591789245605, + "learning_rate": 1.6828013220097897e-05, + "loss": 1.0717, "step": 3792 }, { - "epoch": 1.14, - "grad_norm": 14.39417839050293, - "learning_rate": 1.2398516588152752e-05, - "loss": 1.0369, + "epoch": 0.48, + "grad_norm": 5.99196195602417, + "learning_rate": 1.682717650504121e-05, + "loss": 0.359, "step": 3793 }, { - "epoch": 1.14, - "grad_norm": 13.072067260742188, - "learning_rate": 1.2396511977548362e-05, - "loss": 2.3197, + "epoch": 0.48, + "grad_norm": 9.215882301330566, + "learning_rate": 1.682633978998452e-05, + "loss": 1.2833, "step": 3794 }, { - "epoch": 1.14, - "grad_norm": 14.899435997009277, - "learning_rate": 1.2394507366943974e-05, - "loss": 2.7447, + "epoch": 0.48, + "grad_norm": 13.91613483428955, + "learning_rate": 1.6825503074927835e-05, + "loss": 2.0734, "step": 3795 }, { - "epoch": 1.14, - "grad_norm": 15.155303001403809, - "learning_rate": 1.2392502756339582e-05, - "loss": 1.4796, + "epoch": 0.48, + "grad_norm": 25.02107810974121, + "learning_rate": 1.682466635987115e-05, + "loss": 2.1832, "step": 3796 }, { - "epoch": 1.14, - "grad_norm": 28.191987991333008, - "learning_rate": 1.2390498145735192e-05, - "loss": 1.7832, + "epoch": 0.48, + "grad_norm": 15.298924446105957, + "learning_rate": 1.682382964481446e-05, + "loss": 1.4136, "step": 3797 }, { - "epoch": 1.14, - "grad_norm": 14.950464248657227, - "learning_rate": 1.23884935351308e-05, - "loss": 2.1336, + "epoch": 0.48, + "grad_norm": 18.208202362060547, + "learning_rate": 1.6822992929757772e-05, + "loss": 1.7215, "step": 3798 }, { - "epoch": 1.14, - "grad_norm": 21.477602005004883, - "learning_rate": 1.2386488924526412e-05, - "loss": 1.7525, + "epoch": 0.48, + "grad_norm": 16.074575424194336, + "learning_rate": 1.6822156214701083e-05, + "loss": 1.9483, "step": 3799 }, { - "epoch": 1.14, - "grad_norm": 25.496963500976562, - "learning_rate": 1.238448431392202e-05, - "loss": 1.6689, + "epoch": 0.48, + "grad_norm": 14.745115280151367, + "learning_rate": 1.6821319499644396e-05, + "loss": 1.6011, "step": 3800 }, { - "epoch": 1.14, - "grad_norm": 12.373186111450195, - "learning_rate": 1.238247970331763e-05, - "loss": 2.1899, + "epoch": 0.48, + "grad_norm": 9.882591247558594, + "learning_rate": 1.682048278458771e-05, + "loss": 2.5467, "step": 3801 }, { - "epoch": 1.14, - "grad_norm": 20.684795379638672, - "learning_rate": 1.2380475092713243e-05, - "loss": 2.6932, + "epoch": 0.48, + "grad_norm": 10.847468376159668, + "learning_rate": 1.681964606953102e-05, + "loss": 2.0858, "step": 3802 }, { - "epoch": 1.14, - "grad_norm": 9.77327823638916, - "learning_rate": 1.2378470482108851e-05, - "loss": 1.5063, + "epoch": 0.48, + "grad_norm": 23.348876953125, + "learning_rate": 1.6818809354474334e-05, + "loss": 3.2153, "step": 3803 }, { - "epoch": 1.14, - "grad_norm": 28.525667190551758, - "learning_rate": 1.2376465871504461e-05, - "loss": 1.532, + "epoch": 0.48, + "grad_norm": 21.69083023071289, + "learning_rate": 1.6817972639417648e-05, + "loss": 2.6129, "step": 3804 }, { - "epoch": 1.14, - "grad_norm": 14.671927452087402, - "learning_rate": 1.2374461260900071e-05, - "loss": 2.0253, + "epoch": 0.48, + "grad_norm": 28.29841423034668, + "learning_rate": 1.681713592436096e-05, + "loss": 1.6809, "step": 3805 }, { - "epoch": 1.14, - "grad_norm": 30.834918975830078, - "learning_rate": 1.2372456650295681e-05, - "loss": 2.39, + "epoch": 0.48, + "grad_norm": 11.64278507232666, + "learning_rate": 1.681629920930427e-05, + "loss": 0.8851, "step": 3806 }, { - "epoch": 1.14, - "grad_norm": 16.612564086914062, - "learning_rate": 1.2370452039691291e-05, - "loss": 1.8819, + "epoch": 0.48, + "grad_norm": 10.83991813659668, + "learning_rate": 1.6815462494247585e-05, + "loss": 0.7002, "step": 3807 }, { - "epoch": 1.14, - "grad_norm": 13.142746925354004, - "learning_rate": 1.2368447429086901e-05, - "loss": 1.6061, + "epoch": 0.48, + "grad_norm": 20.13542938232422, + "learning_rate": 1.68146257791909e-05, + "loss": 2.0806, "step": 3808 }, { - "epoch": 1.15, - "grad_norm": 18.517765045166016, - "learning_rate": 1.2366442818482511e-05, - "loss": 1.6218, + "epoch": 0.48, + "grad_norm": 3.9735586643218994, + "learning_rate": 1.681378906413421e-05, + "loss": 0.3442, "step": 3809 }, { - "epoch": 1.15, - "grad_norm": 19.177627563476562, - "learning_rate": 1.236443820787812e-05, - "loss": 2.3172, + "epoch": 0.48, + "grad_norm": 10.199394226074219, + "learning_rate": 1.6812952349077523e-05, + "loss": 1.8834, "step": 3810 }, { - "epoch": 1.15, - "grad_norm": 21.176328659057617, - "learning_rate": 1.2362433597273732e-05, - "loss": 1.8578, + "epoch": 0.48, + "grad_norm": 16.84869384765625, + "learning_rate": 1.6812115634020836e-05, + "loss": 2.7995, "step": 3811 }, { - "epoch": 1.15, - "grad_norm": 21.381567001342773, - "learning_rate": 1.236042898666934e-05, - "loss": 1.5194, + "epoch": 0.48, + "grad_norm": 19.027019500732422, + "learning_rate": 1.681127891896415e-05, + "loss": 2.2333, "step": 3812 }, { - "epoch": 1.15, - "grad_norm": 11.9020414352417, - "learning_rate": 1.235842437606495e-05, - "loss": 1.6908, + "epoch": 0.48, + "grad_norm": 13.716145515441895, + "learning_rate": 1.681044220390746e-05, + "loss": 2.0771, "step": 3813 }, { - "epoch": 1.15, - "grad_norm": 39.47920608520508, - "learning_rate": 1.2356419765460562e-05, - "loss": 1.8726, + "epoch": 0.48, + "grad_norm": 7.44636344909668, + "learning_rate": 1.6809605488850774e-05, + "loss": 2.2718, "step": 3814 }, { - "epoch": 1.15, - "grad_norm": 10.872967720031738, - "learning_rate": 1.235441515485617e-05, - "loss": 1.5569, + "epoch": 0.48, + "grad_norm": 28.244157791137695, + "learning_rate": 1.6808768773794087e-05, + "loss": 2.5986, "step": 3815 }, { - "epoch": 1.15, - "grad_norm": 14.20240592956543, - "learning_rate": 1.235241054425178e-05, - "loss": 1.9525, + "epoch": 0.48, + "grad_norm": 11.873055458068848, + "learning_rate": 1.6807932058737398e-05, + "loss": 2.1554, "step": 3816 }, { - "epoch": 1.15, - "grad_norm": 16.04691505432129, - "learning_rate": 1.2350405933647389e-05, - "loss": 1.4398, + "epoch": 0.48, + "grad_norm": 7.707242488861084, + "learning_rate": 1.680709534368071e-05, + "loss": 2.8684, "step": 3817 }, { - "epoch": 1.15, - "grad_norm": 19.002132415771484, - "learning_rate": 1.2348401323043e-05, - "loss": 1.9253, + "epoch": 0.48, + "grad_norm": 9.765034675598145, + "learning_rate": 1.6806258628624025e-05, + "loss": 1.7397, "step": 3818 }, { - "epoch": 1.15, - "grad_norm": 9.962849617004395, - "learning_rate": 1.234639671243861e-05, - "loss": 2.0496, + "epoch": 0.48, + "grad_norm": 11.618304252624512, + "learning_rate": 1.6805421913567335e-05, + "loss": 2.0023, "step": 3819 }, { - "epoch": 1.15, - "grad_norm": 10.417901039123535, - "learning_rate": 1.2344392101834219e-05, - "loss": 1.9296, + "epoch": 0.48, + "grad_norm": 26.45114517211914, + "learning_rate": 1.680458519851065e-05, + "loss": 2.2092, "step": 3820 }, { - "epoch": 1.15, - "grad_norm": 16.502777099609375, - "learning_rate": 1.234238749122983e-05, - "loss": 1.6108, + "epoch": 0.48, + "grad_norm": 133.9401092529297, + "learning_rate": 1.6803748483453963e-05, + "loss": 1.8951, "step": 3821 }, { - "epoch": 1.15, - "grad_norm": 18.25497817993164, - "learning_rate": 1.234038288062544e-05, - "loss": 1.7453, + "epoch": 0.48, + "grad_norm": 15.46374797821045, + "learning_rate": 1.6802911768397273e-05, + "loss": 1.0543, "step": 3822 }, { - "epoch": 1.15, - "grad_norm": 20.428895950317383, - "learning_rate": 1.233837827002105e-05, - "loss": 2.3424, + "epoch": 0.48, + "grad_norm": 12.88305377960205, + "learning_rate": 1.6802075053340587e-05, + "loss": 3.2489, "step": 3823 }, { - "epoch": 1.15, - "grad_norm": 13.153669357299805, - "learning_rate": 1.2336373659416658e-05, - "loss": 1.3317, + "epoch": 0.48, + "grad_norm": 14.470881462097168, + "learning_rate": 1.6801238338283897e-05, + "loss": 1.6371, "step": 3824 }, { - "epoch": 1.15, - "grad_norm": 13.739114761352539, - "learning_rate": 1.233436904881227e-05, - "loss": 1.4802, + "epoch": 0.48, + "grad_norm": 17.17293357849121, + "learning_rate": 1.680040162322721e-05, + "loss": 2.6381, "step": 3825 }, { - "epoch": 1.15, - "grad_norm": 17.079265594482422, - "learning_rate": 1.233236443820788e-05, - "loss": 1.6825, + "epoch": 0.48, + "grad_norm": 26.487239837646484, + "learning_rate": 1.6799564908170524e-05, + "loss": 2.1349, "step": 3826 }, { - "epoch": 1.15, - "grad_norm": 15.86989974975586, - "learning_rate": 1.2330359827603488e-05, - "loss": 1.7154, + "epoch": 0.48, + "grad_norm": 15.372435569763184, + "learning_rate": 1.6798728193113834e-05, + "loss": 1.4839, "step": 3827 }, { - "epoch": 1.15, - "grad_norm": 14.82314682006836, - "learning_rate": 1.23283552169991e-05, - "loss": 1.3117, + "epoch": 0.48, + "grad_norm": 32.98752975463867, + "learning_rate": 1.6797891478057148e-05, + "loss": 2.9483, "step": 3828 }, { - "epoch": 1.15, - "grad_norm": 21.10222625732422, - "learning_rate": 1.2326350606394708e-05, - "loss": 1.4309, + "epoch": 0.48, + "grad_norm": 14.529047012329102, + "learning_rate": 1.679705476300046e-05, + "loss": 2.0955, "step": 3829 }, { - "epoch": 1.15, - "grad_norm": 15.547961235046387, - "learning_rate": 1.2324345995790318e-05, - "loss": 1.3606, + "epoch": 0.48, + "grad_norm": 9.023208618164062, + "learning_rate": 1.6796218047943772e-05, + "loss": 1.2004, "step": 3830 }, { - "epoch": 1.15, - "grad_norm": 13.123003005981445, - "learning_rate": 1.232234138518593e-05, - "loss": 2.1664, + "epoch": 0.48, + "grad_norm": 12.670555114746094, + "learning_rate": 1.6795381332887086e-05, + "loss": 3.1739, "step": 3831 }, { - "epoch": 1.15, - "grad_norm": 14.233585357666016, - "learning_rate": 1.2320336774581538e-05, - "loss": 1.9827, + "epoch": 0.48, + "grad_norm": 25.908464431762695, + "learning_rate": 1.67945446178304e-05, + "loss": 2.7281, "step": 3832 }, { - "epoch": 1.15, - "grad_norm": 8.499350547790527, - "learning_rate": 1.2318332163977148e-05, - "loss": 0.8647, + "epoch": 0.48, + "grad_norm": 5.578554153442383, + "learning_rate": 1.6793707902773713e-05, + "loss": 0.2401, "step": 3833 }, { - "epoch": 1.15, - "grad_norm": 24.76041603088379, - "learning_rate": 1.2316327553372758e-05, - "loss": 2.5669, + "epoch": 0.48, + "grad_norm": 14.278937339782715, + "learning_rate": 1.6792871187717023e-05, + "loss": 1.7784, "step": 3834 }, { - "epoch": 1.15, - "grad_norm": 11.308053970336914, - "learning_rate": 1.2314322942768369e-05, - "loss": 1.5768, + "epoch": 0.48, + "grad_norm": 15.300288200378418, + "learning_rate": 1.6792034472660337e-05, + "loss": 1.2436, "step": 3835 }, { - "epoch": 1.15, - "grad_norm": 25.03925132751465, - "learning_rate": 1.2312318332163977e-05, - "loss": 1.7572, + "epoch": 0.48, + "grad_norm": 21.30611228942871, + "learning_rate": 1.679119775760365e-05, + "loss": 1.885, "step": 3836 }, { - "epoch": 1.15, - "grad_norm": 19.462387084960938, - "learning_rate": 1.2310313721559589e-05, - "loss": 2.0888, + "epoch": 0.48, + "grad_norm": 16.563570022583008, + "learning_rate": 1.679036104254696e-05, + "loss": 1.578, "step": 3837 }, { - "epoch": 1.15, - "grad_norm": 12.561948776245117, - "learning_rate": 1.2308309110955199e-05, - "loss": 1.1124, + "epoch": 0.48, + "grad_norm": 19.17254066467285, + "learning_rate": 1.6789524327490274e-05, + "loss": 2.2793, "step": 3838 }, { - "epoch": 1.15, - "grad_norm": 34.78009796142578, - "learning_rate": 1.2306304500350807e-05, - "loss": 1.5277, + "epoch": 0.48, + "grad_norm": 9.509984016418457, + "learning_rate": 1.6788687612433588e-05, + "loss": 1.495, "step": 3839 }, { - "epoch": 1.15, - "grad_norm": 26.024934768676758, - "learning_rate": 1.2304299889746419e-05, - "loss": 2.0164, - "step": 3840 - }, - { - "epoch": 1.15, - "eval_loss": 0.21982242166996002, - "eval_runtime": 43.5261, - "eval_samples_per_second": 33.98, - "eval_steps_per_second": 33.98, + "epoch": 0.48, + "grad_norm": 33.51737594604492, + "learning_rate": 1.67878508973769e-05, + "loss": 2.7438, "step": 3840 }, { - "epoch": 1.15, - "grad_norm": 6.806481838226318, - "learning_rate": 1.2302295279142027e-05, - "loss": 0.7476, + "epoch": 0.48, + "grad_norm": 20.427261352539062, + "learning_rate": 1.6787014182320212e-05, + "loss": 3.2905, "step": 3841 }, { - "epoch": 1.16, - "grad_norm": 18.72743797302246, - "learning_rate": 1.2300290668537637e-05, - "loss": 1.4762, + "epoch": 0.48, + "grad_norm": 11.561785697937012, + "learning_rate": 1.6786177467263526e-05, + "loss": 3.0269, "step": 3842 }, { - "epoch": 1.16, - "grad_norm": 55.82676315307617, - "learning_rate": 1.2298286057933246e-05, - "loss": 1.809, + "epoch": 0.48, + "grad_norm": 19.586536407470703, + "learning_rate": 1.678534075220684e-05, + "loss": 2.9902, "step": 3843 }, { - "epoch": 1.16, - "grad_norm": 13.187909126281738, - "learning_rate": 1.2296281447328858e-05, - "loss": 1.9105, + "epoch": 0.48, + "grad_norm": 10.160581588745117, + "learning_rate": 1.678450403715015e-05, + "loss": 1.4402, "step": 3844 }, { - "epoch": 1.16, - "grad_norm": 11.694255828857422, - "learning_rate": 1.2294276836724468e-05, - "loss": 1.6571, + "epoch": 0.48, + "grad_norm": 21.57250213623047, + "learning_rate": 1.6783667322093463e-05, + "loss": 1.9489, "step": 3845 }, { - "epoch": 1.16, - "grad_norm": 20.324060440063477, - "learning_rate": 1.2292272226120076e-05, - "loss": 1.3941, + "epoch": 0.48, + "grad_norm": 14.982203483581543, + "learning_rate": 1.6782830607036777e-05, + "loss": 1.0441, "step": 3846 }, { - "epoch": 1.16, - "grad_norm": 13.865283012390137, - "learning_rate": 1.2290267615515688e-05, - "loss": 1.9631, + "epoch": 0.48, + "grad_norm": 14.22341251373291, + "learning_rate": 1.6781993891980087e-05, + "loss": 1.8044, "step": 3847 }, { - "epoch": 1.16, - "grad_norm": 18.18462562561035, - "learning_rate": 1.2288263004911296e-05, - "loss": 2.1097, + "epoch": 0.48, + "grad_norm": 6.044835567474365, + "learning_rate": 1.67811571769234e-05, + "loss": 0.6106, "step": 3848 }, { - "epoch": 1.16, - "grad_norm": 12.211462020874023, - "learning_rate": 1.2286258394306906e-05, - "loss": 1.8528, + "epoch": 0.48, + "grad_norm": 28.15514373779297, + "learning_rate": 1.6780320461866714e-05, + "loss": 1.8341, "step": 3849 }, { - "epoch": 1.16, - "grad_norm": 12.799270629882812, - "learning_rate": 1.2284253783702518e-05, - "loss": 1.9839, + "epoch": 0.48, + "grad_norm": 13.517406463623047, + "learning_rate": 1.6779483746810025e-05, + "loss": 1.1603, "step": 3850 }, { - "epoch": 1.16, - "grad_norm": 36.99559020996094, - "learning_rate": 1.2282249173098127e-05, - "loss": 1.7886, + "epoch": 0.48, + "grad_norm": 12.102564811706543, + "learning_rate": 1.6778647031753338e-05, + "loss": 1.1951, "step": 3851 }, { - "epoch": 1.16, - "grad_norm": 15.477313995361328, - "learning_rate": 1.2280244562493737e-05, - "loss": 1.6013, + "epoch": 0.48, + "grad_norm": 22.357250213623047, + "learning_rate": 1.677781031669665e-05, + "loss": 1.5787, "step": 3852 }, { - "epoch": 1.16, - "grad_norm": 12.97021198272705, - "learning_rate": 1.2278239951889345e-05, - "loss": 2.1934, + "epoch": 0.48, + "grad_norm": 8.78569507598877, + "learning_rate": 1.6776973601639962e-05, + "loss": 0.8656, "step": 3853 }, { - "epoch": 1.16, - "grad_norm": 9.071000099182129, - "learning_rate": 1.2276235341284957e-05, - "loss": 1.2502, + "epoch": 0.48, + "grad_norm": 14.004049301147461, + "learning_rate": 1.6776136886583276e-05, + "loss": 2.3198, "step": 3854 }, { - "epoch": 1.16, - "grad_norm": 13.090424537658691, - "learning_rate": 1.2274230730680565e-05, - "loss": 1.4692, + "epoch": 0.48, + "grad_norm": 11.207050323486328, + "learning_rate": 1.6775300171526586e-05, + "loss": 2.6709, "step": 3855 }, { - "epoch": 1.16, - "grad_norm": 15.393147468566895, - "learning_rate": 1.2272226120076175e-05, - "loss": 1.1286, + "epoch": 0.48, + "grad_norm": 17.077152252197266, + "learning_rate": 1.67744634564699e-05, + "loss": 1.5795, "step": 3856 }, { - "epoch": 1.16, - "grad_norm": 38.615726470947266, - "learning_rate": 1.2270221509471787e-05, - "loss": 2.5387, + "epoch": 0.48, + "grad_norm": 25.284257888793945, + "learning_rate": 1.6773626741413213e-05, + "loss": 3.127, "step": 3857 }, { - "epoch": 1.16, - "grad_norm": 28.15321922302246, - "learning_rate": 1.2268216898867395e-05, - "loss": 2.187, + "epoch": 0.48, + "grad_norm": 12.849029541015625, + "learning_rate": 1.6772790026356524e-05, + "loss": 1.9975, "step": 3858 }, { - "epoch": 1.16, - "grad_norm": 12.39357852935791, - "learning_rate": 1.2266212288263007e-05, - "loss": 2.2882, + "epoch": 0.48, + "grad_norm": 12.927366256713867, + "learning_rate": 1.6771953311299837e-05, + "loss": 3.0397, "step": 3859 }, { - "epoch": 1.16, - "grad_norm": 30.955167770385742, - "learning_rate": 1.2264207677658616e-05, - "loss": 1.8665, + "epoch": 0.48, + "grad_norm": 7.013041973114014, + "learning_rate": 1.677111659624315e-05, + "loss": 0.2361, "step": 3860 }, { - "epoch": 1.16, - "grad_norm": 16.133106231689453, - "learning_rate": 1.2262203067054226e-05, - "loss": 1.8293, + "epoch": 0.48, + "grad_norm": 10.52235221862793, + "learning_rate": 1.6770279881186465e-05, + "loss": 1.1664, "step": 3861 }, { - "epoch": 1.16, - "grad_norm": 7.856118202209473, - "learning_rate": 1.2260198456449837e-05, - "loss": 0.9778, + "epoch": 0.48, + "grad_norm": 5.555588722229004, + "learning_rate": 1.6769443166129775e-05, + "loss": 0.4259, "step": 3862 }, { - "epoch": 1.16, - "grad_norm": 12.470169067382812, - "learning_rate": 1.2258193845845446e-05, - "loss": 1.4214, + "epoch": 0.48, + "grad_norm": 6.372686862945557, + "learning_rate": 1.676860645107309e-05, + "loss": 1.5433, "step": 3863 }, { - "epoch": 1.16, - "grad_norm": 19.56519317626953, - "learning_rate": 1.2256189235241056e-05, - "loss": 2.2586, + "epoch": 0.48, + "grad_norm": 12.421819686889648, + "learning_rate": 1.6767769736016402e-05, + "loss": 1.1898, "step": 3864 }, { - "epoch": 1.16, - "grad_norm": 23.616268157958984, - "learning_rate": 1.2254184624636664e-05, - "loss": 2.7421, + "epoch": 0.49, + "grad_norm": 13.235730171203613, + "learning_rate": 1.6766933020959712e-05, + "loss": 1.0993, "step": 3865 }, { - "epoch": 1.16, - "grad_norm": 15.930326461791992, - "learning_rate": 1.2252180014032276e-05, - "loss": 2.1892, + "epoch": 0.49, + "grad_norm": 12.569911003112793, + "learning_rate": 1.6766096305903026e-05, + "loss": 2.0045, "step": 3866 }, { - "epoch": 1.16, - "grad_norm": 24.9573974609375, - "learning_rate": 1.2250175403427884e-05, - "loss": 2.7173, + "epoch": 0.49, + "grad_norm": 19.18654441833496, + "learning_rate": 1.676525959084634e-05, + "loss": 1.0166, "step": 3867 }, { - "epoch": 1.16, - "grad_norm": 8.64990234375, - "learning_rate": 1.2248170792823495e-05, - "loss": 1.8587, + "epoch": 0.49, + "grad_norm": 12.57631778717041, + "learning_rate": 1.6764422875789653e-05, + "loss": 1.365, "step": 3868 }, { - "epoch": 1.16, - "grad_norm": 29.846378326416016, - "learning_rate": 1.2246166182219106e-05, - "loss": 1.8677, + "epoch": 0.49, + "grad_norm": 12.228401184082031, + "learning_rate": 1.6763586160732964e-05, + "loss": 1.8901, "step": 3869 }, { - "epoch": 1.16, - "grad_norm": 11.002440452575684, - "learning_rate": 1.2244161571614715e-05, - "loss": 1.6396, + "epoch": 0.49, + "grad_norm": 18.046661376953125, + "learning_rate": 1.6762749445676277e-05, + "loss": 2.1157, "step": 3870 }, { - "epoch": 1.16, - "grad_norm": 16.57766342163086, - "learning_rate": 1.2242156961010325e-05, - "loss": 1.381, + "epoch": 0.49, + "grad_norm": 17.057580947875977, + "learning_rate": 1.676191273061959e-05, + "loss": 1.4784, "step": 3871 }, { - "epoch": 1.16, - "grad_norm": 16.003095626831055, - "learning_rate": 1.2240152350405933e-05, - "loss": 1.7541, + "epoch": 0.49, + "grad_norm": 2.8317675590515137, + "learning_rate": 1.67610760155629e-05, + "loss": 0.1153, "step": 3872 }, { - "epoch": 1.16, - "grad_norm": 16.007123947143555, - "learning_rate": 1.2238147739801545e-05, - "loss": 1.9818, + "epoch": 0.49, + "grad_norm": 8.640043258666992, + "learning_rate": 1.6760239300506215e-05, + "loss": 1.1723, "step": 3873 }, { - "epoch": 1.16, - "grad_norm": 16.965221405029297, - "learning_rate": 1.2236143129197155e-05, - "loss": 2.3105, + "epoch": 0.49, + "grad_norm": 9.828316688537598, + "learning_rate": 1.675940258544953e-05, + "loss": 2.7181, "step": 3874 }, { - "epoch": 1.17, - "grad_norm": 27.36821174621582, - "learning_rate": 1.2234138518592763e-05, - "loss": 2.4866, + "epoch": 0.49, + "grad_norm": 5.815865993499756, + "learning_rate": 1.675856587039284e-05, + "loss": 0.4049, "step": 3875 }, { - "epoch": 1.17, - "grad_norm": 13.598553657531738, - "learning_rate": 1.2232133907988375e-05, - "loss": 1.654, + "epoch": 0.49, + "grad_norm": 25.681018829345703, + "learning_rate": 1.6757729155336152e-05, + "loss": 3.4645, "step": 3876 }, { - "epoch": 1.17, - "grad_norm": 15.706884384155273, - "learning_rate": 1.2230129297383984e-05, - "loss": 2.687, + "epoch": 0.49, + "grad_norm": 9.771852493286133, + "learning_rate": 1.6756892440279463e-05, + "loss": 2.0329, "step": 3877 }, { - "epoch": 1.17, - "grad_norm": 16.47588348388672, - "learning_rate": 1.2228124686779594e-05, - "loss": 1.6308, + "epoch": 0.49, + "grad_norm": 24.728055953979492, + "learning_rate": 1.6756055725222776e-05, + "loss": 3.5055, "step": 3878 }, { - "epoch": 1.17, - "grad_norm": 12.876527786254883, - "learning_rate": 1.2226120076175204e-05, - "loss": 1.4511, + "epoch": 0.49, + "grad_norm": 8.333413124084473, + "learning_rate": 1.675521901016609e-05, + "loss": 2.9655, "step": 3879 }, { - "epoch": 1.17, - "grad_norm": 20.67323112487793, - "learning_rate": 1.2224115465570814e-05, - "loss": 1.4801, + "epoch": 0.49, + "grad_norm": 15.079514503479004, + "learning_rate": 1.67543822951094e-05, + "loss": 2.9962, "step": 3880 }, { - "epoch": 1.17, - "grad_norm": 29.967750549316406, - "learning_rate": 1.2222110854966424e-05, - "loss": 2.2102, + "epoch": 0.49, + "grad_norm": 16.224035263061523, + "learning_rate": 1.6753545580052714e-05, + "loss": 1.4763, "step": 3881 }, { - "epoch": 1.17, - "grad_norm": 12.949405670166016, - "learning_rate": 1.2220106244362034e-05, - "loss": 1.8167, + "epoch": 0.49, + "grad_norm": 6.735492706298828, + "learning_rate": 1.6752708864996027e-05, + "loss": 0.4744, "step": 3882 }, { - "epoch": 1.17, - "grad_norm": 26.90522575378418, - "learning_rate": 1.2218101633757644e-05, - "loss": 1.7836, + "epoch": 0.49, + "grad_norm": 13.388711929321289, + "learning_rate": 1.6751872149939338e-05, + "loss": 0.7654, "step": 3883 }, { - "epoch": 1.17, - "grad_norm": 14.43256950378418, - "learning_rate": 1.2216097023153253e-05, - "loss": 1.5144, + "epoch": 0.49, + "grad_norm": 25.84760093688965, + "learning_rate": 1.675103543488265e-05, + "loss": 3.6068, "step": 3884 }, { - "epoch": 1.17, - "grad_norm": 10.095649719238281, - "learning_rate": 1.2214092412548864e-05, - "loss": 0.8749, + "epoch": 0.49, + "grad_norm": 11.907512664794922, + "learning_rate": 1.6750198719825965e-05, + "loss": 1.642, "step": 3885 }, { - "epoch": 1.17, - "grad_norm": 7.504431247711182, - "learning_rate": 1.2212087801944473e-05, - "loss": 1.5074, + "epoch": 0.49, + "grad_norm": 12.980208396911621, + "learning_rate": 1.6749362004769275e-05, + "loss": 1.0291, "step": 3886 }, { - "epoch": 1.17, - "grad_norm": 19.273103713989258, - "learning_rate": 1.2210083191340083e-05, - "loss": 2.6758, + "epoch": 0.49, + "grad_norm": 12.23293685913086, + "learning_rate": 1.674852528971259e-05, + "loss": 2.3221, "step": 3887 }, { - "epoch": 1.17, - "grad_norm": 24.052509307861328, - "learning_rate": 1.2208078580735695e-05, - "loss": 1.6407, + "epoch": 0.49, + "grad_norm": 13.18155288696289, + "learning_rate": 1.6747688574655903e-05, + "loss": 1.0233, "step": 3888 }, { - "epoch": 1.17, - "grad_norm": 19.1132869720459, - "learning_rate": 1.2206073970131303e-05, - "loss": 1.7163, + "epoch": 0.49, + "grad_norm": 8.485978126525879, + "learning_rate": 1.6746851859599216e-05, + "loss": 1.2268, "step": 3889 }, { - "epoch": 1.17, - "grad_norm": 51.50567626953125, - "learning_rate": 1.2204069359526913e-05, - "loss": 2.9385, + "epoch": 0.49, + "grad_norm": 9.914412498474121, + "learning_rate": 1.6746015144542526e-05, + "loss": 1.3486, "step": 3890 }, { - "epoch": 1.17, - "grad_norm": 18.24009132385254, - "learning_rate": 1.2202064748922521e-05, - "loss": 1.7915, + "epoch": 0.49, + "grad_norm": 6.944596290588379, + "learning_rate": 1.674517842948584e-05, + "loss": 1.0267, "step": 3891 }, { - "epoch": 1.17, - "grad_norm": 14.289188385009766, - "learning_rate": 1.2200060138318133e-05, - "loss": 1.8599, + "epoch": 0.49, + "grad_norm": 10.893056869506836, + "learning_rate": 1.6744341714429154e-05, + "loss": 0.8046, "step": 3892 }, { - "epoch": 1.17, - "grad_norm": 35.022804260253906, - "learning_rate": 1.2198055527713743e-05, - "loss": 2.9202, + "epoch": 0.49, + "grad_norm": 20.531518936157227, + "learning_rate": 1.6743504999372464e-05, + "loss": 3.314, "step": 3893 }, { - "epoch": 1.17, - "grad_norm": 25.463573455810547, - "learning_rate": 1.2196050917109352e-05, - "loss": 1.622, + "epoch": 0.49, + "grad_norm": 26.03514862060547, + "learning_rate": 1.6742668284315778e-05, + "loss": 1.8338, "step": 3894 }, { - "epoch": 1.17, - "grad_norm": 10.089998245239258, - "learning_rate": 1.2194046306504963e-05, - "loss": 1.1327, + "epoch": 0.49, + "grad_norm": 14.674633026123047, + "learning_rate": 1.674183156925909e-05, + "loss": 2.1986, "step": 3895 }, { - "epoch": 1.17, - "grad_norm": 23.384336471557617, - "learning_rate": 1.2192041695900572e-05, - "loss": 1.2661, + "epoch": 0.49, + "grad_norm": 22.857954025268555, + "learning_rate": 1.6740994854202405e-05, + "loss": 3.8863, "step": 3896 }, { - "epoch": 1.17, - "grad_norm": 12.663224220275879, - "learning_rate": 1.2190037085296182e-05, - "loss": 2.6986, + "epoch": 0.49, + "grad_norm": 16.738014221191406, + "learning_rate": 1.6740158139145715e-05, + "loss": 2.1353, "step": 3897 }, { - "epoch": 1.17, - "grad_norm": 16.6176700592041, - "learning_rate": 1.218803247469179e-05, - "loss": 1.894, + "epoch": 0.49, + "grad_norm": 14.383560180664062, + "learning_rate": 1.673932142408903e-05, + "loss": 3.1573, "step": 3898 }, { - "epoch": 1.17, - "grad_norm": 12.314738273620605, - "learning_rate": 1.2186027864087402e-05, - "loss": 1.2068, + "epoch": 0.49, + "grad_norm": 31.759464263916016, + "learning_rate": 1.6738484709032343e-05, + "loss": 2.1836, "step": 3899 }, { - "epoch": 1.17, - "grad_norm": 23.991886138916016, - "learning_rate": 1.2184023253483012e-05, - "loss": 1.6332, + "epoch": 0.49, + "grad_norm": 9.206013679504395, + "learning_rate": 1.6737647993975653e-05, + "loss": 1.3652, "step": 3900 }, { - "epoch": 1.17, - "grad_norm": 23.119836807250977, - "learning_rate": 1.218201864287862e-05, - "loss": 2.1423, + "epoch": 0.49, + "grad_norm": 7.129506587982178, + "learning_rate": 1.6736811278918966e-05, + "loss": 1.1021, "step": 3901 }, { - "epoch": 1.17, - "grad_norm": 29.719497680664062, - "learning_rate": 1.2180014032274232e-05, - "loss": 1.4554, + "epoch": 0.49, + "grad_norm": 21.61859703063965, + "learning_rate": 1.673597456386228e-05, + "loss": 2.0031, "step": 3902 }, { - "epoch": 1.17, - "grad_norm": 13.610468864440918, - "learning_rate": 1.217800942166984e-05, - "loss": 1.6478, + "epoch": 0.49, + "grad_norm": 16.006519317626953, + "learning_rate": 1.673513784880559e-05, + "loss": 2.8328, "step": 3903 }, { - "epoch": 1.17, - "grad_norm": 12.19708251953125, - "learning_rate": 1.217600481106545e-05, - "loss": 1.4697, + "epoch": 0.49, + "grad_norm": 14.28494930267334, + "learning_rate": 1.6734301133748904e-05, + "loss": 1.6195, "step": 3904 }, { - "epoch": 1.17, - "grad_norm": 14.37850570678711, - "learning_rate": 1.2174000200461063e-05, - "loss": 1.2426, + "epoch": 0.49, + "grad_norm": 27.926685333251953, + "learning_rate": 1.6733464418692214e-05, + "loss": 1.0091, "step": 3905 }, { - "epoch": 1.17, - "grad_norm": 16.169696807861328, - "learning_rate": 1.2171995589856671e-05, - "loss": 1.7116, + "epoch": 0.49, + "grad_norm": 12.705221176147461, + "learning_rate": 1.6732627703635528e-05, + "loss": 1.2817, "step": 3906 }, { - "epoch": 1.17, - "grad_norm": 14.644316673278809, - "learning_rate": 1.2169990979252281e-05, - "loss": 1.5071, + "epoch": 0.49, + "grad_norm": 13.459248542785645, + "learning_rate": 1.673179098857884e-05, + "loss": 1.7091, "step": 3907 }, { - "epoch": 1.17, - "grad_norm": 10.753741264343262, - "learning_rate": 1.2167986368647891e-05, - "loss": 1.8289, + "epoch": 0.49, + "grad_norm": 19.173797607421875, + "learning_rate": 1.6730954273522152e-05, + "loss": 3.2456, "step": 3908 }, { - "epoch": 1.18, - "grad_norm": 17.671937942504883, - "learning_rate": 1.2165981758043501e-05, - "loss": 1.3769, + "epoch": 0.49, + "grad_norm": 16.161258697509766, + "learning_rate": 1.6730117558465465e-05, + "loss": 1.4467, "step": 3909 }, { - "epoch": 1.18, - "grad_norm": 10.696758270263672, - "learning_rate": 1.216397714743911e-05, - "loss": 1.7544, + "epoch": 0.49, + "grad_norm": 24.011537551879883, + "learning_rate": 1.672928084340878e-05, + "loss": 2.3174, "step": 3910 }, { - "epoch": 1.18, - "grad_norm": 10.894145011901855, - "learning_rate": 1.2161972536834721e-05, - "loss": 1.3638, + "epoch": 0.49, + "grad_norm": 15.91162395477295, + "learning_rate": 1.672844412835209e-05, + "loss": 4.2072, "step": 3911 }, { - "epoch": 1.18, - "grad_norm": 14.590895652770996, - "learning_rate": 1.2159967926230332e-05, - "loss": 1.4625, + "epoch": 0.49, + "grad_norm": 24.110483169555664, + "learning_rate": 1.6727607413295403e-05, + "loss": 1.7958, "step": 3912 }, { - "epoch": 1.18, - "grad_norm": 20.419675827026367, - "learning_rate": 1.215796331562594e-05, - "loss": 2.223, + "epoch": 0.49, + "grad_norm": 8.102417945861816, + "learning_rate": 1.6726770698238717e-05, + "loss": 1.6236, "step": 3913 }, { - "epoch": 1.18, - "grad_norm": 29.404043197631836, - "learning_rate": 1.2155958705021552e-05, - "loss": 1.6396, + "epoch": 0.49, + "grad_norm": 8.216350555419922, + "learning_rate": 1.6725933983182027e-05, + "loss": 3.1026, "step": 3914 }, { - "epoch": 1.18, - "grad_norm": 24.126564025878906, - "learning_rate": 1.215395409441716e-05, - "loss": 1.6138, + "epoch": 0.49, + "grad_norm": 9.680403709411621, + "learning_rate": 1.672509726812534e-05, + "loss": 2.7506, "step": 3915 }, { - "epoch": 1.18, - "grad_norm": 13.458683967590332, - "learning_rate": 1.215194948381277e-05, - "loss": 2.0191, + "epoch": 0.49, + "grad_norm": 11.488420486450195, + "learning_rate": 1.6724260553068654e-05, + "loss": 2.0227, "step": 3916 }, { - "epoch": 1.18, - "grad_norm": 9.993988037109375, - "learning_rate": 1.2149944873208382e-05, - "loss": 1.3779, + "epoch": 0.49, + "grad_norm": 32.506500244140625, + "learning_rate": 1.6723423838011968e-05, + "loss": 1.5272, "step": 3917 }, { - "epoch": 1.18, - "grad_norm": 13.048202514648438, - "learning_rate": 1.214794026260399e-05, - "loss": 1.8113, + "epoch": 0.49, + "grad_norm": 5.9329352378845215, + "learning_rate": 1.6722587122955278e-05, + "loss": 0.4988, "step": 3918 }, { - "epoch": 1.18, - "grad_norm": 11.234829902648926, - "learning_rate": 1.21459356519996e-05, - "loss": 1.4256, + "epoch": 0.49, + "grad_norm": 7.876817226409912, + "learning_rate": 1.6721750407898592e-05, + "loss": 1.166, "step": 3919 }, { - "epoch": 1.18, - "grad_norm": 25.56989860534668, - "learning_rate": 1.2143931041395209e-05, - "loss": 2.0883, + "epoch": 0.49, + "grad_norm": 11.694708824157715, + "learning_rate": 1.6720913692841905e-05, + "loss": 1.7381, "step": 3920 }, { - "epoch": 1.18, - "grad_norm": 12.80074405670166, - "learning_rate": 1.214192643079082e-05, - "loss": 1.7903, + "epoch": 0.49, + "grad_norm": 17.446664810180664, + "learning_rate": 1.6720076977785216e-05, + "loss": 1.4075, "step": 3921 }, { - "epoch": 1.18, - "grad_norm": 26.22979736328125, - "learning_rate": 1.2139921820186429e-05, - "loss": 2.3038, + "epoch": 0.49, + "grad_norm": 15.675682067871094, + "learning_rate": 1.671924026272853e-05, + "loss": 2.8233, "step": 3922 }, { - "epoch": 1.18, - "grad_norm": 15.948308944702148, - "learning_rate": 1.2137917209582039e-05, - "loss": 1.9847, + "epoch": 0.49, + "grad_norm": 22.835962295532227, + "learning_rate": 1.6718403547671843e-05, + "loss": 1.0312, "step": 3923 }, { - "epoch": 1.18, - "grad_norm": 20.231281280517578, - "learning_rate": 1.213591259897765e-05, - "loss": 2.6807, + "epoch": 0.49, + "grad_norm": 9.870447158813477, + "learning_rate": 1.6717566832615157e-05, + "loss": 1.2746, "step": 3924 }, { - "epoch": 1.18, - "grad_norm": 26.88908576965332, - "learning_rate": 1.213390798837326e-05, - "loss": 2.2021, + "epoch": 0.49, + "grad_norm": 43.03246307373047, + "learning_rate": 1.6716730117558467e-05, + "loss": 2.252, "step": 3925 }, { - "epoch": 1.18, - "grad_norm": 9.647951126098633, - "learning_rate": 1.213190337776887e-05, - "loss": 1.7718, + "epoch": 0.49, + "grad_norm": 10.218485832214355, + "learning_rate": 1.671589340250178e-05, + "loss": 0.9047, "step": 3926 }, { - "epoch": 1.18, - "grad_norm": 25.476966857910156, - "learning_rate": 1.212989876716448e-05, - "loss": 1.7512, + "epoch": 0.49, + "grad_norm": 9.962735176086426, + "learning_rate": 1.6715056687445094e-05, + "loss": 1.1066, "step": 3927 }, { - "epoch": 1.18, - "grad_norm": 17.493688583374023, - "learning_rate": 1.212789415656009e-05, - "loss": 2.0739, + "epoch": 0.49, + "grad_norm": 11.211706161499023, + "learning_rate": 1.6714219972388404e-05, + "loss": 1.5452, "step": 3928 }, { - "epoch": 1.18, - "grad_norm": 18.608076095581055, - "learning_rate": 1.2125889545955698e-05, - "loss": 1.8991, + "epoch": 0.49, + "grad_norm": 11.349990844726562, + "learning_rate": 1.6713383257331718e-05, + "loss": 0.8992, "step": 3929 }, { - "epoch": 1.18, - "grad_norm": 26.32684326171875, - "learning_rate": 1.212388493535131e-05, - "loss": 1.3218, + "epoch": 0.49, + "grad_norm": 14.319396018981934, + "learning_rate": 1.671254654227503e-05, + "loss": 2.6085, "step": 3930 }, { - "epoch": 1.18, - "grad_norm": 14.290658950805664, - "learning_rate": 1.212188032474692e-05, - "loss": 1.6064, + "epoch": 0.49, + "grad_norm": 34.960479736328125, + "learning_rate": 1.6711709827218342e-05, + "loss": 2.4196, "step": 3931 }, { - "epoch": 1.18, - "grad_norm": 8.176145553588867, - "learning_rate": 1.2119875714142528e-05, - "loss": 1.101, + "epoch": 0.49, + "grad_norm": 18.694778442382812, + "learning_rate": 1.6710873112161656e-05, + "loss": 1.6912, "step": 3932 }, { - "epoch": 1.18, - "grad_norm": 77.52426147460938, - "learning_rate": 1.211787110353814e-05, - "loss": 1.8667, + "epoch": 0.49, + "grad_norm": 33.8570442199707, + "learning_rate": 1.6710036397104966e-05, + "loss": 2.4436, "step": 3933 }, { - "epoch": 1.18, - "grad_norm": 13.218364715576172, - "learning_rate": 1.2115866492933748e-05, - "loss": 2.0845, + "epoch": 0.49, + "grad_norm": 6.813931465148926, + "learning_rate": 1.670919968204828e-05, + "loss": 4.083, "step": 3934 }, { - "epoch": 1.18, - "grad_norm": 13.180416107177734, - "learning_rate": 1.2113861882329358e-05, - "loss": 1.7175, + "epoch": 0.49, + "grad_norm": 4.025628089904785, + "learning_rate": 1.670836296699159e-05, + "loss": 0.09, "step": 3935 }, { - "epoch": 1.18, - "grad_norm": 27.077486038208008, - "learning_rate": 1.211185727172497e-05, - "loss": 1.5133, + "epoch": 0.49, + "grad_norm": 7.611042499542236, + "learning_rate": 1.6707526251934903e-05, + "loss": 0.9509, "step": 3936 }, { - "epoch": 1.18, - "grad_norm": 14.540903091430664, - "learning_rate": 1.2109852661120579e-05, - "loss": 2.1343, + "epoch": 0.49, + "grad_norm": 56.0538330078125, + "learning_rate": 1.6706689536878217e-05, + "loss": 1.2226, "step": 3937 }, { - "epoch": 1.18, - "grad_norm": 18.087007522583008, - "learning_rate": 1.2107848050516189e-05, - "loss": 1.6534, + "epoch": 0.49, + "grad_norm": 12.239697456359863, + "learning_rate": 1.670585282182153e-05, + "loss": 2.6821, "step": 3938 }, { - "epoch": 1.18, - "grad_norm": 18.4873046875, - "learning_rate": 1.2105843439911797e-05, - "loss": 1.8601, + "epoch": 0.49, + "grad_norm": 21.050329208374023, + "learning_rate": 1.670501610676484e-05, + "loss": 1.2584, "step": 3939 }, { - "epoch": 1.18, - "grad_norm": 11.12307071685791, - "learning_rate": 1.2103838829307409e-05, - "loss": 1.4969, + "epoch": 0.49, + "grad_norm": 46.02468490600586, + "learning_rate": 1.6704179391708155e-05, + "loss": 2.3161, "step": 3940 }, { - "epoch": 1.18, - "grad_norm": 22.688817977905273, - "learning_rate": 1.2101834218703017e-05, - "loss": 2.0138, + "epoch": 0.49, + "grad_norm": 31.32345199584961, + "learning_rate": 1.670334267665147e-05, + "loss": 2.216, "step": 3941 }, { - "epoch": 1.19, - "grad_norm": 32.38698196411133, - "learning_rate": 1.2099829608098627e-05, - "loss": 3.5612, + "epoch": 0.49, + "grad_norm": 13.726557731628418, + "learning_rate": 1.670250596159478e-05, + "loss": 2.2354, "step": 3942 }, { - "epoch": 1.19, - "grad_norm": 40.410884857177734, - "learning_rate": 1.2097824997494239e-05, - "loss": 2.6222, + "epoch": 0.49, + "grad_norm": 12.190067291259766, + "learning_rate": 1.6701669246538092e-05, + "loss": 1.2092, "step": 3943 }, { - "epoch": 1.19, - "grad_norm": 11.373454093933105, - "learning_rate": 1.2095820386889847e-05, - "loss": 1.8325, + "epoch": 0.49, + "grad_norm": 19.066221237182617, + "learning_rate": 1.6700832531481406e-05, + "loss": 2.4095, "step": 3944 }, { - "epoch": 1.19, - "grad_norm": 57.750003814697266, - "learning_rate": 1.2093815776285458e-05, - "loss": 2.7973, + "epoch": 0.5, + "grad_norm": 48.04729080200195, + "learning_rate": 1.669999581642472e-05, + "loss": 1.9395, "step": 3945 }, { - "epoch": 1.19, - "grad_norm": 18.441944122314453, - "learning_rate": 1.2091811165681066e-05, - "loss": 2.9912, + "epoch": 0.5, + "grad_norm": 15.870589256286621, + "learning_rate": 1.669915910136803e-05, + "loss": 2.7708, "step": 3946 }, { - "epoch": 1.19, - "grad_norm": 32.177513122558594, - "learning_rate": 1.2089806555076678e-05, - "loss": 1.9056, + "epoch": 0.5, + "grad_norm": 8.784354209899902, + "learning_rate": 1.6698322386311343e-05, + "loss": 2.7359, "step": 3947 }, { - "epoch": 1.19, - "grad_norm": 22.477294921875, - "learning_rate": 1.2087801944472288e-05, - "loss": 2.2548, + "epoch": 0.5, + "grad_norm": 19.37180519104004, + "learning_rate": 1.6697485671254657e-05, + "loss": 1.7255, "step": 3948 }, { - "epoch": 1.19, - "grad_norm": 20.485315322875977, - "learning_rate": 1.2085797333867896e-05, - "loss": 1.4721, + "epoch": 0.5, + "grad_norm": 16.548009872436523, + "learning_rate": 1.6696648956197967e-05, + "loss": 1.3249, "step": 3949 }, { - "epoch": 1.19, - "grad_norm": 14.799079895019531, - "learning_rate": 1.2083792723263508e-05, - "loss": 1.8698, + "epoch": 0.5, + "grad_norm": 8.36740493774414, + "learning_rate": 1.669581224114128e-05, + "loss": 0.97, "step": 3950 }, { - "epoch": 1.19, - "grad_norm": 13.152766227722168, - "learning_rate": 1.2081788112659116e-05, - "loss": 2.6729, + "epoch": 0.5, + "grad_norm": 14.328258514404297, + "learning_rate": 1.6694975526084595e-05, + "loss": 1.6861, "step": 3951 }, { - "epoch": 1.19, - "grad_norm": 10.631035804748535, - "learning_rate": 1.2079783502054726e-05, - "loss": 1.7254, + "epoch": 0.5, + "grad_norm": 10.262837409973145, + "learning_rate": 1.6694138811027908e-05, + "loss": 1.8354, "step": 3952 }, { - "epoch": 1.19, - "grad_norm": 15.786643981933594, - "learning_rate": 1.2077778891450336e-05, - "loss": 1.1798, + "epoch": 0.5, + "grad_norm": 10.428544044494629, + "learning_rate": 1.669330209597122e-05, + "loss": 1.3457, "step": 3953 }, { - "epoch": 1.19, - "grad_norm": 12.110862731933594, - "learning_rate": 1.2075774280845947e-05, - "loss": 1.4401, + "epoch": 0.5, + "grad_norm": 13.362865447998047, + "learning_rate": 1.6692465380914532e-05, + "loss": 2.1365, "step": 3954 }, { - "epoch": 1.19, - "grad_norm": 15.332069396972656, - "learning_rate": 1.2073769670241557e-05, - "loss": 2.4265, + "epoch": 0.5, + "grad_norm": 11.466238975524902, + "learning_rate": 1.6691628665857842e-05, + "loss": 1.1351, "step": 3955 }, { - "epoch": 1.19, - "grad_norm": 26.413911819458008, - "learning_rate": 1.2071765059637167e-05, - "loss": 1.7556, + "epoch": 0.5, + "grad_norm": 17.091264724731445, + "learning_rate": 1.6690791950801156e-05, + "loss": 2.4696, "step": 3956 }, { - "epoch": 1.19, - "grad_norm": 10.954570770263672, - "learning_rate": 1.2069760449032777e-05, - "loss": 0.8213, + "epoch": 0.5, + "grad_norm": 18.040620803833008, + "learning_rate": 1.668995523574447e-05, + "loss": 2.4856, "step": 3957 }, { - "epoch": 1.19, - "grad_norm": 20.662534713745117, - "learning_rate": 1.2067755838428385e-05, - "loss": 1.6746, + "epoch": 0.5, + "grad_norm": 10.580212593078613, + "learning_rate": 1.668911852068778e-05, + "loss": 1.9885, "step": 3958 }, { - "epoch": 1.19, - "grad_norm": 9.891891479492188, - "learning_rate": 1.2065751227823997e-05, - "loss": 1.3557, + "epoch": 0.5, + "grad_norm": 7.360715389251709, + "learning_rate": 1.6688281805631094e-05, + "loss": 0.5205, "step": 3959 }, { - "epoch": 1.19, - "grad_norm": 13.939960479736328, - "learning_rate": 1.2063746617219607e-05, - "loss": 1.6665, - "step": 3960 - }, - { - "epoch": 1.19, - "eval_loss": 0.22422289848327637, - "eval_runtime": 43.495, - "eval_samples_per_second": 34.004, - "eval_steps_per_second": 34.004, + "epoch": 0.5, + "grad_norm": 17.456523895263672, + "learning_rate": 1.6687445090574407e-05, + "loss": 1.7968, "step": 3960 }, { - "epoch": 1.19, - "grad_norm": 50.82015609741211, - "learning_rate": 1.2061742006615215e-05, - "loss": 2.8102, + "epoch": 0.5, + "grad_norm": 10.726287841796875, + "learning_rate": 1.6686608375517718e-05, + "loss": 1.5141, "step": 3961 }, { - "epoch": 1.19, - "grad_norm": 10.213366508483887, - "learning_rate": 1.2059737396010827e-05, - "loss": 1.7045, + "epoch": 0.5, + "grad_norm": 8.44047737121582, + "learning_rate": 1.668577166046103e-05, + "loss": 1.1776, "step": 3962 }, { - "epoch": 1.19, - "grad_norm": 9.090950965881348, - "learning_rate": 1.2057732785406436e-05, - "loss": 1.0682, + "epoch": 0.5, + "grad_norm": 16.860645294189453, + "learning_rate": 1.668493494540434e-05, + "loss": 5.1519, "step": 3963 }, { - "epoch": 1.19, - "grad_norm": 14.20166015625, - "learning_rate": 1.2055728174802046e-05, - "loss": 1.6642, + "epoch": 0.5, + "grad_norm": 11.764198303222656, + "learning_rate": 1.6684098230347655e-05, + "loss": 1.5988, "step": 3964 }, { - "epoch": 1.19, - "grad_norm": 14.564234733581543, - "learning_rate": 1.2053723564197654e-05, - "loss": 1.6248, + "epoch": 0.5, + "grad_norm": 5.626613616943359, + "learning_rate": 1.668326151529097e-05, + "loss": 1.0498, "step": 3965 }, { - "epoch": 1.19, - "grad_norm": 16.231090545654297, - "learning_rate": 1.2051718953593266e-05, - "loss": 1.2251, + "epoch": 0.5, + "grad_norm": 53.96699523925781, + "learning_rate": 1.6682424800234282e-05, + "loss": 1.1779, "step": 3966 }, { - "epoch": 1.19, - "grad_norm": 15.789569854736328, - "learning_rate": 1.2049714342988876e-05, - "loss": 2.4118, + "epoch": 0.5, + "grad_norm": 4.14115571975708, + "learning_rate": 1.6681588085177593e-05, + "loss": 0.1223, "step": 3967 }, { - "epoch": 1.19, - "grad_norm": 18.06235122680664, - "learning_rate": 1.2047709732384484e-05, - "loss": 1.4758, + "epoch": 0.5, + "grad_norm": 35.95234680175781, + "learning_rate": 1.6680751370120906e-05, + "loss": 1.6813, "step": 3968 }, { - "epoch": 1.19, - "grad_norm": 18.72103500366211, - "learning_rate": 1.2045705121780096e-05, - "loss": 1.416, + "epoch": 0.5, + "grad_norm": 11.989574432373047, + "learning_rate": 1.667991465506422e-05, + "loss": 1.5719, "step": 3969 }, { - "epoch": 1.19, - "grad_norm": 67.29039001464844, - "learning_rate": 1.2043700511175705e-05, - "loss": 3.1432, + "epoch": 0.5, + "grad_norm": 9.279258728027344, + "learning_rate": 1.667907794000753e-05, + "loss": 1.3138, "step": 3970 }, { - "epoch": 1.19, - "grad_norm": 9.234892845153809, - "learning_rate": 1.2041695900571315e-05, - "loss": 1.0072, + "epoch": 0.5, + "grad_norm": 7.835389614105225, + "learning_rate": 1.6678241224950844e-05, + "loss": 1.4085, "step": 3971 }, { - "epoch": 1.19, - "grad_norm": 16.655811309814453, - "learning_rate": 1.2039691289966923e-05, - "loss": 1.4894, + "epoch": 0.5, + "grad_norm": 10.986465454101562, + "learning_rate": 1.6677404509894158e-05, + "loss": 1.6206, "step": 3972 }, { - "epoch": 1.19, - "grad_norm": 24.546369552612305, - "learning_rate": 1.2037686679362535e-05, - "loss": 1.6944, + "epoch": 0.5, + "grad_norm": 21.686466217041016, + "learning_rate": 1.667656779483747e-05, + "loss": 1.826, "step": 3973 }, { - "epoch": 1.19, - "grad_norm": 22.591285705566406, - "learning_rate": 1.2035682068758145e-05, - "loss": 1.4835, + "epoch": 0.5, + "grad_norm": 10.920929908752441, + "learning_rate": 1.667573107978078e-05, + "loss": 1.2232, "step": 3974 }, { - "epoch": 1.2, - "grad_norm": 19.460481643676758, - "learning_rate": 1.2033677458153753e-05, - "loss": 2.3772, + "epoch": 0.5, + "grad_norm": 14.287152290344238, + "learning_rate": 1.6674894364724095e-05, + "loss": 0.3653, "step": 3975 }, { - "epoch": 1.2, - "grad_norm": 10.695039749145508, - "learning_rate": 1.2031672847549365e-05, - "loss": 1.3966, + "epoch": 0.5, + "grad_norm": 40.73248291015625, + "learning_rate": 1.667405764966741e-05, + "loss": 3.1087, "step": 3976 }, { - "epoch": 1.2, - "grad_norm": 16.96231460571289, - "learning_rate": 1.2029668236944973e-05, - "loss": 1.5844, + "epoch": 0.5, + "grad_norm": 10.77466106414795, + "learning_rate": 1.667322093461072e-05, + "loss": 1.8408, "step": 3977 }, { - "epoch": 1.2, - "grad_norm": 13.368218421936035, - "learning_rate": 1.2027663626340585e-05, - "loss": 2.2137, + "epoch": 0.5, + "grad_norm": 16.43342399597168, + "learning_rate": 1.6672384219554033e-05, + "loss": 1.9936, "step": 3978 }, { - "epoch": 1.2, - "grad_norm": 16.881895065307617, - "learning_rate": 1.2025659015736195e-05, - "loss": 1.809, + "epoch": 0.5, + "grad_norm": 12.749366760253906, + "learning_rate": 1.6671547504497346e-05, + "loss": 1.4893, "step": 3979 }, { - "epoch": 1.2, - "grad_norm": 16.86216926574707, - "learning_rate": 1.2023654405131804e-05, - "loss": 1.7264, + "epoch": 0.5, + "grad_norm": 22.858173370361328, + "learning_rate": 1.667071078944066e-05, + "loss": 2.2102, "step": 3980 }, { - "epoch": 1.2, - "grad_norm": 19.68351173400879, - "learning_rate": 1.2021649794527415e-05, - "loss": 1.4559, + "epoch": 0.5, + "grad_norm": 10.709026336669922, + "learning_rate": 1.666987407438397e-05, + "loss": 0.931, "step": 3981 }, { - "epoch": 1.2, - "grad_norm": 27.27216339111328, - "learning_rate": 1.2019645183923024e-05, - "loss": 1.6525, + "epoch": 0.5, + "grad_norm": 12.102494239807129, + "learning_rate": 1.6669037359327284e-05, + "loss": 1.8199, "step": 3982 }, { - "epoch": 1.2, - "grad_norm": 48.12042999267578, - "learning_rate": 1.2017640573318634e-05, - "loss": 2.7933, + "epoch": 0.5, + "grad_norm": 29.741167068481445, + "learning_rate": 1.6668200644270594e-05, + "loss": 1.8224, "step": 3983 }, { - "epoch": 1.2, - "grad_norm": 17.5889892578125, - "learning_rate": 1.2015635962714242e-05, - "loss": 1.2232, + "epoch": 0.5, + "grad_norm": 10.0697021484375, + "learning_rate": 1.6667363929213908e-05, + "loss": 2.046, "step": 3984 }, { - "epoch": 1.2, - "grad_norm": 15.102045059204102, - "learning_rate": 1.2013631352109854e-05, - "loss": 2.1175, + "epoch": 0.5, + "grad_norm": 29.015119552612305, + "learning_rate": 1.666652721415722e-05, + "loss": 1.8735, "step": 3985 }, { - "epoch": 1.2, - "grad_norm": 15.093314170837402, - "learning_rate": 1.2011626741505464e-05, - "loss": 2.0942, + "epoch": 0.5, + "grad_norm": 7.788076400756836, + "learning_rate": 1.6665690499100532e-05, + "loss": 2.3206, "step": 3986 }, { - "epoch": 1.2, - "grad_norm": 14.149928092956543, - "learning_rate": 1.2009622130901073e-05, - "loss": 1.4488, + "epoch": 0.5, + "grad_norm": 39.918521881103516, + "learning_rate": 1.6664853784043845e-05, + "loss": 1.9093, "step": 3987 }, { - "epoch": 1.2, - "grad_norm": 23.17877197265625, - "learning_rate": 1.2007617520296684e-05, - "loss": 2.1352, + "epoch": 0.5, + "grad_norm": 14.87916374206543, + "learning_rate": 1.6664017068987156e-05, + "loss": 1.5678, "step": 3988 }, { - "epoch": 1.2, - "grad_norm": 11.108948707580566, - "learning_rate": 1.2005612909692293e-05, - "loss": 1.4723, + "epoch": 0.5, + "grad_norm": 11.87180233001709, + "learning_rate": 1.666318035393047e-05, + "loss": 1.7662, "step": 3989 }, { - "epoch": 1.2, - "grad_norm": 41.934715270996094, - "learning_rate": 1.2003608299087903e-05, - "loss": 1.3572, + "epoch": 0.5, + "grad_norm": 26.39105224609375, + "learning_rate": 1.6662343638873783e-05, + "loss": 1.7568, "step": 3990 }, { - "epoch": 1.2, - "grad_norm": 30.752838134765625, - "learning_rate": 1.2001603688483515e-05, - "loss": 1.7268, + "epoch": 0.5, + "grad_norm": 8.435544967651367, + "learning_rate": 1.6661506923817093e-05, + "loss": 3.2938, "step": 3991 }, { - "epoch": 1.2, - "grad_norm": 19.63275718688965, - "learning_rate": 1.1999599077879123e-05, - "loss": 1.8781, + "epoch": 0.5, + "grad_norm": 13.673137664794922, + "learning_rate": 1.6660670208760407e-05, + "loss": 1.3841, "step": 3992 }, { - "epoch": 1.2, - "grad_norm": 16.867294311523438, - "learning_rate": 1.1997594467274733e-05, - "loss": 1.2249, + "epoch": 0.5, + "grad_norm": 10.718171119689941, + "learning_rate": 1.665983349370372e-05, + "loss": 0.8371, "step": 3993 }, { - "epoch": 1.2, - "grad_norm": 12.371650695800781, - "learning_rate": 1.1995589856670341e-05, - "loss": 2.7085, + "epoch": 0.5, + "grad_norm": 18.63213539123535, + "learning_rate": 1.6658996778647034e-05, + "loss": 1.8301, "step": 3994 }, { - "epoch": 1.2, - "grad_norm": 14.14538860321045, - "learning_rate": 1.1993585246065953e-05, - "loss": 1.7304, + "epoch": 0.5, + "grad_norm": 11.364863395690918, + "learning_rate": 1.6658160063590344e-05, + "loss": 2.5042, "step": 3995 }, { - "epoch": 1.2, - "grad_norm": 28.63239860534668, - "learning_rate": 1.1991580635461562e-05, - "loss": 1.6735, + "epoch": 0.5, + "grad_norm": 77.68419647216797, + "learning_rate": 1.6657323348533658e-05, + "loss": 0.9009, "step": 3996 }, { - "epoch": 1.2, - "grad_norm": 10.542112350463867, - "learning_rate": 1.1989576024857172e-05, - "loss": 2.628, + "epoch": 0.5, + "grad_norm": 17.809261322021484, + "learning_rate": 1.665648663347697e-05, + "loss": 1.5477, "step": 3997 }, { - "epoch": 1.2, - "grad_norm": 15.562253952026367, - "learning_rate": 1.1987571414252784e-05, - "loss": 1.8914, + "epoch": 0.5, + "grad_norm": 34.99085998535156, + "learning_rate": 1.6655649918420282e-05, + "loss": 3.4252, "step": 3998 }, { - "epoch": 1.2, - "grad_norm": 14.955403327941895, - "learning_rate": 1.1985566803648392e-05, - "loss": 1.4507, + "epoch": 0.5, + "grad_norm": 18.85693359375, + "learning_rate": 1.6654813203363596e-05, + "loss": 2.2447, "step": 3999 }, { - "epoch": 1.2, - "grad_norm": 8.927510261535645, - "learning_rate": 1.1983562193044002e-05, - "loss": 1.8247, + "epoch": 0.5, + "grad_norm": 10.359712600708008, + "learning_rate": 1.665397648830691e-05, + "loss": 3.4757, "step": 4000 }, { - "epoch": 1.2, - "grad_norm": 40.698150634765625, - "learning_rate": 1.1981557582439612e-05, - "loss": 2.7692, + "epoch": 0.5, + "eval_loss": 0.14231130480766296, + "eval_runtime": 94.7497, + "eval_samples_per_second": 37.383, + "eval_steps_per_second": 37.383, + "step": 4000 + }, + { + "epoch": 0.5, + "grad_norm": 13.207204818725586, + "learning_rate": 1.6653139773250223e-05, + "loss": 1.625, "step": 4001 }, { - "epoch": 1.2, - "grad_norm": 8.478541374206543, - "learning_rate": 1.1979552971835222e-05, - "loss": 1.0191, + "epoch": 0.5, + "grad_norm": 12.07330322265625, + "learning_rate": 1.6652303058193533e-05, + "loss": 2.6738, "step": 4002 }, { - "epoch": 1.2, - "grad_norm": 29.216922760009766, - "learning_rate": 1.1977548361230832e-05, - "loss": 1.7092, + "epoch": 0.5, + "grad_norm": 16.51776123046875, + "learning_rate": 1.6651466343136847e-05, + "loss": 1.036, "step": 4003 }, { - "epoch": 1.2, - "grad_norm": 11.635241508483887, - "learning_rate": 1.1975543750626442e-05, - "loss": 1.7223, + "epoch": 0.5, + "grad_norm": 19.95757293701172, + "learning_rate": 1.665062962808016e-05, + "loss": 2.9243, "step": 4004 }, { - "epoch": 1.2, - "grad_norm": 23.52641487121582, - "learning_rate": 1.1973539140022052e-05, - "loss": 1.6989, + "epoch": 0.5, + "grad_norm": 71.45176696777344, + "learning_rate": 1.664979291302347e-05, + "loss": 1.9166, "step": 4005 }, { - "epoch": 1.2, - "grad_norm": 18.005016326904297, - "learning_rate": 1.197153452941766e-05, - "loss": 2.5197, + "epoch": 0.5, + "grad_norm": 44.509273529052734, + "learning_rate": 1.6648956197966784e-05, + "loss": 2.8575, "step": 4006 }, { - "epoch": 1.2, - "grad_norm": 59.00607681274414, - "learning_rate": 1.1969529918813273e-05, - "loss": 3.0074, + "epoch": 0.5, + "grad_norm": 17.11343765258789, + "learning_rate": 1.6648119482910098e-05, + "loss": 3.3717, "step": 4007 }, { - "epoch": 1.21, - "grad_norm": 23.461963653564453, - "learning_rate": 1.1967525308208881e-05, - "loss": 2.5943, + "epoch": 0.5, + "grad_norm": 19.97622299194336, + "learning_rate": 1.6647282767853408e-05, + "loss": 1.7877, "step": 4008 }, { - "epoch": 1.21, - "grad_norm": 30.815887451171875, - "learning_rate": 1.1965520697604491e-05, - "loss": 1.9424, + "epoch": 0.5, + "grad_norm": 15.097450256347656, + "learning_rate": 1.6646446052796722e-05, + "loss": 2.227, "step": 4009 }, { - "epoch": 1.21, - "grad_norm": 29.489526748657227, - "learning_rate": 1.1963516087000103e-05, - "loss": 2.0126, + "epoch": 0.5, + "grad_norm": 23.042789459228516, + "learning_rate": 1.6645609337740036e-05, + "loss": 2.6331, "step": 4010 }, { - "epoch": 1.21, - "grad_norm": 7.134322166442871, - "learning_rate": 1.1961511476395711e-05, - "loss": 1.2801, + "epoch": 0.5, + "grad_norm": 15.941010475158691, + "learning_rate": 1.6644772622683346e-05, + "loss": 1.7747, "step": 4011 }, { - "epoch": 1.21, - "grad_norm": 12.04995059967041, - "learning_rate": 1.1959506865791321e-05, - "loss": 1.0723, + "epoch": 0.5, + "grad_norm": 33.62506103515625, + "learning_rate": 1.664393590762666e-05, + "loss": 1.3494, "step": 4012 }, { - "epoch": 1.21, - "grad_norm": 13.849966049194336, - "learning_rate": 1.195750225518693e-05, - "loss": 1.0122, + "epoch": 0.5, + "grad_norm": 19.71503257751465, + "learning_rate": 1.6643099192569973e-05, + "loss": 1.6179, "step": 4013 }, { - "epoch": 1.21, - "grad_norm": 7.143357753753662, - "learning_rate": 1.1955497644582541e-05, - "loss": 1.961, + "epoch": 0.5, + "grad_norm": 16.13291358947754, + "learning_rate": 1.6642262477513283e-05, + "loss": 2.6505, "step": 4014 }, { - "epoch": 1.21, - "grad_norm": 16.170339584350586, - "learning_rate": 1.195349303397815e-05, - "loss": 1.9324, + "epoch": 0.5, + "grad_norm": 16.614009857177734, + "learning_rate": 1.6641425762456597e-05, + "loss": 2.215, "step": 4015 }, { - "epoch": 1.21, - "grad_norm": 22.960071563720703, - "learning_rate": 1.195148842337376e-05, - "loss": 2.5535, + "epoch": 0.5, + "grad_norm": 20.98224449157715, + "learning_rate": 1.6640589047399907e-05, + "loss": 1.7982, "step": 4016 }, { - "epoch": 1.21, - "grad_norm": 15.098960876464844, - "learning_rate": 1.1949483812769372e-05, - "loss": 1.142, + "epoch": 0.5, + "grad_norm": 18.615270614624023, + "learning_rate": 1.663975233234322e-05, + "loss": 4.0383, "step": 4017 }, { - "epoch": 1.21, - "grad_norm": 19.63709831237793, - "learning_rate": 1.194747920216498e-05, - "loss": 2.072, + "epoch": 0.5, + "grad_norm": 21.422555923461914, + "learning_rate": 1.6638915617286535e-05, + "loss": 0.7228, "step": 4018 }, { - "epoch": 1.21, - "grad_norm": 14.952527046203613, - "learning_rate": 1.194547459156059e-05, - "loss": 1.2427, + "epoch": 0.5, + "grad_norm": 18.596290588378906, + "learning_rate": 1.6638078902229845e-05, + "loss": 1.7599, "step": 4019 }, { - "epoch": 1.21, - "grad_norm": 13.70450210571289, - "learning_rate": 1.1943469980956199e-05, - "loss": 1.8315, + "epoch": 0.5, + "grad_norm": 12.224632263183594, + "learning_rate": 1.663724218717316e-05, + "loss": 1.4104, "step": 4020 }, { - "epoch": 1.21, - "grad_norm": 13.074018478393555, - "learning_rate": 1.194146537035181e-05, - "loss": 0.9992, + "epoch": 0.5, + "grad_norm": 13.065860748291016, + "learning_rate": 1.6636405472116472e-05, + "loss": 2.3884, "step": 4021 }, { - "epoch": 1.21, - "grad_norm": 12.277508735656738, - "learning_rate": 1.193946075974742e-05, - "loss": 1.8067, + "epoch": 0.5, + "grad_norm": 22.903993606567383, + "learning_rate": 1.6635568757059786e-05, + "loss": 3.8494, "step": 4022 }, { - "epoch": 1.21, - "grad_norm": 27.505800247192383, - "learning_rate": 1.1937456149143029e-05, - "loss": 2.1141, + "epoch": 0.5, + "grad_norm": 12.7423734664917, + "learning_rate": 1.6634732042003096e-05, + "loss": 1.5295, "step": 4023 }, { - "epoch": 1.21, - "grad_norm": 18.097551345825195, - "learning_rate": 1.193545153853864e-05, - "loss": 1.631, + "epoch": 0.51, + "grad_norm": 16.80319595336914, + "learning_rate": 1.663389532694641e-05, + "loss": 1.9948, "step": 4024 }, { - "epoch": 1.21, - "grad_norm": 15.072179794311523, - "learning_rate": 1.1933446927934249e-05, - "loss": 1.5262, + "epoch": 0.51, + "grad_norm": 8.247112274169922, + "learning_rate": 1.6633058611889723e-05, + "loss": 1.9971, "step": 4025 }, { - "epoch": 1.21, - "grad_norm": 13.110054016113281, - "learning_rate": 1.1931442317329859e-05, - "loss": 1.4835, + "epoch": 0.51, + "grad_norm": 16.039772033691406, + "learning_rate": 1.6632221896833034e-05, + "loss": 1.5694, "step": 4026 }, { - "epoch": 1.21, - "grad_norm": 11.2367525100708, - "learning_rate": 1.192943770672547e-05, - "loss": 1.5453, + "epoch": 0.51, + "grad_norm": 13.967763900756836, + "learning_rate": 1.6631385181776347e-05, + "loss": 1.4187, "step": 4027 }, { - "epoch": 1.21, - "grad_norm": 19.51766014099121, - "learning_rate": 1.192743309612108e-05, - "loss": 1.4438, + "epoch": 0.51, + "grad_norm": 17.23777198791504, + "learning_rate": 1.663054846671966e-05, + "loss": 1.1976, "step": 4028 }, { - "epoch": 1.21, - "grad_norm": 58.76639938354492, - "learning_rate": 1.1925428485516691e-05, - "loss": 2.2727, + "epoch": 0.51, + "grad_norm": 25.99625587463379, + "learning_rate": 1.6629711751662975e-05, + "loss": 4.8848, "step": 4029 }, { - "epoch": 1.21, - "grad_norm": 16.14882469177246, - "learning_rate": 1.19234238749123e-05, - "loss": 1.4155, + "epoch": 0.51, + "grad_norm": 17.495437622070312, + "learning_rate": 1.6628875036606285e-05, + "loss": 3.902, "step": 4030 }, { - "epoch": 1.21, - "grad_norm": 9.466064453125, - "learning_rate": 1.192141926430791e-05, - "loss": 1.3346, + "epoch": 0.51, + "grad_norm": 26.159387588500977, + "learning_rate": 1.66280383215496e-05, + "loss": 3.2425, "step": 4031 }, { - "epoch": 1.21, - "grad_norm": 16.83454704284668, - "learning_rate": 1.1919414653703518e-05, - "loss": 1.3689, + "epoch": 0.51, + "grad_norm": 32.6840934753418, + "learning_rate": 1.6627201606492912e-05, + "loss": 1.4698, "step": 4032 }, { - "epoch": 1.21, - "grad_norm": 11.721967697143555, - "learning_rate": 1.191741004309913e-05, - "loss": 1.5468, + "epoch": 0.51, + "grad_norm": 12.527408599853516, + "learning_rate": 1.6626364891436222e-05, + "loss": 1.9287, "step": 4033 }, { - "epoch": 1.21, - "grad_norm": 9.986775398254395, - "learning_rate": 1.191540543249474e-05, - "loss": 2.5393, + "epoch": 0.51, + "grad_norm": 25.900415420532227, + "learning_rate": 1.6625528176379536e-05, + "loss": 2.039, "step": 4034 }, { - "epoch": 1.21, - "grad_norm": 11.999430656433105, - "learning_rate": 1.1913400821890348e-05, - "loss": 1.7203, + "epoch": 0.51, + "grad_norm": 10.852729797363281, + "learning_rate": 1.662469146132285e-05, + "loss": 1.3046, "step": 4035 }, { - "epoch": 1.21, - "grad_norm": 15.086469650268555, - "learning_rate": 1.191139621128596e-05, - "loss": 1.8978, + "epoch": 0.51, + "grad_norm": 14.895544052124023, + "learning_rate": 1.662385474626616e-05, + "loss": 1.8385, "step": 4036 }, { - "epoch": 1.21, - "grad_norm": 20.604961395263672, - "learning_rate": 1.1909391600681568e-05, - "loss": 1.0557, + "epoch": 0.51, + "grad_norm": 16.182043075561523, + "learning_rate": 1.6623018031209474e-05, + "loss": 2.2905, "step": 4037 }, { - "epoch": 1.21, - "grad_norm": 11.977767944335938, - "learning_rate": 1.1907386990077178e-05, - "loss": 1.9318, + "epoch": 0.51, + "grad_norm": 11.084127426147461, + "learning_rate": 1.6622181316152787e-05, + "loss": 4.2573, "step": 4038 }, { - "epoch": 1.21, - "grad_norm": 17.97518539428711, - "learning_rate": 1.1905382379472787e-05, - "loss": 0.948, + "epoch": 0.51, + "grad_norm": 18.71702766418457, + "learning_rate": 1.6621344601096097e-05, + "loss": 2.1855, "step": 4039 }, { - "epoch": 1.21, - "grad_norm": 9.713574409484863, - "learning_rate": 1.1903377768868399e-05, - "loss": 1.0446, + "epoch": 0.51, + "grad_norm": 10.081755638122559, + "learning_rate": 1.662050788603941e-05, + "loss": 1.9548, "step": 4040 }, { - "epoch": 1.21, - "grad_norm": 18.080841064453125, - "learning_rate": 1.1901373158264009e-05, - "loss": 1.703, + "epoch": 0.51, + "grad_norm": 17.882465362548828, + "learning_rate": 1.661967117098272e-05, + "loss": 3.3668, "step": 4041 }, { - "epoch": 1.22, - "grad_norm": 35.3074951171875, - "learning_rate": 1.1899368547659617e-05, - "loss": 2.8692, + "epoch": 0.51, + "grad_norm": 28.202253341674805, + "learning_rate": 1.6618834455926035e-05, + "loss": 1.74, "step": 4042 }, { - "epoch": 1.22, - "grad_norm": 12.872424125671387, - "learning_rate": 1.1897363937055229e-05, - "loss": 1.2558, + "epoch": 0.51, + "grad_norm": 13.790504455566406, + "learning_rate": 1.661799774086935e-05, + "loss": 2.8872, "step": 4043 }, { - "epoch": 1.22, - "grad_norm": 20.24639129638672, - "learning_rate": 1.1895359326450837e-05, - "loss": 2.013, + "epoch": 0.51, + "grad_norm": 19.462251663208008, + "learning_rate": 1.661716102581266e-05, + "loss": 0.7643, "step": 4044 }, { - "epoch": 1.22, - "grad_norm": 19.4423885345459, - "learning_rate": 1.1893354715846447e-05, - "loss": 1.2018, + "epoch": 0.51, + "grad_norm": 13.855001449584961, + "learning_rate": 1.6616324310755973e-05, + "loss": 1.5681, "step": 4045 }, { - "epoch": 1.22, - "grad_norm": 28.46637725830078, - "learning_rate": 1.1891350105242059e-05, - "loss": 2.3941, + "epoch": 0.51, + "grad_norm": 8.267029762268066, + "learning_rate": 1.6615487595699286e-05, + "loss": 0.942, "step": 4046 }, { - "epoch": 1.22, - "grad_norm": 15.011366844177246, - "learning_rate": 1.1889345494637667e-05, - "loss": 1.5718, + "epoch": 0.51, + "grad_norm": 14.253353118896484, + "learning_rate": 1.6614650880642597e-05, + "loss": 3.2943, "step": 4047 }, { - "epoch": 1.22, - "grad_norm": 30.27291488647461, - "learning_rate": 1.1887340884033278e-05, - "loss": 2.3006, + "epoch": 0.51, + "grad_norm": 4.903005599975586, + "learning_rate": 1.661381416558591e-05, + "loss": 1.309, "step": 4048 }, { - "epoch": 1.22, - "grad_norm": 18.10991859436035, - "learning_rate": 1.1885336273428888e-05, - "loss": 1.7672, + "epoch": 0.51, + "grad_norm": 6.407052040100098, + "learning_rate": 1.6612977450529224e-05, + "loss": 0.5181, "step": 4049 }, { - "epoch": 1.22, - "grad_norm": 34.54472732543945, - "learning_rate": 1.1883331662824498e-05, - "loss": 2.484, + "epoch": 0.51, + "grad_norm": 17.52642250061035, + "learning_rate": 1.6612140735472537e-05, + "loss": 1.4306, "step": 4050 }, { - "epoch": 1.22, - "grad_norm": 12.643610954284668, - "learning_rate": 1.1881327052220106e-05, - "loss": 2.0557, + "epoch": 0.51, + "grad_norm": 14.540874481201172, + "learning_rate": 1.6611304020415848e-05, + "loss": 1.8284, "step": 4051 }, { - "epoch": 1.22, - "grad_norm": 12.419498443603516, - "learning_rate": 1.1879322441615718e-05, - "loss": 1.8953, + "epoch": 0.51, + "grad_norm": 8.986979484558105, + "learning_rate": 1.661046730535916e-05, + "loss": 0.7586, "step": 4052 }, { - "epoch": 1.22, - "grad_norm": 9.909022331237793, - "learning_rate": 1.1877317831011328e-05, - "loss": 0.9697, + "epoch": 0.51, + "grad_norm": 8.945680618286133, + "learning_rate": 1.6609630590302475e-05, + "loss": 1.2829, "step": 4053 }, { - "epoch": 1.22, - "grad_norm": 213.60519409179688, - "learning_rate": 1.1875313220406936e-05, - "loss": 2.7859, + "epoch": 0.51, + "grad_norm": 19.894546508789062, + "learning_rate": 1.6608793875245785e-05, + "loss": 2.1166, "step": 4054 }, { - "epoch": 1.22, - "grad_norm": 43.98239517211914, - "learning_rate": 1.1873308609802548e-05, - "loss": 2.0812, + "epoch": 0.51, + "grad_norm": 13.819477081298828, + "learning_rate": 1.66079571601891e-05, + "loss": 1.7849, "step": 4055 }, { - "epoch": 1.22, - "grad_norm": 22.185237884521484, - "learning_rate": 1.1871303999198157e-05, - "loss": 1.778, + "epoch": 0.51, + "grad_norm": 13.106219291687012, + "learning_rate": 1.6607120445132413e-05, + "loss": 1.3608, "step": 4056 }, { - "epoch": 1.22, - "grad_norm": 20.283220291137695, - "learning_rate": 1.1869299388593767e-05, - "loss": 1.6555, + "epoch": 0.51, + "grad_norm": 18.80193328857422, + "learning_rate": 1.6606283730075726e-05, + "loss": 2.3694, "step": 4057 }, { - "epoch": 1.22, - "grad_norm": 52.456600189208984, - "learning_rate": 1.1867294777989375e-05, - "loss": 2.7659, + "epoch": 0.51, + "grad_norm": 11.726581573486328, + "learning_rate": 1.6605447015019036e-05, + "loss": 2.1741, "step": 4058 }, { - "epoch": 1.22, - "grad_norm": 27.247201919555664, - "learning_rate": 1.1865290167384987e-05, - "loss": 1.9769, + "epoch": 0.51, + "grad_norm": 8.649958610534668, + "learning_rate": 1.660461029996235e-05, + "loss": 0.8855, "step": 4059 }, { - "epoch": 1.22, - "grad_norm": 19.406150817871094, - "learning_rate": 1.1863285556780597e-05, - "loss": 2.0779, + "epoch": 0.51, + "grad_norm": 13.534453392028809, + "learning_rate": 1.6603773584905664e-05, + "loss": 2.5613, "step": 4060 }, { - "epoch": 1.22, - "grad_norm": 7.287651062011719, - "learning_rate": 1.1861280946176205e-05, - "loss": 1.3887, + "epoch": 0.51, + "grad_norm": 21.837444305419922, + "learning_rate": 1.6602936869848974e-05, + "loss": 3.0084, "step": 4061 }, { - "epoch": 1.22, - "grad_norm": 12.292468070983887, - "learning_rate": 1.1859276335571817e-05, - "loss": 1.7488, + "epoch": 0.51, + "grad_norm": 9.196706771850586, + "learning_rate": 1.6602100154792288e-05, + "loss": 2.6228, "step": 4062 }, { - "epoch": 1.22, - "grad_norm": 9.17116928100586, - "learning_rate": 1.1857271724967425e-05, - "loss": 1.2331, + "epoch": 0.51, + "grad_norm": 13.78110408782959, + "learning_rate": 1.66012634397356e-05, + "loss": 1.6917, "step": 4063 }, { - "epoch": 1.22, - "grad_norm": 12.708617210388184, - "learning_rate": 1.1855267114363036e-05, - "loss": 1.5956, + "epoch": 0.51, + "grad_norm": 6.981208801269531, + "learning_rate": 1.660042672467891e-05, + "loss": 0.9407, "step": 4064 }, { - "epoch": 1.22, - "grad_norm": 19.217905044555664, - "learning_rate": 1.1853262503758647e-05, - "loss": 3.3333, + "epoch": 0.51, + "grad_norm": 14.598764419555664, + "learning_rate": 1.6599590009622225e-05, + "loss": 0.7875, "step": 4065 }, { - "epoch": 1.22, - "grad_norm": 9.486623764038086, - "learning_rate": 1.1851257893154256e-05, - "loss": 1.1654, + "epoch": 0.51, + "grad_norm": 22.41578483581543, + "learning_rate": 1.6598753294565536e-05, + "loss": 2.7584, "step": 4066 }, { - "epoch": 1.22, - "grad_norm": 15.519346237182617, - "learning_rate": 1.1849253282549866e-05, - "loss": 1.3905, + "epoch": 0.51, + "grad_norm": 15.423556327819824, + "learning_rate": 1.659791657950885e-05, + "loss": 1.6977, "step": 4067 }, { - "epoch": 1.22, - "grad_norm": 27.3670597076416, - "learning_rate": 1.1847248671945474e-05, - "loss": 1.8644, + "epoch": 0.51, + "grad_norm": 26.116514205932617, + "learning_rate": 1.6597079864452163e-05, + "loss": 3.1091, "step": 4068 }, { - "epoch": 1.22, - "grad_norm": 12.652219772338867, - "learning_rate": 1.1845244061341086e-05, - "loss": 1.7432, + "epoch": 0.51, + "grad_norm": 12.65701675415039, + "learning_rate": 1.6596243149395473e-05, + "loss": 0.8299, "step": 4069 }, { - "epoch": 1.22, - "grad_norm": 7.705514430999756, - "learning_rate": 1.1843239450736694e-05, - "loss": 0.8268, + "epoch": 0.51, + "grad_norm": 6.301980018615723, + "learning_rate": 1.6595406434338787e-05, + "loss": 1.3544, "step": 4070 }, { - "epoch": 1.22, - "grad_norm": 13.351577758789062, - "learning_rate": 1.1841234840132304e-05, - "loss": 2.1508, + "epoch": 0.51, + "grad_norm": 11.021586418151855, + "learning_rate": 1.65945697192821e-05, + "loss": 0.9748, "step": 4071 }, { - "epoch": 1.22, - "grad_norm": 44.65596008300781, - "learning_rate": 1.1839230229527916e-05, - "loss": 2.0205, + "epoch": 0.51, + "grad_norm": 15.616560935974121, + "learning_rate": 1.659373300422541e-05, + "loss": 3.0321, "step": 4072 }, { - "epoch": 1.22, - "grad_norm": 24.321109771728516, - "learning_rate": 1.1837225618923525e-05, - "loss": 1.7858, + "epoch": 0.51, + "grad_norm": 32.11990737915039, + "learning_rate": 1.6592896289168724e-05, + "loss": 2.4086, "step": 4073 }, { - "epoch": 1.22, - "grad_norm": 48.96630096435547, - "learning_rate": 1.1835221008319135e-05, - "loss": 1.9235, + "epoch": 0.51, + "grad_norm": 16.586008071899414, + "learning_rate": 1.6592059574112038e-05, + "loss": 3.0542, "step": 4074 }, { - "epoch": 1.23, - "grad_norm": 16.92742156982422, - "learning_rate": 1.1833216397714745e-05, - "loss": 1.7837, + "epoch": 0.51, + "grad_norm": 11.527971267700195, + "learning_rate": 1.6591222859055348e-05, + "loss": 2.2782, "step": 4075 }, { - "epoch": 1.23, - "grad_norm": 42.10576629638672, - "learning_rate": 1.1831211787110355e-05, - "loss": 3.2812, + "epoch": 0.51, + "grad_norm": 22.980730056762695, + "learning_rate": 1.6590386143998662e-05, + "loss": 0.79, "step": 4076 }, { - "epoch": 1.23, - "grad_norm": 13.819901466369629, - "learning_rate": 1.1829207176505965e-05, - "loss": 1.4294, + "epoch": 0.51, + "grad_norm": 13.224310874938965, + "learning_rate": 1.6589549428941975e-05, + "loss": 1.4557, "step": 4077 }, { - "epoch": 1.23, - "grad_norm": 11.949419021606445, - "learning_rate": 1.1827202565901575e-05, - "loss": 1.3175, + "epoch": 0.51, + "grad_norm": 21.20461082458496, + "learning_rate": 1.658871271388529e-05, + "loss": 1.6549, "step": 4078 }, { - "epoch": 1.23, - "grad_norm": 23.273836135864258, - "learning_rate": 1.1825197955297185e-05, - "loss": 2.5785, + "epoch": 0.51, + "grad_norm": 21.756563186645508, + "learning_rate": 1.65878759988286e-05, + "loss": 2.6471, "step": 4079 }, { - "epoch": 1.23, - "grad_norm": 15.00057601928711, - "learning_rate": 1.1823193344692793e-05, - "loss": 1.4673, - "step": 4080 - }, - { - "epoch": 1.23, - "eval_loss": 0.22429148852825165, - "eval_runtime": 43.6464, - "eval_samples_per_second": 33.886, - "eval_steps_per_second": 33.886, + "epoch": 0.51, + "grad_norm": 26.183300018310547, + "learning_rate": 1.6587039283771913e-05, + "loss": 0.6442, "step": 4080 }, { - "epoch": 1.23, - "grad_norm": 16.4818172454834, - "learning_rate": 1.1821188734088405e-05, - "loss": 1.8456, + "epoch": 0.51, + "grad_norm": 10.016181945800781, + "learning_rate": 1.6586202568715227e-05, + "loss": 1.7346, "step": 4081 }, { - "epoch": 1.23, - "grad_norm": 9.311663627624512, - "learning_rate": 1.1819184123484014e-05, - "loss": 1.5563, + "epoch": 0.51, + "grad_norm": 87.37271881103516, + "learning_rate": 1.6585365853658537e-05, + "loss": 0.7923, "step": 4082 }, { - "epoch": 1.23, - "grad_norm": 19.122093200683594, - "learning_rate": 1.1817179512879624e-05, - "loss": 2.2633, + "epoch": 0.51, + "grad_norm": 65.72235107421875, + "learning_rate": 1.658452913860185e-05, + "loss": 1.9297, "step": 4083 }, { - "epoch": 1.23, - "grad_norm": 33.5509033203125, - "learning_rate": 1.1815174902275236e-05, - "loss": 1.9949, + "epoch": 0.51, + "grad_norm": 9.46326732635498, + "learning_rate": 1.6583692423545164e-05, + "loss": 0.6906, "step": 4084 }, { - "epoch": 1.23, - "grad_norm": 14.471306800842285, - "learning_rate": 1.1813170291670844e-05, - "loss": 1.8224, + "epoch": 0.51, + "grad_norm": 8.229296684265137, + "learning_rate": 1.6582855708488478e-05, + "loss": 0.719, "step": 4085 }, { - "epoch": 1.23, - "grad_norm": 11.821321487426758, - "learning_rate": 1.1811165681066454e-05, - "loss": 1.8433, + "epoch": 0.51, + "grad_norm": 9.068437576293945, + "learning_rate": 1.6582018993431788e-05, + "loss": 1.4107, "step": 4086 }, { - "epoch": 1.23, - "grad_norm": 26.970781326293945, - "learning_rate": 1.1809161070462062e-05, - "loss": 2.336, + "epoch": 0.51, + "grad_norm": 16.31252670288086, + "learning_rate": 1.6581182278375102e-05, + "loss": 3.5396, "step": 4087 }, { - "epoch": 1.23, - "grad_norm": 18.373268127441406, - "learning_rate": 1.1807156459857674e-05, - "loss": 1.4686, + "epoch": 0.51, + "grad_norm": 7.02438497543335, + "learning_rate": 1.6580345563318415e-05, + "loss": 1.0429, "step": 4088 }, { - "epoch": 1.23, - "grad_norm": 17.756778717041016, - "learning_rate": 1.1805151849253284e-05, - "loss": 1.4351, + "epoch": 0.51, + "grad_norm": 29.739078521728516, + "learning_rate": 1.6579508848261726e-05, + "loss": 2.1456, "step": 4089 }, { - "epoch": 1.23, - "grad_norm": 21.345794677734375, - "learning_rate": 1.1803147238648893e-05, - "loss": 1.7058, + "epoch": 0.51, + "grad_norm": 54.58208084106445, + "learning_rate": 1.657867213320504e-05, + "loss": 2.1292, "step": 4090 }, { - "epoch": 1.23, - "grad_norm": 99.40449523925781, - "learning_rate": 1.1801142628044504e-05, - "loss": 2.2699, + "epoch": 0.51, + "grad_norm": 20.13058090209961, + "learning_rate": 1.6577835418148353e-05, + "loss": 3.2348, "step": 4091 }, { - "epoch": 1.23, - "grad_norm": 29.920364379882812, - "learning_rate": 1.1799138017440113e-05, - "loss": 1.7795, + "epoch": 0.51, + "grad_norm": 14.049749374389648, + "learning_rate": 1.6576998703091663e-05, + "loss": 3.1451, "step": 4092 }, { - "epoch": 1.23, - "grad_norm": 16.23194122314453, - "learning_rate": 1.1797133406835723e-05, - "loss": 1.8951, + "epoch": 0.51, + "grad_norm": 21.84672737121582, + "learning_rate": 1.6576161988034977e-05, + "loss": 3.6311, "step": 4093 }, { - "epoch": 1.23, - "grad_norm": 16.518890380859375, - "learning_rate": 1.1795128796231331e-05, - "loss": 2.0618, + "epoch": 0.51, + "grad_norm": 17.8717041015625, + "learning_rate": 1.6575325272978287e-05, + "loss": 1.4837, "step": 4094 }, { - "epoch": 1.23, - "grad_norm": 15.834227561950684, - "learning_rate": 1.1793124185626943e-05, - "loss": 1.3648, + "epoch": 0.51, + "grad_norm": 35.71775817871094, + "learning_rate": 1.65744885579216e-05, + "loss": 1.683, "step": 4095 }, { - "epoch": 1.23, - "grad_norm": 21.73334503173828, - "learning_rate": 1.1791119575022553e-05, - "loss": 1.6835, + "epoch": 0.51, + "grad_norm": 11.681072235107422, + "learning_rate": 1.6573651842864914e-05, + "loss": 1.1817, "step": 4096 }, { - "epoch": 1.23, - "grad_norm": 16.20547103881836, - "learning_rate": 1.1789114964418163e-05, - "loss": 2.8748, + "epoch": 0.51, + "grad_norm": 14.100946426391602, + "learning_rate": 1.6572815127808225e-05, + "loss": 2.141, "step": 4097 }, { - "epoch": 1.23, - "grad_norm": 13.940154075622559, - "learning_rate": 1.1787110353813773e-05, - "loss": 1.7865, + "epoch": 0.51, + "grad_norm": 10.091500282287598, + "learning_rate": 1.657197841275154e-05, + "loss": 0.9144, "step": 4098 }, { - "epoch": 1.23, - "grad_norm": 10.902242660522461, - "learning_rate": 1.1785105743209382e-05, - "loss": 1.4294, + "epoch": 0.51, + "grad_norm": 7.164851665496826, + "learning_rate": 1.6571141697694852e-05, + "loss": 0.4938, "step": 4099 }, { - "epoch": 1.23, - "grad_norm": 20.06895637512207, - "learning_rate": 1.1783101132604993e-05, - "loss": 2.0791, + "epoch": 0.51, + "grad_norm": 30.006725311279297, + "learning_rate": 1.6570304982638162e-05, + "loss": 3.7963, "step": 4100 }, { - "epoch": 1.23, - "grad_norm": 21.698528289794922, - "learning_rate": 1.1781096522000602e-05, - "loss": 1.8301, + "epoch": 0.51, + "grad_norm": 12.350128173828125, + "learning_rate": 1.6569468267581476e-05, + "loss": 2.2554, "step": 4101 }, { - "epoch": 1.23, - "grad_norm": 50.30092239379883, - "learning_rate": 1.1779091911396212e-05, - "loss": 2.0178, + "epoch": 0.51, + "grad_norm": 13.280160903930664, + "learning_rate": 1.656863155252479e-05, + "loss": 2.8461, "step": 4102 }, { - "epoch": 1.23, - "grad_norm": 29.254417419433594, - "learning_rate": 1.1777087300791824e-05, - "loss": 1.7604, + "epoch": 0.51, + "grad_norm": 36.530006408691406, + "learning_rate": 1.65677948374681e-05, + "loss": 3.6085, "step": 4103 }, { - "epoch": 1.23, - "grad_norm": 13.801785469055176, - "learning_rate": 1.1775082690187432e-05, - "loss": 1.9442, + "epoch": 0.52, + "grad_norm": 14.720293045043945, + "learning_rate": 1.6566958122411414e-05, + "loss": 1.2472, "step": 4104 }, { - "epoch": 1.23, - "grad_norm": 14.859130859375, - "learning_rate": 1.1773078079583042e-05, - "loss": 2.0129, + "epoch": 0.52, + "grad_norm": 11.89719009399414, + "learning_rate": 1.6566121407354727e-05, + "loss": 1.7108, "step": 4105 }, { - "epoch": 1.23, - "grad_norm": 14.485882759094238, - "learning_rate": 1.177107346897865e-05, - "loss": 1.7607, + "epoch": 0.52, + "grad_norm": 15.50600528717041, + "learning_rate": 1.6565284692298037e-05, + "loss": 1.8948, "step": 4106 }, { - "epoch": 1.23, - "grad_norm": 16.90434455871582, - "learning_rate": 1.1769068858374262e-05, - "loss": 1.5387, + "epoch": 0.52, + "grad_norm": 19.96320152282715, + "learning_rate": 1.656444797724135e-05, + "loss": 2.2534, "step": 4107 }, { - "epoch": 1.24, - "grad_norm": 13.56885051727295, - "learning_rate": 1.1767064247769872e-05, - "loss": 1.2367, + "epoch": 0.52, + "grad_norm": 11.358124732971191, + "learning_rate": 1.6563611262184665e-05, + "loss": 1.7402, "step": 4108 }, { - "epoch": 1.24, - "grad_norm": 14.099568367004395, - "learning_rate": 1.176505963716548e-05, - "loss": 0.9839, + "epoch": 0.52, + "grad_norm": 11.969573974609375, + "learning_rate": 1.656277454712798e-05, + "loss": 1.468, "step": 4109 }, { - "epoch": 1.24, - "grad_norm": 22.063432693481445, - "learning_rate": 1.1763055026561093e-05, - "loss": 1.3803, + "epoch": 0.52, + "grad_norm": 15.62781810760498, + "learning_rate": 1.656193783207129e-05, + "loss": 1.5013, "step": 4110 }, { - "epoch": 1.24, - "grad_norm": 16.2562255859375, - "learning_rate": 1.1761050415956701e-05, - "loss": 1.5013, + "epoch": 0.52, + "grad_norm": 17.239166259765625, + "learning_rate": 1.6561101117014602e-05, + "loss": 3.8252, "step": 4111 }, { - "epoch": 1.24, - "grad_norm": 26.141794204711914, - "learning_rate": 1.1759045805352311e-05, - "loss": 2.49, + "epoch": 0.52, + "grad_norm": 7.174729824066162, + "learning_rate": 1.6560264401957916e-05, + "loss": 0.6201, "step": 4112 }, { - "epoch": 1.24, - "grad_norm": 16.62359619140625, - "learning_rate": 1.175704119474792e-05, - "loss": 0.9543, + "epoch": 0.52, + "grad_norm": 23.233652114868164, + "learning_rate": 1.6559427686901226e-05, + "loss": 1.8877, "step": 4113 }, { - "epoch": 1.24, - "grad_norm": 21.430021286010742, - "learning_rate": 1.1755036584143531e-05, - "loss": 1.2435, + "epoch": 0.52, + "grad_norm": 7.3985066413879395, + "learning_rate": 1.655859097184454e-05, + "loss": 2.5075, "step": 4114 }, { - "epoch": 1.24, - "grad_norm": 15.07040786743164, - "learning_rate": 1.1753031973539141e-05, - "loss": 0.9931, + "epoch": 0.52, + "grad_norm": 31.08193588256836, + "learning_rate": 1.6557754256787853e-05, + "loss": 3.7956, "step": 4115 }, { - "epoch": 1.24, - "grad_norm": 9.915048599243164, - "learning_rate": 1.175102736293475e-05, - "loss": 1.498, + "epoch": 0.52, + "grad_norm": 5.321239471435547, + "learning_rate": 1.6556917541731167e-05, + "loss": 0.406, "step": 4116 }, { - "epoch": 1.24, - "grad_norm": 16.46501922607422, - "learning_rate": 1.1749022752330362e-05, - "loss": 2.0349, + "epoch": 0.52, + "grad_norm": 14.6619234085083, + "learning_rate": 1.6556080826674477e-05, + "loss": 1.5106, "step": 4117 }, { - "epoch": 1.24, - "grad_norm": 16.73762321472168, - "learning_rate": 1.174701814172597e-05, - "loss": 1.9466, + "epoch": 0.52, + "grad_norm": 9.001350402832031, + "learning_rate": 1.655524411161779e-05, + "loss": 1.8031, "step": 4118 }, { - "epoch": 1.24, - "grad_norm": 19.883028030395508, - "learning_rate": 1.174501353112158e-05, - "loss": 1.8937, + "epoch": 0.52, + "grad_norm": 11.533523559570312, + "learning_rate": 1.65544073965611e-05, + "loss": 0.8387, "step": 4119 }, { - "epoch": 1.24, - "grad_norm": 47.894859313964844, - "learning_rate": 1.1743008920517192e-05, - "loss": 1.9303, + "epoch": 0.52, + "grad_norm": 8.868361473083496, + "learning_rate": 1.6553570681504415e-05, + "loss": 0.2606, "step": 4120 }, { - "epoch": 1.24, - "grad_norm": 14.77435302734375, - "learning_rate": 1.17410043099128e-05, - "loss": 1.8005, + "epoch": 0.52, + "grad_norm": 34.47975158691406, + "learning_rate": 1.655273396644773e-05, + "loss": 2.8862, "step": 4121 }, { - "epoch": 1.24, - "grad_norm": 9.72576904296875, - "learning_rate": 1.173899969930841e-05, - "loss": 1.5267, + "epoch": 0.52, + "grad_norm": 20.224817276000977, + "learning_rate": 1.655189725139104e-05, + "loss": 2.4562, "step": 4122 }, { - "epoch": 1.24, - "grad_norm": 27.231468200683594, - "learning_rate": 1.173699508870402e-05, - "loss": 1.5438, + "epoch": 0.52, + "grad_norm": 12.108320236206055, + "learning_rate": 1.6551060536334353e-05, + "loss": 1.7544, "step": 4123 }, { - "epoch": 1.24, - "grad_norm": 14.572410583496094, - "learning_rate": 1.173499047809963e-05, - "loss": 1.7702, + "epoch": 0.52, + "grad_norm": 12.343804359436035, + "learning_rate": 1.6550223821277666e-05, + "loss": 2.9308, "step": 4124 }, { - "epoch": 1.24, - "grad_norm": 14.047436714172363, - "learning_rate": 1.1732985867495239e-05, - "loss": 1.8277, + "epoch": 0.52, + "grad_norm": 24.018522262573242, + "learning_rate": 1.6549387106220976e-05, + "loss": 3.1213, "step": 4125 }, { - "epoch": 1.24, - "grad_norm": 20.151954650878906, - "learning_rate": 1.173098125689085e-05, - "loss": 1.7252, + "epoch": 0.52, + "grad_norm": 42.93147659301758, + "learning_rate": 1.654855039116429e-05, + "loss": 1.9207, "step": 4126 }, { - "epoch": 1.24, - "grad_norm": 13.929177284240723, - "learning_rate": 1.172897664628646e-05, - "loss": 2.2859, + "epoch": 0.52, + "grad_norm": 12.159256935119629, + "learning_rate": 1.65477136761076e-05, + "loss": 0.8922, "step": 4127 }, { - "epoch": 1.24, - "grad_norm": 16.374547958374023, - "learning_rate": 1.1726972035682069e-05, - "loss": 1.4863, + "epoch": 0.52, + "grad_norm": 12.611270904541016, + "learning_rate": 1.6546876961050914e-05, + "loss": 1.4998, "step": 4128 }, { - "epoch": 1.24, - "grad_norm": 20.026716232299805, - "learning_rate": 1.172496742507768e-05, - "loss": 1.4962, + "epoch": 0.52, + "grad_norm": 11.294777870178223, + "learning_rate": 1.6546040245994228e-05, + "loss": 0.698, "step": 4129 }, { - "epoch": 1.24, - "grad_norm": 14.383737564086914, - "learning_rate": 1.172296281447329e-05, - "loss": 1.8605, + "epoch": 0.52, + "grad_norm": 15.277084350585938, + "learning_rate": 1.654520353093754e-05, + "loss": 2.7802, "step": 4130 }, { - "epoch": 1.24, - "grad_norm": 8.625479698181152, - "learning_rate": 1.17209582038689e-05, - "loss": 1.3414, + "epoch": 0.52, + "grad_norm": 10.900196075439453, + "learning_rate": 1.654436681588085e-05, + "loss": 0.6085, "step": 4131 }, { - "epoch": 1.24, - "grad_norm": 9.765339851379395, - "learning_rate": 1.1718953593264511e-05, - "loss": 2.2172, + "epoch": 0.52, + "grad_norm": 18.502803802490234, + "learning_rate": 1.6543530100824165e-05, + "loss": 1.4703, "step": 4132 }, { - "epoch": 1.24, - "grad_norm": 17.315092086791992, - "learning_rate": 1.171694898266012e-05, - "loss": 1.8639, + "epoch": 0.52, + "grad_norm": 34.35626220703125, + "learning_rate": 1.654269338576748e-05, + "loss": 2.4679, "step": 4133 }, { - "epoch": 1.24, - "grad_norm": 28.305118560791016, - "learning_rate": 1.171494437205573e-05, - "loss": 1.2452, + "epoch": 0.52, + "grad_norm": 21.993507385253906, + "learning_rate": 1.654185667071079e-05, + "loss": 3.4082, "step": 4134 }, { - "epoch": 1.24, - "grad_norm": 23.19142723083496, - "learning_rate": 1.1712939761451338e-05, - "loss": 1.5799, + "epoch": 0.52, + "grad_norm": 23.336885452270508, + "learning_rate": 1.6541019955654103e-05, + "loss": 3.8204, "step": 4135 }, { - "epoch": 1.24, - "grad_norm": 38.168418884277344, - "learning_rate": 1.171093515084695e-05, - "loss": 2.4729, + "epoch": 0.52, + "grad_norm": 14.885393142700195, + "learning_rate": 1.6540183240597416e-05, + "loss": 1.2218, "step": 4136 }, { - "epoch": 1.24, - "grad_norm": 12.799455642700195, - "learning_rate": 1.1708930540242558e-05, - "loss": 0.9976, + "epoch": 0.52, + "grad_norm": 12.772022247314453, + "learning_rate": 1.653934652554073e-05, + "loss": 1.872, "step": 4137 }, { - "epoch": 1.24, - "grad_norm": 11.151683807373047, - "learning_rate": 1.1706925929638168e-05, - "loss": 2.1855, + "epoch": 0.52, + "grad_norm": 7.038129806518555, + "learning_rate": 1.653850981048404e-05, + "loss": 1.8513, "step": 4138 }, { - "epoch": 1.24, - "grad_norm": 55.58821487426758, - "learning_rate": 1.170492131903378e-05, - "loss": 1.4985, + "epoch": 0.52, + "grad_norm": 4.405773639678955, + "learning_rate": 1.6537673095427354e-05, + "loss": 1.6504, "step": 4139 }, { - "epoch": 1.24, - "grad_norm": 12.804335594177246, - "learning_rate": 1.1702916708429388e-05, - "loss": 2.2843, + "epoch": 0.52, + "grad_norm": 12.794690132141113, + "learning_rate": 1.6536836380370668e-05, + "loss": 1.3373, "step": 4140 }, { - "epoch": 1.25, - "grad_norm": 18.690269470214844, - "learning_rate": 1.1700912097824998e-05, - "loss": 1.8465, + "epoch": 0.52, + "grad_norm": 69.78988647460938, + "learning_rate": 1.6535999665313978e-05, + "loss": 1.8227, "step": 4141 }, { - "epoch": 1.25, - "grad_norm": 13.506994247436523, - "learning_rate": 1.1698907487220607e-05, - "loss": 1.2933, + "epoch": 0.52, + "grad_norm": 13.622889518737793, + "learning_rate": 1.653516295025729e-05, + "loss": 1.7115, "step": 4142 }, { - "epoch": 1.25, - "grad_norm": 12.685111999511719, - "learning_rate": 1.1696902876616219e-05, - "loss": 1.1366, + "epoch": 0.52, + "grad_norm": 10.512645721435547, + "learning_rate": 1.6534326235200605e-05, + "loss": 0.8185, "step": 4143 }, { - "epoch": 1.25, - "grad_norm": 51.746055603027344, - "learning_rate": 1.1694898266011827e-05, - "loss": 2.0853, + "epoch": 0.52, + "grad_norm": 9.174022674560547, + "learning_rate": 1.653348952014392e-05, + "loss": 0.811, "step": 4144 }, { - "epoch": 1.25, - "grad_norm": 16.30849838256836, - "learning_rate": 1.1692893655407437e-05, - "loss": 1.8595, + "epoch": 0.52, + "grad_norm": 14.15351676940918, + "learning_rate": 1.653265280508723e-05, + "loss": 2.5374, "step": 4145 }, { - "epoch": 1.25, - "grad_norm": 24.40019989013672, - "learning_rate": 1.1690889044803049e-05, - "loss": 2.4656, + "epoch": 0.52, + "grad_norm": 14.107945442199707, + "learning_rate": 1.6531816090030543e-05, + "loss": 1.8294, "step": 4146 }, { - "epoch": 1.25, - "grad_norm": 30.043943405151367, - "learning_rate": 1.1688884434198657e-05, - "loss": 1.5323, + "epoch": 0.52, + "grad_norm": 46.87839126586914, + "learning_rate": 1.6530979374973853e-05, + "loss": 3.7714, "step": 4147 }, { - "epoch": 1.25, - "grad_norm": 16.770814895629883, - "learning_rate": 1.1686879823594269e-05, - "loss": 2.9425, + "epoch": 0.52, + "grad_norm": 27.021465301513672, + "learning_rate": 1.6530142659917167e-05, + "loss": 1.1433, "step": 4148 }, { - "epoch": 1.25, - "grad_norm": 11.892877578735352, - "learning_rate": 1.1684875212989877e-05, - "loss": 2.216, + "epoch": 0.52, + "grad_norm": 57.1532096862793, + "learning_rate": 1.652930594486048e-05, + "loss": 1.6439, "step": 4149 }, { - "epoch": 1.25, - "grad_norm": 20.787343978881836, - "learning_rate": 1.1682870602385488e-05, - "loss": 2.4695, + "epoch": 0.52, + "grad_norm": 25.887805938720703, + "learning_rate": 1.652846922980379e-05, + "loss": 2.2957, "step": 4150 }, { - "epoch": 1.25, - "grad_norm": 23.898902893066406, - "learning_rate": 1.16808659917811e-05, - "loss": 1.5012, + "epoch": 0.52, + "grad_norm": 15.23217487335205, + "learning_rate": 1.6527632514747104e-05, + "loss": 2.052, "step": 4151 }, { - "epoch": 1.25, - "grad_norm": 24.14310073852539, - "learning_rate": 1.1678861381176708e-05, - "loss": 1.8126, + "epoch": 0.52, + "grad_norm": 10.493826866149902, + "learning_rate": 1.6526795799690414e-05, + "loss": 1.3312, "step": 4152 }, { - "epoch": 1.25, - "grad_norm": 16.704151153564453, - "learning_rate": 1.1676856770572318e-05, - "loss": 2.1569, + "epoch": 0.52, + "grad_norm": 10.333160400390625, + "learning_rate": 1.6525959084633728e-05, + "loss": 1.345, "step": 4153 }, { - "epoch": 1.25, - "grad_norm": 11.908331871032715, - "learning_rate": 1.1674852159967926e-05, - "loss": 1.5541, + "epoch": 0.52, + "grad_norm": 32.34117889404297, + "learning_rate": 1.6525122369577042e-05, + "loss": 2.4136, "step": 4154 }, { - "epoch": 1.25, - "grad_norm": 14.777883529663086, - "learning_rate": 1.1672847549363538e-05, - "loss": 1.7314, + "epoch": 0.52, + "grad_norm": 18.288616180419922, + "learning_rate": 1.6524285654520352e-05, + "loss": 0.7964, "step": 4155 }, { - "epoch": 1.25, - "grad_norm": 13.375772476196289, - "learning_rate": 1.1670842938759146e-05, - "loss": 1.1609, + "epoch": 0.52, + "grad_norm": 11.544230461120605, + "learning_rate": 1.6523448939463666e-05, + "loss": 2.2476, "step": 4156 }, { - "epoch": 1.25, - "grad_norm": 9.32558536529541, - "learning_rate": 1.1668838328154756e-05, - "loss": 1.6317, + "epoch": 0.52, + "grad_norm": 13.22861099243164, + "learning_rate": 1.652261222440698e-05, + "loss": 1.4952, "step": 4157 }, { - "epoch": 1.25, - "grad_norm": 14.819250106811523, - "learning_rate": 1.1666833717550368e-05, - "loss": 1.5169, + "epoch": 0.52, + "grad_norm": 48.550697326660156, + "learning_rate": 1.6521775509350293e-05, + "loss": 1.0027, "step": 4158 }, { - "epoch": 1.25, - "grad_norm": 7.862631797790527, - "learning_rate": 1.1664829106945977e-05, - "loss": 1.187, + "epoch": 0.52, + "grad_norm": 8.484000205993652, + "learning_rate": 1.6520938794293603e-05, + "loss": 0.4071, "step": 4159 }, { - "epoch": 1.25, - "grad_norm": 26.258054733276367, - "learning_rate": 1.1662824496341587e-05, - "loss": 2.8375, + "epoch": 0.52, + "grad_norm": 13.84500503540039, + "learning_rate": 1.6520102079236917e-05, + "loss": 1.1636, "step": 4160 }, { - "epoch": 1.25, - "grad_norm": 16.67561912536621, - "learning_rate": 1.1660819885737195e-05, - "loss": 1.5601, + "epoch": 0.52, + "grad_norm": 15.75895881652832, + "learning_rate": 1.651926536418023e-05, + "loss": 1.3895, "step": 4161 }, { - "epoch": 1.25, - "grad_norm": 22.342544555664062, - "learning_rate": 1.1658815275132807e-05, - "loss": 1.5789, + "epoch": 0.52, + "grad_norm": 23.719133377075195, + "learning_rate": 1.651842864912354e-05, + "loss": 2.8539, "step": 4162 }, { - "epoch": 1.25, - "grad_norm": 13.550854682922363, - "learning_rate": 1.1656810664528417e-05, - "loss": 2.4655, + "epoch": 0.52, + "grad_norm": 30.87020492553711, + "learning_rate": 1.6517591934066854e-05, + "loss": 1.6645, "step": 4163 }, { - "epoch": 1.25, - "grad_norm": 33.92632293701172, - "learning_rate": 1.1654806053924025e-05, - "loss": 1.5478, + "epoch": 0.52, + "grad_norm": 9.798362731933594, + "learning_rate": 1.6516755219010168e-05, + "loss": 1.4841, "step": 4164 }, { - "epoch": 1.25, - "grad_norm": 10.819427490234375, - "learning_rate": 1.1652801443319637e-05, - "loss": 1.5479, + "epoch": 0.52, + "grad_norm": 8.731616973876953, + "learning_rate": 1.6515918503953482e-05, + "loss": 0.8007, "step": 4165 }, { - "epoch": 1.25, - "grad_norm": 16.504961013793945, - "learning_rate": 1.1650796832715245e-05, - "loss": 1.4122, + "epoch": 0.52, + "grad_norm": 8.03121566772461, + "learning_rate": 1.6515081788896792e-05, + "loss": 0.4783, "step": 4166 }, { - "epoch": 1.25, - "grad_norm": 22.91649055480957, - "learning_rate": 1.1648792222110856e-05, - "loss": 1.9459, + "epoch": 0.52, + "grad_norm": 8.199400901794434, + "learning_rate": 1.6514245073840106e-05, + "loss": 2.1517, "step": 4167 }, { - "epoch": 1.25, - "grad_norm": 9.941856384277344, - "learning_rate": 1.1646787611506466e-05, - "loss": 1.7419, + "epoch": 0.52, + "grad_norm": 17.17656898498535, + "learning_rate": 1.651340835878342e-05, + "loss": 1.3263, "step": 4168 }, { - "epoch": 1.25, - "grad_norm": 12.914164543151855, - "learning_rate": 1.1644783000902076e-05, - "loss": 1.2351, + "epoch": 0.52, + "grad_norm": 12.302600860595703, + "learning_rate": 1.651257164372673e-05, + "loss": 0.6993, "step": 4169 }, { - "epoch": 1.25, - "grad_norm": 10.681536674499512, - "learning_rate": 1.1642778390297686e-05, - "loss": 1.7278, + "epoch": 0.52, + "grad_norm": 34.4046745300293, + "learning_rate": 1.6511734928670043e-05, + "loss": 1.5034, "step": 4170 }, { - "epoch": 1.25, - "grad_norm": 17.06873321533203, - "learning_rate": 1.1640773779693296e-05, - "loss": 2.0348, + "epoch": 0.52, + "grad_norm": 15.348669052124023, + "learning_rate": 1.6510898213613357e-05, + "loss": 1.8687, "step": 4171 }, { - "epoch": 1.25, - "grad_norm": 20.173357009887695, - "learning_rate": 1.1638769169088906e-05, - "loss": 1.6212, + "epoch": 0.52, + "grad_norm": 8.436819076538086, + "learning_rate": 1.6510061498556667e-05, + "loss": 1.0849, "step": 4172 }, { - "epoch": 1.25, - "grad_norm": 6.546261310577393, - "learning_rate": 1.1636764558484514e-05, - "loss": 0.4214, + "epoch": 0.52, + "grad_norm": 12.729174613952637, + "learning_rate": 1.650922478349998e-05, + "loss": 1.7756, "step": 4173 }, { - "epoch": 1.25, - "grad_norm": 27.385225296020508, - "learning_rate": 1.1634759947880126e-05, - "loss": 1.6822, + "epoch": 0.52, + "grad_norm": 9.80980110168457, + "learning_rate": 1.6508388068443294e-05, + "loss": 1.7873, "step": 4174 }, { - "epoch": 1.26, - "grad_norm": 37.10486602783203, - "learning_rate": 1.1632755337275736e-05, - "loss": 1.5469, + "epoch": 0.52, + "grad_norm": 16.981359481811523, + "learning_rate": 1.6507551353386605e-05, + "loss": 2.1876, "step": 4175 }, { - "epoch": 1.26, - "grad_norm": 13.431403160095215, - "learning_rate": 1.1630750726671345e-05, - "loss": 2.7734, + "epoch": 0.52, + "grad_norm": 7.826014041900635, + "learning_rate": 1.6506714638329918e-05, + "loss": 2.1796, "step": 4176 }, { - "epoch": 1.26, - "grad_norm": 17.27809715270996, - "learning_rate": 1.1628746116066956e-05, - "loss": 1.8377, + "epoch": 0.52, + "grad_norm": 28.039058685302734, + "learning_rate": 1.650587792327323e-05, + "loss": 1.1134, "step": 4177 }, { - "epoch": 1.26, - "grad_norm": 6.829921245574951, - "learning_rate": 1.1626741505462565e-05, - "loss": 1.0531, + "epoch": 0.52, + "grad_norm": 8.88306999206543, + "learning_rate": 1.6505041208216542e-05, + "loss": 1.0974, "step": 4178 }, { - "epoch": 1.26, - "grad_norm": 15.239294052124023, - "learning_rate": 1.1624736894858175e-05, - "loss": 1.8001, + "epoch": 0.52, + "grad_norm": 6.60862398147583, + "learning_rate": 1.6504204493159856e-05, + "loss": 0.5625, "step": 4179 }, { - "epoch": 1.26, - "grad_norm": 12.837756156921387, - "learning_rate": 1.1622732284253783e-05, - "loss": 1.1665, + "epoch": 0.52, + "grad_norm": 10.781895637512207, + "learning_rate": 1.6503367778103166e-05, + "loss": 1.1298, "step": 4180 }, { - "epoch": 1.26, - "grad_norm": 16.143875122070312, - "learning_rate": 1.1620727673649395e-05, - "loss": 0.9885, + "epoch": 0.52, + "grad_norm": 18.55240249633789, + "learning_rate": 1.650253106304648e-05, + "loss": 2.801, "step": 4181 }, { - "epoch": 1.26, - "grad_norm": 12.66282844543457, - "learning_rate": 1.1618723063045005e-05, - "loss": 1.219, + "epoch": 0.52, + "grad_norm": 16.69265365600586, + "learning_rate": 1.6501694347989793e-05, + "loss": 1.61, "step": 4182 }, { - "epoch": 1.26, - "grad_norm": 27.858238220214844, - "learning_rate": 1.1616718452440614e-05, - "loss": 2.2014, + "epoch": 0.52, + "grad_norm": 17.6204891204834, + "learning_rate": 1.6500857632933104e-05, + "loss": 3.775, "step": 4183 }, { - "epoch": 1.26, - "grad_norm": 21.537065505981445, - "learning_rate": 1.1614713841836225e-05, - "loss": 2.1228, + "epoch": 0.53, + "grad_norm": 9.196520805358887, + "learning_rate": 1.6500020917876417e-05, + "loss": 0.6398, "step": 4184 }, { - "epoch": 1.26, - "grad_norm": 43.390296936035156, - "learning_rate": 1.1612709231231834e-05, - "loss": 2.343, + "epoch": 0.53, + "grad_norm": 15.22008228302002, + "learning_rate": 1.649918420281973e-05, + "loss": 2.4216, "step": 4185 }, { - "epoch": 1.26, - "grad_norm": 24.188064575195312, - "learning_rate": 1.1610704620627444e-05, - "loss": 1.1875, + "epoch": 0.53, + "grad_norm": 41.116641998291016, + "learning_rate": 1.6498347487763045e-05, + "loss": 2.4155, "step": 4186 }, { - "epoch": 1.26, - "grad_norm": 13.248528480529785, - "learning_rate": 1.1608700010023052e-05, - "loss": 1.3365, + "epoch": 0.53, + "grad_norm": 33.585166931152344, + "learning_rate": 1.6497510772706355e-05, + "loss": 2.9035, "step": 4187 }, { - "epoch": 1.26, - "grad_norm": 26.479053497314453, - "learning_rate": 1.1606695399418664e-05, - "loss": 2.2343, + "epoch": 0.53, + "grad_norm": 7.945860385894775, + "learning_rate": 1.649667405764967e-05, + "loss": 0.5704, "step": 4188 }, { - "epoch": 1.26, - "grad_norm": 20.167842864990234, - "learning_rate": 1.1604690788814274e-05, - "loss": 1.9832, + "epoch": 0.53, + "grad_norm": 13.593266487121582, + "learning_rate": 1.6495837342592982e-05, + "loss": 2.459, "step": 4189 }, { - "epoch": 1.26, - "grad_norm": 27.049171447753906, - "learning_rate": 1.1602686178209882e-05, - "loss": 1.6917, + "epoch": 0.53, + "grad_norm": 99.65393829345703, + "learning_rate": 1.6495000627536292e-05, + "loss": 2.3606, "step": 4190 }, { - "epoch": 1.26, - "grad_norm": 7.952903747558594, - "learning_rate": 1.1600681567605494e-05, - "loss": 1.1251, + "epoch": 0.53, + "grad_norm": 15.27071762084961, + "learning_rate": 1.6494163912479606e-05, + "loss": 2.4064, "step": 4191 }, { - "epoch": 1.26, - "grad_norm": 9.99161148071289, - "learning_rate": 1.1598676957001103e-05, - "loss": 1.3309, + "epoch": 0.53, + "grad_norm": 13.85755443572998, + "learning_rate": 1.649332719742292e-05, + "loss": 0.7231, "step": 4192 }, { - "epoch": 1.26, - "grad_norm": 23.682477951049805, - "learning_rate": 1.1596672346396713e-05, - "loss": 1.7625, + "epoch": 0.53, + "grad_norm": 17.547557830810547, + "learning_rate": 1.6492490482366233e-05, + "loss": 1.8349, "step": 4193 }, { - "epoch": 1.26, - "grad_norm": 14.787646293640137, - "learning_rate": 1.1594667735792324e-05, - "loss": 0.8436, + "epoch": 0.53, + "grad_norm": 13.032142639160156, + "learning_rate": 1.6491653767309544e-05, + "loss": 1.0245, "step": 4194 }, { - "epoch": 1.26, - "grad_norm": 16.3641414642334, - "learning_rate": 1.1592663125187933e-05, - "loss": 1.5916, + "epoch": 0.53, + "grad_norm": 37.35247039794922, + "learning_rate": 1.6490817052252857e-05, + "loss": 3.246, "step": 4195 }, { - "epoch": 1.26, - "grad_norm": 15.075904846191406, - "learning_rate": 1.1590658514583543e-05, - "loss": 1.5184, + "epoch": 0.53, + "grad_norm": 27.185203552246094, + "learning_rate": 1.648998033719617e-05, + "loss": 3.0728, "step": 4196 }, { - "epoch": 1.26, - "grad_norm": 13.050864219665527, - "learning_rate": 1.1588653903979153e-05, - "loss": 1.7963, + "epoch": 0.53, + "grad_norm": 10.853189468383789, + "learning_rate": 1.648914362213948e-05, + "loss": 2.0268, "step": 4197 }, { - "epoch": 1.26, - "grad_norm": 10.601302146911621, - "learning_rate": 1.1586649293374763e-05, - "loss": 1.7489, + "epoch": 0.53, + "grad_norm": 16.356616973876953, + "learning_rate": 1.6488306907082795e-05, + "loss": 1.4985, "step": 4198 }, { - "epoch": 1.26, - "grad_norm": 21.2014217376709, - "learning_rate": 1.1584644682770371e-05, - "loss": 1.6054, + "epoch": 0.53, + "grad_norm": 16.33892822265625, + "learning_rate": 1.648747019202611e-05, + "loss": 3.2044, "step": 4199 }, { - "epoch": 1.26, - "grad_norm": 17.15383529663086, - "learning_rate": 1.1582640072165983e-05, - "loss": 1.2979, + "epoch": 0.53, + "grad_norm": 16.992094039916992, + "learning_rate": 1.648663347696942e-05, + "loss": 2.4081, "step": 4200 }, { - "epoch": 1.26, - "eval_loss": 0.2202432006597519, - "eval_runtime": 43.4456, - "eval_samples_per_second": 34.043, - "eval_steps_per_second": 34.043, - "step": 4200 + "epoch": 0.53, + "grad_norm": 19.227170944213867, + "learning_rate": 1.6485796761912732e-05, + "loss": 0.8864, + "step": 4201 }, { - "epoch": 1.26, - "grad_norm": 14.757326126098633, - "learning_rate": 1.1580635461561593e-05, - "loss": 1.03, - "step": 4201 - }, - { - "epoch": 1.26, - "grad_norm": 12.233736038208008, - "learning_rate": 1.1578630850957202e-05, - "loss": 1.4463, + "epoch": 0.53, + "grad_norm": 154.24375915527344, + "learning_rate": 1.6484960046856046e-05, + "loss": 0.8654, "step": 4202 }, { - "epoch": 1.26, - "grad_norm": 17.687862396240234, - "learning_rate": 1.1576626240352814e-05, - "loss": 1.9516, + "epoch": 0.53, + "grad_norm": 8.544039726257324, + "learning_rate": 1.6484123331799356e-05, + "loss": 0.8526, "step": 4203 }, { - "epoch": 1.26, - "grad_norm": 18.68252944946289, - "learning_rate": 1.1574621629748422e-05, - "loss": 2.6357, + "epoch": 0.53, + "grad_norm": 30.925003051757812, + "learning_rate": 1.648328661674267e-05, + "loss": 2.7016, "step": 4204 }, { - "epoch": 1.26, - "grad_norm": 24.503774642944336, - "learning_rate": 1.1572617019144032e-05, - "loss": 1.4327, + "epoch": 0.53, + "grad_norm": 10.509055137634277, + "learning_rate": 1.648244990168598e-05, + "loss": 0.7868, "step": 4205 }, { - "epoch": 1.26, - "grad_norm": 14.029136657714844, - "learning_rate": 1.1570612408539644e-05, - "loss": 2.0086, + "epoch": 0.53, + "grad_norm": 10.42081069946289, + "learning_rate": 1.6481613186629294e-05, + "loss": 1.9535, "step": 4206 }, { - "epoch": 1.26, - "grad_norm": 13.784452438354492, - "learning_rate": 1.1568607797935252e-05, - "loss": 2.5528, + "epoch": 0.53, + "grad_norm": 4.504967212677002, + "learning_rate": 1.6480776471572608e-05, + "loss": 0.3315, "step": 4207 }, { - "epoch": 1.27, - "grad_norm": 29.253156661987305, - "learning_rate": 1.1566603187330862e-05, - "loss": 0.5661, + "epoch": 0.53, + "grad_norm": 79.56721496582031, + "learning_rate": 1.6479939756515918e-05, + "loss": 3.8614, "step": 4208 }, { - "epoch": 1.27, - "grad_norm": 26.327823638916016, - "learning_rate": 1.156459857672647e-05, - "loss": 1.8774, + "epoch": 0.53, + "grad_norm": 15.625540733337402, + "learning_rate": 1.647910304145923e-05, + "loss": 1.8213, "step": 4209 }, { - "epoch": 1.27, - "grad_norm": 50.058536529541016, - "learning_rate": 1.1562593966122082e-05, - "loss": 2.2888, + "epoch": 0.53, + "grad_norm": 9.894253730773926, + "learning_rate": 1.6478266326402545e-05, + "loss": 1.1941, "step": 4210 }, { - "epoch": 1.27, - "grad_norm": 8.850481033325195, - "learning_rate": 1.156058935551769e-05, - "loss": 1.476, + "epoch": 0.53, + "grad_norm": 17.677467346191406, + "learning_rate": 1.6477429611345855e-05, + "loss": 3.2213, "step": 4211 }, { - "epoch": 1.27, - "grad_norm": 12.578634262084961, - "learning_rate": 1.1558584744913301e-05, - "loss": 1.5757, + "epoch": 0.53, + "grad_norm": 4.6228766441345215, + "learning_rate": 1.647659289628917e-05, + "loss": 1.6235, "step": 4212 }, { - "epoch": 1.27, - "grad_norm": 42.25316619873047, - "learning_rate": 1.1556580134308913e-05, - "loss": 1.574, + "epoch": 0.53, + "grad_norm": 30.595361709594727, + "learning_rate": 1.6475756181232483e-05, + "loss": 2.2286, "step": 4213 }, { - "epoch": 1.27, - "grad_norm": 15.227287292480469, - "learning_rate": 1.1554575523704521e-05, - "loss": 2.4708, + "epoch": 0.53, + "grad_norm": 22.9339542388916, + "learning_rate": 1.6474919466175796e-05, + "loss": 3.3339, "step": 4214 }, { - "epoch": 1.27, - "grad_norm": 10.333723068237305, - "learning_rate": 1.1552570913100131e-05, - "loss": 1.6319, + "epoch": 0.53, + "grad_norm": 11.161800384521484, + "learning_rate": 1.6474082751119107e-05, + "loss": 2.5971, "step": 4215 }, { - "epoch": 1.27, - "grad_norm": 9.808292388916016, - "learning_rate": 1.1550566302495741e-05, - "loss": 2.1226, + "epoch": 0.53, + "grad_norm": 8.695982933044434, + "learning_rate": 1.647324603606242e-05, + "loss": 1.6401, "step": 4216 }, { - "epoch": 1.27, - "grad_norm": 39.865562438964844, - "learning_rate": 1.1548561691891351e-05, - "loss": 2.9798, + "epoch": 0.53, + "grad_norm": 31.54930877685547, + "learning_rate": 1.6472409321005734e-05, + "loss": 2.0755, "step": 4217 }, { - "epoch": 1.27, - "grad_norm": 47.3433723449707, - "learning_rate": 1.154655708128696e-05, - "loss": 1.7633, + "epoch": 0.53, + "grad_norm": 19.136375427246094, + "learning_rate": 1.6471572605949044e-05, + "loss": 1.2303, "step": 4218 }, { - "epoch": 1.27, - "grad_norm": 18.767004013061523, - "learning_rate": 1.1544552470682571e-05, - "loss": 1.0271, + "epoch": 0.53, + "grad_norm": 25.4252986907959, + "learning_rate": 1.6470735890892358e-05, + "loss": 2.7373, "step": 4219 }, { - "epoch": 1.27, - "grad_norm": 5.930094242095947, - "learning_rate": 1.1542547860078182e-05, - "loss": 0.7385, + "epoch": 0.53, + "grad_norm": 14.933919906616211, + "learning_rate": 1.646989917583567e-05, + "loss": 2.7044, "step": 4220 }, { - "epoch": 1.27, - "grad_norm": 17.10114097595215, - "learning_rate": 1.154054324947379e-05, - "loss": 1.4075, + "epoch": 0.53, + "grad_norm": 25.30756187438965, + "learning_rate": 1.6469062460778985e-05, + "loss": 2.2519, "step": 4221 }, { - "epoch": 1.27, - "grad_norm": 22.982498168945312, - "learning_rate": 1.1538538638869402e-05, - "loss": 2.3147, + "epoch": 0.53, + "grad_norm": 3.7429444789886475, + "learning_rate": 1.6468225745722295e-05, + "loss": 0.5936, "step": 4222 }, { - "epoch": 1.27, - "grad_norm": 29.882640838623047, - "learning_rate": 1.153653402826501e-05, - "loss": 1.3968, + "epoch": 0.53, + "grad_norm": 12.389649391174316, + "learning_rate": 1.646738903066561e-05, + "loss": 1.0459, "step": 4223 }, { - "epoch": 1.27, - "grad_norm": 17.916343688964844, - "learning_rate": 1.153452941766062e-05, - "loss": 1.5234, + "epoch": 0.53, + "grad_norm": 27.49970054626465, + "learning_rate": 1.6466552315608923e-05, + "loss": 2.9375, "step": 4224 }, { - "epoch": 1.27, - "grad_norm": 12.272050857543945, - "learning_rate": 1.1532524807056232e-05, - "loss": 0.895, + "epoch": 0.53, + "grad_norm": 4.344343185424805, + "learning_rate": 1.6465715600552233e-05, + "loss": 0.0751, "step": 4225 }, { - "epoch": 1.27, - "grad_norm": 14.076766967773438, - "learning_rate": 1.153052019645184e-05, - "loss": 1.1994, + "epoch": 0.53, + "grad_norm": 14.881327629089355, + "learning_rate": 1.6464878885495547e-05, + "loss": 2.584, "step": 4226 }, { - "epoch": 1.27, - "grad_norm": 16.016265869140625, - "learning_rate": 1.152851558584745e-05, - "loss": 1.0423, + "epoch": 0.53, + "grad_norm": 14.25925064086914, + "learning_rate": 1.646404217043886e-05, + "loss": 1.7455, "step": 4227 }, { - "epoch": 1.27, - "grad_norm": 10.317485809326172, - "learning_rate": 1.1526510975243059e-05, - "loss": 1.3147, + "epoch": 0.53, + "grad_norm": 13.312737464904785, + "learning_rate": 1.646320545538217e-05, + "loss": 2.3142, "step": 4228 }, { - "epoch": 1.27, - "grad_norm": 12.768622398376465, - "learning_rate": 1.152450636463867e-05, - "loss": 2.2527, + "epoch": 0.53, + "grad_norm": 22.862720489501953, + "learning_rate": 1.6462368740325484e-05, + "loss": 2.6122, "step": 4229 }, { - "epoch": 1.27, - "grad_norm": 15.519412994384766, - "learning_rate": 1.1522501754034279e-05, - "loss": 1.4902, + "epoch": 0.53, + "grad_norm": 19.689655303955078, + "learning_rate": 1.6461532025268794e-05, + "loss": 3.9217, "step": 4230 }, { - "epoch": 1.27, - "grad_norm": 18.289718627929688, - "learning_rate": 1.1520497143429889e-05, - "loss": 2.5218, + "epoch": 0.53, + "grad_norm": 22.79598045349121, + "learning_rate": 1.6460695310212108e-05, + "loss": 0.636, "step": 4231 }, { - "epoch": 1.27, - "grad_norm": 17.28578758239746, - "learning_rate": 1.1518492532825501e-05, - "loss": 0.8004, + "epoch": 0.53, + "grad_norm": 16.043413162231445, + "learning_rate": 1.645985859515542e-05, + "loss": 2.4597, "step": 4232 }, { - "epoch": 1.27, - "grad_norm": 23.51228141784668, - "learning_rate": 1.151648792222111e-05, - "loss": 1.8972, + "epoch": 0.53, + "grad_norm": 19.496339797973633, + "learning_rate": 1.6459021880098732e-05, + "loss": 2.9435, "step": 4233 }, { - "epoch": 1.27, - "grad_norm": 12.242810249328613, - "learning_rate": 1.151448331161672e-05, - "loss": 1.0982, + "epoch": 0.53, + "grad_norm": 9.446452140808105, + "learning_rate": 1.6458185165042046e-05, + "loss": 1.7816, "step": 4234 }, { - "epoch": 1.27, - "grad_norm": 16.621652603149414, - "learning_rate": 1.1512478701012328e-05, - "loss": 1.8477, + "epoch": 0.53, + "grad_norm": 12.899818420410156, + "learning_rate": 1.645734844998536e-05, + "loss": 2.1706, "step": 4235 }, { - "epoch": 1.27, - "grad_norm": 9.91673755645752, - "learning_rate": 1.151047409040794e-05, - "loss": 1.4897, + "epoch": 0.53, + "grad_norm": 12.458577156066895, + "learning_rate": 1.645651173492867e-05, + "loss": 1.667, "step": 4236 }, { - "epoch": 1.27, - "grad_norm": 17.222322463989258, - "learning_rate": 1.150846947980355e-05, - "loss": 1.2186, + "epoch": 0.53, + "grad_norm": 27.420490264892578, + "learning_rate": 1.6455675019871983e-05, + "loss": 2.0797, "step": 4237 }, { - "epoch": 1.27, - "grad_norm": 11.030045509338379, - "learning_rate": 1.1506464869199158e-05, - "loss": 1.0489, + "epoch": 0.53, + "grad_norm": 9.165604591369629, + "learning_rate": 1.6454838304815297e-05, + "loss": 1.4257, "step": 4238 }, { - "epoch": 1.27, - "grad_norm": 13.282185554504395, - "learning_rate": 1.150446025859477e-05, - "loss": 2.1427, + "epoch": 0.53, + "grad_norm": 11.653155326843262, + "learning_rate": 1.6454001589758607e-05, + "loss": 1.6472, "step": 4239 }, { - "epoch": 1.27, - "grad_norm": 18.355653762817383, - "learning_rate": 1.1502455647990378e-05, - "loss": 1.5696, + "epoch": 0.53, + "grad_norm": 52.71607208251953, + "learning_rate": 1.645316487470192e-05, + "loss": 3.3165, "step": 4240 }, { - "epoch": 1.28, - "grad_norm": 15.735917091369629, - "learning_rate": 1.1500451037385988e-05, - "loss": 1.789, + "epoch": 0.53, + "grad_norm": 60.9556999206543, + "learning_rate": 1.6452328159645234e-05, + "loss": 2.1788, "step": 4241 }, { - "epoch": 1.28, - "grad_norm": 28.514938354492188, - "learning_rate": 1.1498446426781598e-05, - "loss": 2.2112, + "epoch": 0.53, + "grad_norm": 8.82602310180664, + "learning_rate": 1.6451491444588548e-05, + "loss": 0.8412, "step": 4242 }, { - "epoch": 1.28, - "grad_norm": 16.573684692382812, - "learning_rate": 1.1496441816177208e-05, - "loss": 1.6606, + "epoch": 0.53, + "grad_norm": 11.567743301391602, + "learning_rate": 1.6450654729531858e-05, + "loss": 2.6234, "step": 4243 }, { - "epoch": 1.28, - "grad_norm": 18.012393951416016, - "learning_rate": 1.1494437205572819e-05, - "loss": 1.3518, + "epoch": 0.53, + "grad_norm": 15.222989082336426, + "learning_rate": 1.6449818014475172e-05, + "loss": 1.9354, "step": 4244 }, { - "epoch": 1.28, - "grad_norm": 21.622119903564453, - "learning_rate": 1.1492432594968429e-05, - "loss": 1.9794, + "epoch": 0.53, + "grad_norm": 20.00623321533203, + "learning_rate": 1.6448981299418486e-05, + "loss": 2.37, "step": 4245 }, { - "epoch": 1.28, - "grad_norm": 11.884101867675781, - "learning_rate": 1.1490427984364039e-05, - "loss": 1.4142, + "epoch": 0.53, + "grad_norm": 19.969297409057617, + "learning_rate": 1.6448144584361796e-05, + "loss": 2.7817, "step": 4246 }, { - "epoch": 1.28, - "grad_norm": 19.552974700927734, - "learning_rate": 1.1488423373759647e-05, - "loss": 1.0571, + "epoch": 0.53, + "grad_norm": 10.317906379699707, + "learning_rate": 1.644730786930511e-05, + "loss": 1.6364, "step": 4247 }, { - "epoch": 1.28, - "grad_norm": 17.979076385498047, - "learning_rate": 1.1486418763155259e-05, - "loss": 1.6985, + "epoch": 0.53, + "grad_norm": 16.51494789123535, + "learning_rate": 1.6446471154248423e-05, + "loss": 2.9893, "step": 4248 }, { - "epoch": 1.28, - "grad_norm": 19.238636016845703, - "learning_rate": 1.1484414152550869e-05, - "loss": 1.396, + "epoch": 0.53, + "grad_norm": 12.647660255432129, + "learning_rate": 1.6445634439191737e-05, + "loss": 2.065, "step": 4249 }, { - "epoch": 1.28, - "grad_norm": 46.26264953613281, - "learning_rate": 1.1482409541946477e-05, - "loss": 2.4083, + "epoch": 0.53, + "grad_norm": 11.133133888244629, + "learning_rate": 1.6444797724135047e-05, + "loss": 1.5552, "step": 4250 }, { - "epoch": 1.28, - "grad_norm": 13.219773292541504, - "learning_rate": 1.1480404931342089e-05, - "loss": 0.8623, + "epoch": 0.53, + "grad_norm": 10.225388526916504, + "learning_rate": 1.644396100907836e-05, + "loss": 0.9422, "step": 4251 }, { - "epoch": 1.28, - "grad_norm": 9.744564056396484, - "learning_rate": 1.1478400320737697e-05, - "loss": 2.33, + "epoch": 0.53, + "grad_norm": 13.449052810668945, + "learning_rate": 1.6443124294021674e-05, + "loss": 0.5701, "step": 4252 }, { - "epoch": 1.28, - "grad_norm": 17.062484741210938, - "learning_rate": 1.1476395710133308e-05, - "loss": 1.4111, + "epoch": 0.53, + "grad_norm": 16.118934631347656, + "learning_rate": 1.6442287578964985e-05, + "loss": 1.8372, "step": 4253 }, { - "epoch": 1.28, - "grad_norm": 11.308719635009766, - "learning_rate": 1.1474391099528916e-05, - "loss": 1.0592, + "epoch": 0.53, + "grad_norm": 12.920221328735352, + "learning_rate": 1.6441450863908298e-05, + "loss": 2.2682, "step": 4254 }, { - "epoch": 1.28, - "grad_norm": 12.195525169372559, - "learning_rate": 1.1472386488924528e-05, - "loss": 1.8724, + "epoch": 0.53, + "grad_norm": 17.492290496826172, + "learning_rate": 1.6440614148851612e-05, + "loss": 1.5939, "step": 4255 }, { - "epoch": 1.28, - "grad_norm": 13.376428604125977, - "learning_rate": 1.1470381878320138e-05, - "loss": 1.5216, + "epoch": 0.53, + "grad_norm": 23.524873733520508, + "learning_rate": 1.6439777433794922e-05, + "loss": 1.9309, "step": 4256 }, { - "epoch": 1.28, - "grad_norm": 7.64074182510376, - "learning_rate": 1.1468377267715746e-05, - "loss": 0.7377, + "epoch": 0.53, + "grad_norm": 14.729205131530762, + "learning_rate": 1.6438940718738236e-05, + "loss": 1.0595, "step": 4257 }, { - "epoch": 1.28, - "grad_norm": 19.55640983581543, - "learning_rate": 1.1466372657111358e-05, - "loss": 2.3189, + "epoch": 0.53, + "grad_norm": 11.834205627441406, + "learning_rate": 1.6438104003681546e-05, + "loss": 1.9227, "step": 4258 }, { - "epoch": 1.28, - "grad_norm": 202.1298370361328, - "learning_rate": 1.1464368046506966e-05, - "loss": 1.7217, + "epoch": 0.53, + "grad_norm": 18.87928009033203, + "learning_rate": 1.643726728862486e-05, + "loss": 2.661, "step": 4259 }, { - "epoch": 1.28, - "grad_norm": 44.232112884521484, - "learning_rate": 1.1462363435902576e-05, - "loss": 1.1155, + "epoch": 0.53, + "grad_norm": 9.27440357208252, + "learning_rate": 1.6436430573568173e-05, + "loss": 0.9423, "step": 4260 }, { - "epoch": 1.28, - "grad_norm": 20.715179443359375, - "learning_rate": 1.1460358825298185e-05, - "loss": 1.5698, + "epoch": 0.53, + "grad_norm": 49.268157958984375, + "learning_rate": 1.6435593858511484e-05, + "loss": 2.913, "step": 4261 }, { - "epoch": 1.28, - "grad_norm": 16.672870635986328, - "learning_rate": 1.1458354214693797e-05, - "loss": 2.0041, + "epoch": 0.53, + "grad_norm": 13.71524715423584, + "learning_rate": 1.6434757143454797e-05, + "loss": 1.9741, "step": 4262 }, { - "epoch": 1.28, - "grad_norm": 15.357919692993164, - "learning_rate": 1.1456349604089407e-05, - "loss": 1.9595, + "epoch": 0.53, + "grad_norm": 15.70528507232666, + "learning_rate": 1.643392042839811e-05, + "loss": 1.6815, "step": 4263 }, { - "epoch": 1.28, - "grad_norm": 25.542970657348633, - "learning_rate": 1.1454344993485015e-05, - "loss": 3.4864, + "epoch": 0.54, + "grad_norm": 7.827919006347656, + "learning_rate": 1.643308371334142e-05, + "loss": 1.6473, "step": 4264 }, { - "epoch": 1.28, - "grad_norm": 20.45473861694336, - "learning_rate": 1.1452340382880627e-05, - "loss": 1.8289, + "epoch": 0.54, + "grad_norm": 9.871508598327637, + "learning_rate": 1.6432246998284735e-05, + "loss": 1.4339, "step": 4265 }, { - "epoch": 1.28, - "grad_norm": 18.233806610107422, - "learning_rate": 1.1450335772276235e-05, - "loss": 1.9064, + "epoch": 0.54, + "grad_norm": 16.842849731445312, + "learning_rate": 1.643141028322805e-05, + "loss": 3.094, "step": 4266 }, { - "epoch": 1.28, - "grad_norm": 8.359683990478516, - "learning_rate": 1.1448331161671847e-05, - "loss": 1.201, + "epoch": 0.54, + "grad_norm": 15.442558288574219, + "learning_rate": 1.643057356817136e-05, + "loss": 1.0509, "step": 4267 }, { - "epoch": 1.28, - "grad_norm": 9.61418628692627, - "learning_rate": 1.1446326551067457e-05, - "loss": 2.0508, + "epoch": 0.54, + "grad_norm": 7.148504734039307, + "learning_rate": 1.6429736853114672e-05, + "loss": 1.6355, "step": 4268 }, { - "epoch": 1.28, - "grad_norm": 12.493593215942383, - "learning_rate": 1.1444321940463066e-05, - "loss": 1.8421, + "epoch": 0.54, + "grad_norm": 12.161744117736816, + "learning_rate": 1.6428900138057986e-05, + "loss": 1.2154, "step": 4269 }, { - "epoch": 1.28, - "grad_norm": 19.759090423583984, - "learning_rate": 1.1442317329858677e-05, - "loss": 2.7833, + "epoch": 0.54, + "grad_norm": 50.82536697387695, + "learning_rate": 1.64280634230013e-05, + "loss": 3.1307, "step": 4270 }, { - "epoch": 1.28, - "grad_norm": 12.588993072509766, - "learning_rate": 1.1440312719254286e-05, - "loss": 1.1135, + "epoch": 0.54, + "grad_norm": 12.391793251037598, + "learning_rate": 1.642722670794461e-05, + "loss": 2.1355, "step": 4271 }, { - "epoch": 1.28, - "grad_norm": 22.17607879638672, - "learning_rate": 1.1438308108649896e-05, - "loss": 2.3915, + "epoch": 0.54, + "grad_norm": 9.488656997680664, + "learning_rate": 1.6426389992887924e-05, + "loss": 1.2715, "step": 4272 }, { - "epoch": 1.28, - "grad_norm": 11.478544235229492, - "learning_rate": 1.1436303498045504e-05, - "loss": 0.9938, + "epoch": 0.54, + "grad_norm": 12.979984283447266, + "learning_rate": 1.6425553277831237e-05, + "loss": 3.0628, "step": 4273 }, { - "epoch": 1.29, - "grad_norm": 10.766488075256348, - "learning_rate": 1.1434298887441116e-05, - "loss": 1.0582, + "epoch": 0.54, + "grad_norm": 11.4072265625, + "learning_rate": 1.6424716562774547e-05, + "loss": 1.2632, "step": 4274 }, { - "epoch": 1.29, - "grad_norm": 20.56522560119629, - "learning_rate": 1.1432294276836726e-05, - "loss": 1.6918, + "epoch": 0.54, + "grad_norm": 32.04029083251953, + "learning_rate": 1.642387984771786e-05, + "loss": 1.761, "step": 4275 }, { - "epoch": 1.29, - "grad_norm": 9.447360038757324, - "learning_rate": 1.1430289666232334e-05, - "loss": 1.839, + "epoch": 0.54, + "grad_norm": 15.586896896362305, + "learning_rate": 1.6423043132661175e-05, + "loss": 4.1178, "step": 4276 }, { - "epoch": 1.29, - "grad_norm": 9.256118774414062, - "learning_rate": 1.1428285055627946e-05, - "loss": 1.307, + "epoch": 0.54, + "grad_norm": 29.479475021362305, + "learning_rate": 1.642220641760449e-05, + "loss": 0.9489, "step": 4277 }, { - "epoch": 1.29, - "grad_norm": 13.01025676727295, - "learning_rate": 1.1426280445023555e-05, - "loss": 1.6272, + "epoch": 0.54, + "grad_norm": 16.579774856567383, + "learning_rate": 1.64213697025478e-05, + "loss": 2.2051, "step": 4278 }, { - "epoch": 1.29, - "grad_norm": 11.885642051696777, - "learning_rate": 1.1424275834419165e-05, - "loss": 0.5883, + "epoch": 0.54, + "grad_norm": 17.332616806030273, + "learning_rate": 1.6420532987491112e-05, + "loss": 2.4483, "step": 4279 }, { - "epoch": 1.29, - "grad_norm": 13.292377471923828, - "learning_rate": 1.1422271223814776e-05, - "loss": 1.4237, + "epoch": 0.54, + "grad_norm": 36.76666259765625, + "learning_rate": 1.6419696272434426e-05, + "loss": 2.0568, "step": 4280 }, { - "epoch": 1.29, - "grad_norm": 7.443901538848877, - "learning_rate": 1.1420266613210385e-05, - "loss": 0.9989, + "epoch": 0.54, + "grad_norm": 31.958080291748047, + "learning_rate": 1.6418859557377736e-05, + "loss": 4.2887, "step": 4281 }, { - "epoch": 1.29, - "grad_norm": 83.39949035644531, - "learning_rate": 1.1418262002605995e-05, - "loss": 1.8078, + "epoch": 0.54, + "grad_norm": 19.1790771484375, + "learning_rate": 1.641802284232105e-05, + "loss": 2.8956, "step": 4282 }, { - "epoch": 1.29, - "grad_norm": 12.701087951660156, - "learning_rate": 1.1416257392001603e-05, - "loss": 1.5752, + "epoch": 0.54, + "grad_norm": 4.67531681060791, + "learning_rate": 1.641718612726436e-05, + "loss": 0.6016, "step": 4283 }, { - "epoch": 1.29, - "grad_norm": 58.16669464111328, - "learning_rate": 1.1414252781397215e-05, - "loss": 2.8532, + "epoch": 0.54, + "grad_norm": 29.13949203491211, + "learning_rate": 1.6416349412207674e-05, + "loss": 1.4015, "step": 4284 }, { - "epoch": 1.29, - "grad_norm": 21.291078567504883, - "learning_rate": 1.1412248170792823e-05, - "loss": 1.2318, + "epoch": 0.54, + "grad_norm": 19.934898376464844, + "learning_rate": 1.6415512697150987e-05, + "loss": 0.4292, "step": 4285 }, { - "epoch": 1.29, - "grad_norm": 18.41106605529785, - "learning_rate": 1.1410243560188434e-05, - "loss": 2.264, + "epoch": 0.54, + "grad_norm": 3.7015771865844727, + "learning_rate": 1.6414675982094298e-05, + "loss": 0.1326, "step": 4286 }, { - "epoch": 1.29, - "grad_norm": 20.537443161010742, - "learning_rate": 1.1408238949584045e-05, - "loss": 2.6269, + "epoch": 0.54, + "grad_norm": 5.187589645385742, + "learning_rate": 1.641383926703761e-05, + "loss": 2.0533, "step": 4287 }, { - "epoch": 1.29, - "grad_norm": 15.127706527709961, - "learning_rate": 1.1406234338979654e-05, - "loss": 1.9341, + "epoch": 0.54, + "grad_norm": 12.003097534179688, + "learning_rate": 1.641300255198092e-05, + "loss": 0.6824, "step": 4288 }, { - "epoch": 1.29, - "grad_norm": 20.711387634277344, - "learning_rate": 1.1404229728375264e-05, - "loss": 1.7335, + "epoch": 0.54, + "grad_norm": 9.616144180297852, + "learning_rate": 1.6412165836924235e-05, + "loss": 0.643, "step": 4289 }, { - "epoch": 1.29, - "grad_norm": 50.79159164428711, - "learning_rate": 1.1402225117770874e-05, - "loss": 2.1342, + "epoch": 0.54, + "grad_norm": 27.701128005981445, + "learning_rate": 1.641132912186755e-05, + "loss": 1.868, "step": 4290 }, { - "epoch": 1.29, - "grad_norm": 29.805246353149414, - "learning_rate": 1.1400220507166484e-05, - "loss": 2.0907, + "epoch": 0.54, + "grad_norm": 17.69377326965332, + "learning_rate": 1.6410492406810863e-05, + "loss": 1.8682, "step": 4291 }, { - "epoch": 1.29, - "grad_norm": 12.824877738952637, - "learning_rate": 1.1398215896562094e-05, - "loss": 1.1383, + "epoch": 0.54, + "grad_norm": 46.678592681884766, + "learning_rate": 1.6409655691754173e-05, + "loss": 3.8657, "step": 4292 }, { - "epoch": 1.29, - "grad_norm": 7.294422149658203, - "learning_rate": 1.1396211285957704e-05, - "loss": 1.3885, + "epoch": 0.54, + "grad_norm": 14.063295364379883, + "learning_rate": 1.6408818976697486e-05, + "loss": 1.2708, "step": 4293 }, { - "epoch": 1.29, - "grad_norm": 31.934511184692383, - "learning_rate": 1.1394206675353314e-05, - "loss": 2.1207, + "epoch": 0.54, + "grad_norm": 18.941165924072266, + "learning_rate": 1.64079822616408e-05, + "loss": 1.2093, "step": 4294 }, { - "epoch": 1.29, - "grad_norm": 18.717044830322266, - "learning_rate": 1.1392202064748923e-05, - "loss": 1.4102, + "epoch": 0.54, + "grad_norm": 16.60546112060547, + "learning_rate": 1.640714554658411e-05, + "loss": 1.9364, "step": 4295 }, { - "epoch": 1.29, - "grad_norm": 14.38543701171875, - "learning_rate": 1.1390197454144534e-05, - "loss": 1.9233, + "epoch": 0.54, + "grad_norm": 5.379274368286133, + "learning_rate": 1.6406308831527424e-05, + "loss": 0.3526, "step": 4296 }, { - "epoch": 1.29, - "grad_norm": 20.979101181030273, - "learning_rate": 1.1388192843540143e-05, - "loss": 1.509, + "epoch": 0.54, + "grad_norm": 16.46821403503418, + "learning_rate": 1.6405472116470738e-05, + "loss": 1.3559, "step": 4297 }, { - "epoch": 1.29, - "grad_norm": 16.605932235717773, - "learning_rate": 1.1386188232935753e-05, - "loss": 1.4354, + "epoch": 0.54, + "grad_norm": 8.183782577514648, + "learning_rate": 1.640463540141405e-05, + "loss": 2.0754, "step": 4298 }, { - "epoch": 1.29, - "grad_norm": 9.505638122558594, - "learning_rate": 1.1384183622331365e-05, - "loss": 1.1145, + "epoch": 0.54, + "grad_norm": 11.519583702087402, + "learning_rate": 1.640379868635736e-05, + "loss": 1.7285, "step": 4299 }, { - "epoch": 1.29, - "grad_norm": 15.347785949707031, - "learning_rate": 1.1382179011726973e-05, - "loss": 1.6882, + "epoch": 0.54, + "grad_norm": 6.20827579498291, + "learning_rate": 1.6402961971300675e-05, + "loss": 0.2051, "step": 4300 }, { - "epoch": 1.29, - "grad_norm": 20.905508041381836, - "learning_rate": 1.1380174401122583e-05, - "loss": 1.5622, + "epoch": 0.54, + "grad_norm": 17.898536682128906, + "learning_rate": 1.640212525624399e-05, + "loss": 1.0117, "step": 4301 }, { - "epoch": 1.29, - "grad_norm": 41.201194763183594, - "learning_rate": 1.1378169790518192e-05, - "loss": 3.4766, + "epoch": 0.54, + "grad_norm": 14.119236946105957, + "learning_rate": 1.64012885411873e-05, + "loss": 2.6476, "step": 4302 }, { - "epoch": 1.29, - "grad_norm": 15.540892601013184, - "learning_rate": 1.1376165179913803e-05, - "loss": 1.4535, + "epoch": 0.54, + "grad_norm": 36.42744064331055, + "learning_rate": 1.6400451826130613e-05, + "loss": 3.5639, "step": 4303 }, { - "epoch": 1.29, - "grad_norm": 24.336341857910156, - "learning_rate": 1.1374160569309412e-05, - "loss": 1.7003, + "epoch": 0.54, + "grad_norm": 36.01893615722656, + "learning_rate": 1.6399615111073926e-05, + "loss": 3.8818, "step": 4304 }, { - "epoch": 1.29, - "grad_norm": 28.202116012573242, - "learning_rate": 1.1372155958705022e-05, - "loss": 1.2576, + "epoch": 0.54, + "grad_norm": 10.309459686279297, + "learning_rate": 1.639877839601724e-05, + "loss": 4.1736, "step": 4305 }, { - "epoch": 1.29, - "grad_norm": 12.418293952941895, - "learning_rate": 1.1370151348100634e-05, - "loss": 1.8355, + "epoch": 0.54, + "grad_norm": 9.84698486328125, + "learning_rate": 1.639794168096055e-05, + "loss": 4.2613, "step": 4306 }, { - "epoch": 1.29, - "grad_norm": 35.69652557373047, - "learning_rate": 1.1368146737496242e-05, - "loss": 2.1658, + "epoch": 0.54, + "grad_norm": 20.667268753051758, + "learning_rate": 1.6397104965903864e-05, + "loss": 2.7518, "step": 4307 }, { - "epoch": 1.3, - "grad_norm": 17.411029815673828, - "learning_rate": 1.1366142126891852e-05, - "loss": 2.4983, + "epoch": 0.54, + "grad_norm": 14.868730545043945, + "learning_rate": 1.6396268250847178e-05, + "loss": 1.341, "step": 4308 }, { - "epoch": 1.3, - "grad_norm": 10.031928062438965, - "learning_rate": 1.136413751628746e-05, - "loss": 1.7212, + "epoch": 0.54, + "grad_norm": 13.767081260681152, + "learning_rate": 1.6395431535790488e-05, + "loss": 1.7816, "step": 4309 }, { - "epoch": 1.3, - "grad_norm": 11.622937202453613, - "learning_rate": 1.1362132905683072e-05, - "loss": 1.4201, + "epoch": 0.54, + "grad_norm": 6.044618606567383, + "learning_rate": 1.63945948207338e-05, + "loss": 0.4184, "step": 4310 }, { - "epoch": 1.3, - "grad_norm": 19.42033576965332, - "learning_rate": 1.1360128295078682e-05, - "loss": 1.8876, + "epoch": 0.54, + "grad_norm": 15.502923011779785, + "learning_rate": 1.6393758105677112e-05, + "loss": 1.5554, "step": 4311 }, { - "epoch": 1.3, - "grad_norm": 24.995628356933594, - "learning_rate": 1.135812368447429e-05, - "loss": 1.682, + "epoch": 0.54, + "grad_norm": 24.838972091674805, + "learning_rate": 1.6392921390620425e-05, + "loss": 3.4248, "step": 4312 }, { - "epoch": 1.3, - "grad_norm": 11.356565475463867, - "learning_rate": 1.1356119073869902e-05, - "loss": 1.3858, + "epoch": 0.54, + "grad_norm": 13.437346458435059, + "learning_rate": 1.639208467556374e-05, + "loss": 1.5421, "step": 4313 }, { - "epoch": 1.3, - "grad_norm": 18.608964920043945, - "learning_rate": 1.1354114463265511e-05, - "loss": 1.2763, + "epoch": 0.54, + "grad_norm": 19.534582138061523, + "learning_rate": 1.639124796050705e-05, + "loss": 2.9816, "step": 4314 }, { - "epoch": 1.3, - "grad_norm": 16.34697723388672, - "learning_rate": 1.1352109852661121e-05, - "loss": 1.2931, + "epoch": 0.54, + "grad_norm": 15.092426300048828, + "learning_rate": 1.6390411245450363e-05, + "loss": 1.3546, "step": 4315 }, { - "epoch": 1.3, - "grad_norm": 17.903200149536133, - "learning_rate": 1.1350105242056731e-05, - "loss": 1.8466, + "epoch": 0.54, + "grad_norm": 21.328441619873047, + "learning_rate": 1.6389574530393673e-05, + "loss": 3.1882, "step": 4316 }, { - "epoch": 1.3, - "grad_norm": 7.10983419418335, - "learning_rate": 1.1348100631452341e-05, - "loss": 0.967, + "epoch": 0.54, + "grad_norm": 13.988789558410645, + "learning_rate": 1.6388737815336987e-05, + "loss": 0.9068, "step": 4317 }, { - "epoch": 1.3, - "grad_norm": 13.738561630249023, - "learning_rate": 1.1346096020847951e-05, - "loss": 1.4251, + "epoch": 0.54, + "grad_norm": 24.274625778198242, + "learning_rate": 1.63879011002803e-05, + "loss": 1.8792, "step": 4318 }, { - "epoch": 1.3, - "grad_norm": 14.061918258666992, - "learning_rate": 1.1344091410243561e-05, - "loss": 1.8931, + "epoch": 0.54, + "grad_norm": 27.35269546508789, + "learning_rate": 1.6387064385223614e-05, + "loss": 4.0709, "step": 4319 }, { - "epoch": 1.3, - "grad_norm": 28.247364044189453, - "learning_rate": 1.1342086799639171e-05, - "loss": 2.5285, - "step": 4320 - }, - { - "epoch": 1.3, - "eval_loss": 0.21285276114940643, - "eval_runtime": 43.3572, - "eval_samples_per_second": 34.112, - "eval_steps_per_second": 34.112, + "epoch": 0.54, + "grad_norm": 18.570083618164062, + "learning_rate": 1.6386227670166924e-05, + "loss": 2.2045, "step": 4320 }, { - "epoch": 1.3, - "grad_norm": 11.178879737854004, - "learning_rate": 1.134008218903478e-05, - "loss": 1.5172, + "epoch": 0.54, + "grad_norm": 14.518431663513184, + "learning_rate": 1.6385390955110238e-05, + "loss": 1.7629, "step": 4321 }, { - "epoch": 1.3, - "grad_norm": 16.7074031829834, - "learning_rate": 1.1338077578430392e-05, - "loss": 1.3458, + "epoch": 0.54, + "grad_norm": 150.36700439453125, + "learning_rate": 1.6384554240053552e-05, + "loss": 0.7615, "step": 4322 }, { - "epoch": 1.3, - "grad_norm": 21.143476486206055, - "learning_rate": 1.1336072967826002e-05, - "loss": 2.4362, + "epoch": 0.54, + "grad_norm": 8.598651885986328, + "learning_rate": 1.6383717524996862e-05, + "loss": 0.6577, "step": 4323 }, { - "epoch": 1.3, - "grad_norm": 22.83983039855957, - "learning_rate": 1.133406835722161e-05, - "loss": 2.1729, + "epoch": 0.54, + "grad_norm": 6.166103363037109, + "learning_rate": 1.6382880809940176e-05, + "loss": 0.7035, "step": 4324 }, { - "epoch": 1.3, - "grad_norm": 16.511369705200195, - "learning_rate": 1.1332063746617222e-05, - "loss": 1.3165, + "epoch": 0.54, + "grad_norm": 15.817057609558105, + "learning_rate": 1.638204409488349e-05, + "loss": 1.1081, "step": 4325 }, { - "epoch": 1.3, - "grad_norm": 16.411027908325195, - "learning_rate": 1.133005913601283e-05, - "loss": 2.1342, + "epoch": 0.54, + "grad_norm": 14.033500671386719, + "learning_rate": 1.6381207379826803e-05, + "loss": 0.422, "step": 4326 }, { - "epoch": 1.3, - "grad_norm": 16.996347427368164, - "learning_rate": 1.132805452540844e-05, - "loss": 1.9439, + "epoch": 0.54, + "grad_norm": 20.49435806274414, + "learning_rate": 1.6380370664770113e-05, + "loss": 1.372, "step": 4327 }, { - "epoch": 1.3, - "grad_norm": 27.14801788330078, - "learning_rate": 1.1326049914804049e-05, - "loss": 2.3698, + "epoch": 0.54, + "grad_norm": 3.773137331008911, + "learning_rate": 1.6379533949713427e-05, + "loss": 1.6234, "step": 4328 }, { - "epoch": 1.3, - "grad_norm": 10.485852241516113, - "learning_rate": 1.132404530419966e-05, - "loss": 1.1316, + "epoch": 0.54, + "grad_norm": 8.239606857299805, + "learning_rate": 1.637869723465674e-05, + "loss": 1.6802, "step": 4329 }, { - "epoch": 1.3, - "grad_norm": 48.32770538330078, - "learning_rate": 1.132204069359527e-05, - "loss": 1.8271, + "epoch": 0.54, + "grad_norm": 12.494515419006348, + "learning_rate": 1.637786051960005e-05, + "loss": 1.7986, "step": 4330 }, { - "epoch": 1.3, - "grad_norm": 68.1049575805664, - "learning_rate": 1.1320036082990879e-05, - "loss": 2.5344, + "epoch": 0.54, + "grad_norm": 7.956020832061768, + "learning_rate": 1.6377023804543364e-05, + "loss": 0.1914, "step": 4331 }, { - "epoch": 1.3, - "grad_norm": 22.41739273071289, - "learning_rate": 1.131803147238649e-05, - "loss": 2.1376, + "epoch": 0.54, + "grad_norm": 12.799524307250977, + "learning_rate": 1.6376187089486678e-05, + "loss": 1.3183, "step": 4332 }, { - "epoch": 1.3, - "grad_norm": 61.55850601196289, - "learning_rate": 1.1316026861782099e-05, - "loss": 3.21, + "epoch": 0.54, + "grad_norm": 11.716842651367188, + "learning_rate": 1.6375350374429992e-05, + "loss": 2.1588, "step": 4333 }, { - "epoch": 1.3, - "grad_norm": 13.802007675170898, - "learning_rate": 1.1314022251177709e-05, - "loss": 1.7577, + "epoch": 0.54, + "grad_norm": 18.58441162109375, + "learning_rate": 1.6374513659373302e-05, + "loss": 3.564, "step": 4334 }, { - "epoch": 1.3, - "grad_norm": 16.4052734375, - "learning_rate": 1.1312017640573321e-05, - "loss": 1.2041, + "epoch": 0.54, + "grad_norm": 12.418187141418457, + "learning_rate": 1.6373676944316616e-05, + "loss": 0.9168, "step": 4335 }, { - "epoch": 1.3, - "grad_norm": 26.579936981201172, - "learning_rate": 1.131001302996893e-05, - "loss": 1.6785, + "epoch": 0.54, + "grad_norm": 19.849315643310547, + "learning_rate": 1.6372840229259926e-05, + "loss": 3.2139, "step": 4336 }, { - "epoch": 1.3, - "grad_norm": 24.084991455078125, - "learning_rate": 1.130800841936454e-05, - "loss": 1.5441, + "epoch": 0.54, + "grad_norm": 11.36044979095459, + "learning_rate": 1.637200351420324e-05, + "loss": 0.7428, "step": 4337 }, { - "epoch": 1.3, - "grad_norm": 21.47726058959961, - "learning_rate": 1.130600380876015e-05, - "loss": 1.9604, + "epoch": 0.54, + "grad_norm": 6.952395439147949, + "learning_rate": 1.6371166799146553e-05, + "loss": 2.1326, "step": 4338 }, { - "epoch": 1.3, - "grad_norm": 38.121421813964844, - "learning_rate": 1.130399919815576e-05, - "loss": 2.2498, + "epoch": 0.54, + "grad_norm": 17.800748825073242, + "learning_rate": 1.6370330084089863e-05, + "loss": 1.45, "step": 4339 }, { - "epoch": 1.3, - "grad_norm": 23.063236236572266, - "learning_rate": 1.1301994587551368e-05, - "loss": 1.8673, + "epoch": 0.54, + "grad_norm": 8.255887985229492, + "learning_rate": 1.6369493369033177e-05, + "loss": 1.0562, "step": 4340 }, { - "epoch": 1.31, - "grad_norm": 11.506025314331055, - "learning_rate": 1.129998997694698e-05, - "loss": 1.3873, + "epoch": 0.54, + "grad_norm": 9.223470687866211, + "learning_rate": 1.6368656653976487e-05, + "loss": 0.4263, "step": 4341 }, { - "epoch": 1.31, - "grad_norm": 11.516486167907715, - "learning_rate": 1.129798536634259e-05, - "loss": 1.2646, + "epoch": 0.54, + "grad_norm": 46.6847038269043, + "learning_rate": 1.63678199389198e-05, + "loss": 1.5089, "step": 4342 }, { - "epoch": 1.31, - "grad_norm": 15.611605644226074, - "learning_rate": 1.1295980755738198e-05, - "loss": 1.2307, + "epoch": 0.55, + "grad_norm": 12.898388862609863, + "learning_rate": 1.6366983223863115e-05, + "loss": 2.1125, "step": 4343 }, { - "epoch": 1.31, - "grad_norm": 17.53089141845703, - "learning_rate": 1.129397614513381e-05, - "loss": 1.2414, + "epoch": 0.55, + "grad_norm": 4.78568172454834, + "learning_rate": 1.6366146508806425e-05, + "loss": 0.3338, "step": 4344 }, { - "epoch": 1.31, - "grad_norm": 14.397602081298828, - "learning_rate": 1.1291971534529418e-05, - "loss": 1.3744, + "epoch": 0.55, + "grad_norm": 16.7759952545166, + "learning_rate": 1.636530979374974e-05, + "loss": 2.0696, "step": 4345 }, { - "epoch": 1.31, - "grad_norm": 23.23992156982422, - "learning_rate": 1.1289966923925028e-05, - "loss": 1.8092, + "epoch": 0.55, + "grad_norm": 59.63092041015625, + "learning_rate": 1.6364473078693052e-05, + "loss": 2.7622, "step": 4346 }, { - "epoch": 1.31, - "grad_norm": 15.50413990020752, - "learning_rate": 1.1287962313320637e-05, - "loss": 1.819, + "epoch": 0.55, + "grad_norm": 26.8160400390625, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.609, "step": 4347 }, { - "epoch": 1.31, - "grad_norm": 12.4719877243042, - "learning_rate": 1.1285957702716249e-05, - "loss": 0.959, + "epoch": 0.55, + "grad_norm": 9.036898612976074, + "learning_rate": 1.6362799648579676e-05, + "loss": 1.0178, "step": 4348 }, { - "epoch": 1.31, - "grad_norm": 8.75976276397705, - "learning_rate": 1.1283953092111859e-05, - "loss": 1.0429, + "epoch": 0.55, + "grad_norm": 18.46747398376465, + "learning_rate": 1.636196293352299e-05, + "loss": 2.2952, "step": 4349 }, { - "epoch": 1.31, - "grad_norm": 34.667945861816406, - "learning_rate": 1.1281948481507467e-05, - "loss": 2.2953, + "epoch": 0.55, + "grad_norm": 7.6316447257995605, + "learning_rate": 1.6361126218466303e-05, + "loss": 1.4112, "step": 4350 }, { - "epoch": 1.31, - "grad_norm": 15.772024154663086, - "learning_rate": 1.1279943870903079e-05, - "loss": 1.67, + "epoch": 0.55, + "grad_norm": 22.83441734313965, + "learning_rate": 1.6360289503409614e-05, + "loss": 2.9932, "step": 4351 }, { - "epoch": 1.31, - "grad_norm": 19.514604568481445, - "learning_rate": 1.1277939260298687e-05, - "loss": 2.0143, + "epoch": 0.55, + "grad_norm": 11.766851425170898, + "learning_rate": 1.6359452788352927e-05, + "loss": 0.7483, "step": 4352 }, { - "epoch": 1.31, - "grad_norm": 22.16571807861328, - "learning_rate": 1.1275934649694297e-05, - "loss": 1.2593, + "epoch": 0.55, + "grad_norm": 11.485483169555664, + "learning_rate": 1.635861607329624e-05, + "loss": 2.5164, "step": 4353 }, { - "epoch": 1.31, - "grad_norm": 17.157798767089844, - "learning_rate": 1.1273930039089909e-05, - "loss": 1.6187, + "epoch": 0.55, + "grad_norm": 16.853513717651367, + "learning_rate": 1.6357779358239555e-05, + "loss": 2.0076, "step": 4354 }, { - "epoch": 1.31, - "grad_norm": 33.16340637207031, - "learning_rate": 1.1271925428485518e-05, - "loss": 2.0045, + "epoch": 0.55, + "grad_norm": 15.183171272277832, + "learning_rate": 1.6356942643182865e-05, + "loss": 1.3851, "step": 4355 }, { - "epoch": 1.31, - "grad_norm": 14.194746971130371, - "learning_rate": 1.1269920817881128e-05, - "loss": 1.671, + "epoch": 0.55, + "grad_norm": 9.89082145690918, + "learning_rate": 1.635610592812618e-05, + "loss": 0.6442, "step": 4356 }, { - "epoch": 1.31, - "grad_norm": 15.4343900680542, - "learning_rate": 1.1267916207276736e-05, - "loss": 1.2274, + "epoch": 0.55, + "grad_norm": 24.128459930419922, + "learning_rate": 1.6355269213069492e-05, + "loss": 1.1141, "step": 4357 }, { - "epoch": 1.31, - "grad_norm": 21.623807907104492, - "learning_rate": 1.1265911596672348e-05, - "loss": 2.5802, + "epoch": 0.55, + "grad_norm": 22.751218795776367, + "learning_rate": 1.6354432498012802e-05, + "loss": 1.3387, "step": 4358 }, { - "epoch": 1.31, - "grad_norm": 12.904182434082031, - "learning_rate": 1.1263906986067956e-05, - "loss": 1.4189, + "epoch": 0.55, + "grad_norm": 20.522287368774414, + "learning_rate": 1.6353595782956116e-05, + "loss": 1.8312, "step": 4359 }, { - "epoch": 1.31, - "grad_norm": 16.601791381835938, - "learning_rate": 1.1261902375463566e-05, - "loss": 2.787, + "epoch": 0.55, + "grad_norm": 39.677337646484375, + "learning_rate": 1.635275906789943e-05, + "loss": 2.4149, "step": 4360 }, { - "epoch": 1.31, - "grad_norm": 12.915867805480957, - "learning_rate": 1.1259897764859178e-05, - "loss": 1.5663, + "epoch": 0.55, + "grad_norm": 17.512928009033203, + "learning_rate": 1.6351922352842743e-05, + "loss": 0.6399, "step": 4361 }, { - "epoch": 1.31, - "grad_norm": 17.556509017944336, - "learning_rate": 1.1257893154254786e-05, - "loss": 1.8852, + "epoch": 0.55, + "grad_norm": 13.060235977172852, + "learning_rate": 1.6351085637786054e-05, + "loss": 2.7061, "step": 4362 }, { - "epoch": 1.31, - "grad_norm": 23.893291473388672, - "learning_rate": 1.1255888543650397e-05, - "loss": 1.4539, + "epoch": 0.55, + "grad_norm": 53.84843063354492, + "learning_rate": 1.6350248922729367e-05, + "loss": 2.6682, "step": 4363 }, { - "epoch": 1.31, - "grad_norm": 11.98607063293457, - "learning_rate": 1.1253883933046007e-05, - "loss": 1.7452, + "epoch": 0.55, + "grad_norm": 10.398431777954102, + "learning_rate": 1.6349412207672678e-05, + "loss": 1.0722, "step": 4364 }, { - "epoch": 1.31, - "grad_norm": 13.887165069580078, - "learning_rate": 1.1251879322441617e-05, - "loss": 1.6717, + "epoch": 0.55, + "grad_norm": 17.36014175415039, + "learning_rate": 1.634857549261599e-05, + "loss": 2.3584, "step": 4365 }, { - "epoch": 1.31, - "grad_norm": 7.588750839233398, - "learning_rate": 1.1249874711837227e-05, - "loss": 0.8518, + "epoch": 0.55, + "grad_norm": 8.186704635620117, + "learning_rate": 1.6347738777559305e-05, + "loss": 0.9661, "step": 4366 }, { - "epoch": 1.31, - "grad_norm": 27.59679412841797, - "learning_rate": 1.1247870101232837e-05, - "loss": 2.483, + "epoch": 0.55, + "grad_norm": 100.78430938720703, + "learning_rate": 1.6346902062502615e-05, + "loss": 2.5819, "step": 4367 }, { - "epoch": 1.31, - "grad_norm": 19.16243553161621, - "learning_rate": 1.1245865490628447e-05, - "loss": 1.4322, + "epoch": 0.55, + "grad_norm": 117.7176513671875, + "learning_rate": 1.634606534744593e-05, + "loss": 1.7862, "step": 4368 }, { - "epoch": 1.31, - "grad_norm": 16.66512107849121, - "learning_rate": 1.1243860880024055e-05, - "loss": 1.9586, + "epoch": 0.55, + "grad_norm": 9.360919952392578, + "learning_rate": 1.634522863238924e-05, + "loss": 1.8567, "step": 4369 }, { - "epoch": 1.31, - "grad_norm": 17.893173217773438, - "learning_rate": 1.1241856269419667e-05, - "loss": 2.8739, + "epoch": 0.55, + "grad_norm": 12.167970657348633, + "learning_rate": 1.6344391917332553e-05, + "loss": 1.3236, "step": 4370 }, { - "epoch": 1.31, - "grad_norm": 16.915111541748047, - "learning_rate": 1.1239851658815275e-05, - "loss": 1.3351, + "epoch": 0.55, + "grad_norm": 5.873944282531738, + "learning_rate": 1.6343555202275866e-05, + "loss": 0.6095, "step": 4371 }, { - "epoch": 1.31, - "grad_norm": 20.749107360839844, - "learning_rate": 1.1237847048210886e-05, - "loss": 2.3157, + "epoch": 0.55, + "grad_norm": 21.591901779174805, + "learning_rate": 1.6342718487219177e-05, + "loss": 1.4569, "step": 4372 }, { - "epoch": 1.31, - "grad_norm": 33.895973205566406, - "learning_rate": 1.1235842437606497e-05, - "loss": 3.2345, + "epoch": 0.55, + "grad_norm": 6.27478551864624, + "learning_rate": 1.634188177216249e-05, + "loss": 0.9596, "step": 4373 }, { - "epoch": 1.32, - "grad_norm": 16.17300033569336, - "learning_rate": 1.1233837827002106e-05, - "loss": 1.0639, + "epoch": 0.55, + "grad_norm": 28.359424591064453, + "learning_rate": 1.6341045057105804e-05, + "loss": 1.5756, "step": 4374 }, { - "epoch": 1.32, - "grad_norm": 11.130797386169434, - "learning_rate": 1.1231833216397716e-05, - "loss": 2.0059, + "epoch": 0.55, + "grad_norm": 17.1900577545166, + "learning_rate": 1.6340208342049118e-05, + "loss": 1.7516, "step": 4375 }, { - "epoch": 1.32, - "grad_norm": 12.532829284667969, - "learning_rate": 1.1229828605793324e-05, - "loss": 1.1101, + "epoch": 0.55, + "grad_norm": 12.922842979431152, + "learning_rate": 1.6339371626992428e-05, + "loss": 2.1068, "step": 4376 }, { - "epoch": 1.32, - "grad_norm": 10.261481285095215, - "learning_rate": 1.1227823995188936e-05, - "loss": 1.7796, + "epoch": 0.55, + "grad_norm": 24.80573844909668, + "learning_rate": 1.633853491193574e-05, + "loss": 0.9404, "step": 4377 }, { - "epoch": 1.32, - "grad_norm": 18.463781356811523, - "learning_rate": 1.1225819384584546e-05, - "loss": 1.9479, + "epoch": 0.55, + "grad_norm": 42.843414306640625, + "learning_rate": 1.6337698196879055e-05, + "loss": 2.0427, "step": 4378 }, { - "epoch": 1.32, - "grad_norm": 12.4391450881958, - "learning_rate": 1.1223814773980154e-05, - "loss": 2.0404, + "epoch": 0.55, + "grad_norm": 9.03561782836914, + "learning_rate": 1.6336861481822365e-05, + "loss": 1.7024, "step": 4379 }, { - "epoch": 1.32, - "grad_norm": 15.823043823242188, - "learning_rate": 1.1221810163375766e-05, - "loss": 1.4798, + "epoch": 0.55, + "grad_norm": 25.199548721313477, + "learning_rate": 1.633602476676568e-05, + "loss": 1.7186, "step": 4380 }, { - "epoch": 1.32, - "grad_norm": 19.075504302978516, - "learning_rate": 1.1219805552771375e-05, - "loss": 1.712, + "epoch": 0.55, + "grad_norm": 26.4344482421875, + "learning_rate": 1.6335188051708993e-05, + "loss": 3.121, "step": 4381 }, { - "epoch": 1.32, - "grad_norm": 13.849617958068848, - "learning_rate": 1.1217800942166985e-05, - "loss": 1.8711, + "epoch": 0.55, + "grad_norm": 11.396100044250488, + "learning_rate": 1.6334351336652306e-05, + "loss": 1.4493, "step": 4382 }, { - "epoch": 1.32, - "grad_norm": 17.942249298095703, - "learning_rate": 1.1215796331562593e-05, - "loss": 2.6901, + "epoch": 0.55, + "grad_norm": 13.396303176879883, + "learning_rate": 1.6333514621595617e-05, + "loss": 1.7732, "step": 4383 }, { - "epoch": 1.32, - "grad_norm": 9.185188293457031, - "learning_rate": 1.1213791720958205e-05, - "loss": 1.0339, + "epoch": 0.55, + "grad_norm": 48.93893814086914, + "learning_rate": 1.633267790653893e-05, + "loss": 1.8056, "step": 4384 }, { - "epoch": 1.32, - "grad_norm": 12.996100425720215, - "learning_rate": 1.1211787110353815e-05, - "loss": 1.7259, + "epoch": 0.55, + "grad_norm": 11.110664367675781, + "learning_rate": 1.6331841191482244e-05, + "loss": 0.7474, "step": 4385 }, { - "epoch": 1.32, - "grad_norm": 31.83272933959961, - "learning_rate": 1.1209782499749425e-05, - "loss": 2.1627, + "epoch": 0.55, + "grad_norm": 11.114632606506348, + "learning_rate": 1.6331004476425554e-05, + "loss": 3.6277, "step": 4386 }, { - "epoch": 1.32, - "grad_norm": 11.160262107849121, - "learning_rate": 1.1207777889145035e-05, - "loss": 1.7325, + "epoch": 0.55, + "grad_norm": 12.204710006713867, + "learning_rate": 1.6330167761368868e-05, + "loss": 2.2624, "step": 4387 }, { - "epoch": 1.32, - "grad_norm": 19.457317352294922, - "learning_rate": 1.1205773278540644e-05, - "loss": 1.6398, + "epoch": 0.55, + "grad_norm": 5.68466329574585, + "learning_rate": 1.632933104631218e-05, + "loss": 0.5796, "step": 4388 }, { - "epoch": 1.32, - "grad_norm": 15.514799118041992, - "learning_rate": 1.1203768667936255e-05, - "loss": 1.5179, + "epoch": 0.55, + "grad_norm": 8.876830101013184, + "learning_rate": 1.6328494331255492e-05, + "loss": 2.9876, "step": 4389 }, { - "epoch": 1.32, - "grad_norm": 55.15019607543945, - "learning_rate": 1.1201764057331864e-05, - "loss": 4.2946, + "epoch": 0.55, + "grad_norm": 13.933097839355469, + "learning_rate": 1.6327657616198805e-05, + "loss": 2.1435, "step": 4390 }, { - "epoch": 1.32, - "grad_norm": 17.135194778442383, - "learning_rate": 1.1199759446727474e-05, - "loss": 2.0419, + "epoch": 0.55, + "grad_norm": 20.075345993041992, + "learning_rate": 1.632682090114212e-05, + "loss": 2.5147, "step": 4391 }, { - "epoch": 1.32, - "grad_norm": 11.672941207885742, - "learning_rate": 1.1197754836123086e-05, - "loss": 1.4943, + "epoch": 0.55, + "grad_norm": 20.869224548339844, + "learning_rate": 1.632598418608543e-05, + "loss": 2.4408, "step": 4392 }, { - "epoch": 1.32, - "grad_norm": 27.473955154418945, - "learning_rate": 1.1195750225518694e-05, - "loss": 2.4105, + "epoch": 0.55, + "grad_norm": 17.259469985961914, + "learning_rate": 1.6325147471028743e-05, + "loss": 2.3515, "step": 4393 }, { - "epoch": 1.32, - "grad_norm": 12.319040298461914, - "learning_rate": 1.1193745614914304e-05, - "loss": 1.1325, + "epoch": 0.55, + "grad_norm": 7.999358654022217, + "learning_rate": 1.6324310755972053e-05, + "loss": 0.331, "step": 4394 }, { - "epoch": 1.32, - "grad_norm": 19.93411636352539, - "learning_rate": 1.1191741004309912e-05, - "loss": 2.4709, + "epoch": 0.55, + "grad_norm": 18.779560089111328, + "learning_rate": 1.6323474040915367e-05, + "loss": 1.2501, "step": 4395 }, { - "epoch": 1.32, - "grad_norm": 25.42889976501465, - "learning_rate": 1.1189736393705524e-05, - "loss": 2.7937, + "epoch": 0.55, + "grad_norm": 6.969943046569824, + "learning_rate": 1.632263732585868e-05, + "loss": 0.4008, "step": 4396 }, { - "epoch": 1.32, - "grad_norm": 31.769067764282227, - "learning_rate": 1.1187731783101134e-05, - "loss": 2.7676, + "epoch": 0.55, + "grad_norm": 17.99399757385254, + "learning_rate": 1.632180061080199e-05, + "loss": 1.9365, "step": 4397 }, { - "epoch": 1.32, - "grad_norm": 21.329421997070312, - "learning_rate": 1.1185727172496743e-05, - "loss": 2.0521, + "epoch": 0.55, + "grad_norm": 32.63505554199219, + "learning_rate": 1.6320963895745304e-05, + "loss": 3.0524, "step": 4398 }, { - "epoch": 1.32, - "grad_norm": 17.158628463745117, - "learning_rate": 1.1183722561892354e-05, - "loss": 1.5289, + "epoch": 0.55, + "grad_norm": 28.14476203918457, + "learning_rate": 1.6320127180688618e-05, + "loss": 2.3284, "step": 4399 }, { - "epoch": 1.32, - "grad_norm": 13.490479469299316, - "learning_rate": 1.1181717951287963e-05, - "loss": 1.4183, + "epoch": 0.55, + "grad_norm": 12.068603515625, + "learning_rate": 1.6319290465631928e-05, + "loss": 3.1424, "step": 4400 }, { - "epoch": 1.32, - "grad_norm": 8.754962921142578, - "learning_rate": 1.1179713340683573e-05, - "loss": 1.2418, + "epoch": 0.55, + "eval_loss": 0.1459905505180359, + "eval_runtime": 94.1606, + "eval_samples_per_second": 37.617, + "eval_steps_per_second": 37.617, + "step": 4400 + }, + { + "epoch": 0.55, + "grad_norm": 7.126096725463867, + "learning_rate": 1.6318453750575242e-05, + "loss": 2.3201, "step": 4401 }, { - "epoch": 1.32, - "grad_norm": 11.73357105255127, - "learning_rate": 1.1177708730079181e-05, - "loss": 1.3139, + "epoch": 0.55, + "grad_norm": 13.402315139770508, + "learning_rate": 1.6317617035518556e-05, + "loss": 1.3034, "step": 4402 }, { - "epoch": 1.32, - "grad_norm": 13.230740547180176, - "learning_rate": 1.1175704119474793e-05, - "loss": 1.6772, + "epoch": 0.55, + "grad_norm": 11.152032852172852, + "learning_rate": 1.631678032046187e-05, + "loss": 0.9552, "step": 4403 }, { - "epoch": 1.32, - "grad_norm": 17.465307235717773, - "learning_rate": 1.1173699508870403e-05, - "loss": 1.6562, + "epoch": 0.55, + "grad_norm": 20.30156898498535, + "learning_rate": 1.631594360540518e-05, + "loss": 1.2899, "step": 4404 }, { - "epoch": 1.32, - "grad_norm": 32.38056945800781, - "learning_rate": 1.1171694898266012e-05, - "loss": 2.3572, + "epoch": 0.55, + "grad_norm": 28.718151092529297, + "learning_rate": 1.6315106890348493e-05, + "loss": 3.3766, "step": 4405 }, { - "epoch": 1.32, - "grad_norm": 14.467717170715332, - "learning_rate": 1.1169690287661623e-05, - "loss": 1.2874, + "epoch": 0.55, + "grad_norm": 17.650142669677734, + "learning_rate": 1.6314270175291807e-05, + "loss": 2.6818, "step": 4406 }, { - "epoch": 1.33, - "grad_norm": 17.778472900390625, - "learning_rate": 1.1167685677057232e-05, - "loss": 1.9954, + "epoch": 0.55, + "grad_norm": 15.46431827545166, + "learning_rate": 1.6313433460235117e-05, + "loss": 2.8227, "step": 4407 }, { - "epoch": 1.33, - "grad_norm": 8.146272659301758, - "learning_rate": 1.1165681066452842e-05, - "loss": 0.6304, + "epoch": 0.55, + "grad_norm": 31.419979095458984, + "learning_rate": 1.631259674517843e-05, + "loss": 2.973, "step": 4408 }, { - "epoch": 1.33, - "grad_norm": 18.66156005859375, - "learning_rate": 1.1163676455848454e-05, - "loss": 2.0366, + "epoch": 0.55, + "grad_norm": 12.229551315307617, + "learning_rate": 1.6311760030121744e-05, + "loss": 2.2177, "step": 4409 }, { - "epoch": 1.33, - "grad_norm": 13.149731636047363, - "learning_rate": 1.1161671845244062e-05, - "loss": 1.5874, + "epoch": 0.55, + "grad_norm": 16.591360092163086, + "learning_rate": 1.6310923315065058e-05, + "loss": 2.3748, "step": 4410 }, { - "epoch": 1.33, - "grad_norm": 15.526867866516113, - "learning_rate": 1.1159667234639672e-05, - "loss": 1.4642, + "epoch": 0.55, + "grad_norm": 12.206480979919434, + "learning_rate": 1.6310086600008368e-05, + "loss": 2.0689, "step": 4411 }, { - "epoch": 1.33, - "grad_norm": 27.83363151550293, - "learning_rate": 1.1157662624035282e-05, - "loss": 2.0596, + "epoch": 0.55, + "grad_norm": 19.842300415039062, + "learning_rate": 1.6309249884951682e-05, + "loss": 2.4016, "step": 4412 }, { - "epoch": 1.33, - "grad_norm": 82.84793090820312, - "learning_rate": 1.1155658013430892e-05, - "loss": 1.8469, + "epoch": 0.55, + "grad_norm": 33.59919357299805, + "learning_rate": 1.6308413169894996e-05, + "loss": 2.0409, "step": 4413 }, { - "epoch": 1.33, - "grad_norm": 16.59908676147461, - "learning_rate": 1.11536534028265e-05, - "loss": 1.9362, + "epoch": 0.55, + "grad_norm": 14.440958023071289, + "learning_rate": 1.6307576454838306e-05, + "loss": 2.1244, "step": 4414 }, { - "epoch": 1.33, - "grad_norm": 8.759167671203613, - "learning_rate": 1.1151648792222112e-05, - "loss": 2.0726, + "epoch": 0.55, + "grad_norm": 17.500349044799805, + "learning_rate": 1.630673973978162e-05, + "loss": 1.1437, "step": 4415 }, { - "epoch": 1.33, - "grad_norm": 32.26197814941406, - "learning_rate": 1.1149644181617723e-05, - "loss": 2.0811, + "epoch": 0.55, + "grad_norm": 9.321260452270508, + "learning_rate": 1.6305903024724933e-05, + "loss": 1.6508, "step": 4416 }, { - "epoch": 1.33, - "grad_norm": 8.401446342468262, - "learning_rate": 1.1147639571013331e-05, - "loss": 1.3336, + "epoch": 0.55, + "grad_norm": 15.433318138122559, + "learning_rate": 1.6305066309668243e-05, + "loss": 1.8476, "step": 4417 }, { - "epoch": 1.33, - "grad_norm": 10.307430267333984, - "learning_rate": 1.1145634960408943e-05, - "loss": 1.1188, + "epoch": 0.55, + "grad_norm": 10.021492958068848, + "learning_rate": 1.6304229594611557e-05, + "loss": 1.6934, "step": 4418 }, { - "epoch": 1.33, - "grad_norm": 16.372766494750977, - "learning_rate": 1.1143630349804551e-05, - "loss": 1.8886, + "epoch": 0.55, + "grad_norm": 17.209562301635742, + "learning_rate": 1.630339287955487e-05, + "loss": 1.5899, "step": 4419 }, { - "epoch": 1.33, - "grad_norm": 39.85802459716797, - "learning_rate": 1.1141625739200161e-05, - "loss": 2.3996, + "epoch": 0.55, + "grad_norm": 4.5861005783081055, + "learning_rate": 1.630255616449818e-05, + "loss": 0.5776, "step": 4420 }, { - "epoch": 1.33, - "grad_norm": 14.044843673706055, - "learning_rate": 1.1139621128595773e-05, - "loss": 1.4335, + "epoch": 0.55, + "grad_norm": 15.866487503051758, + "learning_rate": 1.6301719449441495e-05, + "loss": 2.145, "step": 4421 }, { - "epoch": 1.33, - "grad_norm": 23.040878295898438, - "learning_rate": 1.1137616517991381e-05, - "loss": 1.8451, + "epoch": 0.55, + "grad_norm": 12.779694557189941, + "learning_rate": 1.6300882734384805e-05, + "loss": 1.1097, "step": 4422 }, { - "epoch": 1.33, - "grad_norm": 17.075387954711914, - "learning_rate": 1.1135611907386991e-05, - "loss": 2.2248, + "epoch": 0.56, + "grad_norm": 9.33056926727295, + "learning_rate": 1.630004601932812e-05, + "loss": 2.8792, "step": 4423 }, { - "epoch": 1.33, - "grad_norm": 110.57589721679688, - "learning_rate": 1.11336072967826e-05, - "loss": 1.5452, + "epoch": 0.56, + "grad_norm": 15.881376266479492, + "learning_rate": 1.6299209304271432e-05, + "loss": 1.7005, "step": 4424 }, { - "epoch": 1.33, - "grad_norm": 15.424068450927734, - "learning_rate": 1.1131602686178212e-05, - "loss": 1.5217, + "epoch": 0.56, + "grad_norm": 12.460254669189453, + "learning_rate": 1.6298372589214742e-05, + "loss": 1.3359, "step": 4425 }, { - "epoch": 1.33, - "grad_norm": 23.823379516601562, - "learning_rate": 1.112959807557382e-05, - "loss": 1.2601, + "epoch": 0.56, + "grad_norm": 10.446908950805664, + "learning_rate": 1.6297535874158056e-05, + "loss": 2.2929, "step": 4426 }, { - "epoch": 1.33, - "grad_norm": 10.928523063659668, - "learning_rate": 1.112759346496943e-05, - "loss": 1.4933, + "epoch": 0.56, + "grad_norm": 14.90433406829834, + "learning_rate": 1.629669915910137e-05, + "loss": 1.2793, "step": 4427 }, { - "epoch": 1.33, - "grad_norm": 13.319860458374023, - "learning_rate": 1.1125588854365042e-05, - "loss": 1.4411, + "epoch": 0.56, + "grad_norm": 21.37445068359375, + "learning_rate": 1.629586244404468e-05, + "loss": 3.7241, "step": 4428 }, { - "epoch": 1.33, - "grad_norm": 11.283712387084961, - "learning_rate": 1.112358424376065e-05, - "loss": 2.1932, + "epoch": 0.56, + "grad_norm": 3.737511396408081, + "learning_rate": 1.6295025728987994e-05, + "loss": 0.4612, "step": 4429 }, { - "epoch": 1.33, - "grad_norm": 7.913817882537842, - "learning_rate": 1.112157963315626e-05, - "loss": 1.3824, + "epoch": 0.56, + "grad_norm": 15.169925689697266, + "learning_rate": 1.6294189013931307e-05, + "loss": 3.6944, "step": 4430 }, { - "epoch": 1.33, - "grad_norm": 10.032218933105469, - "learning_rate": 1.1119575022551869e-05, - "loss": 1.2246, + "epoch": 0.56, + "grad_norm": 35.48377990722656, + "learning_rate": 1.629335229887462e-05, + "loss": 2.125, "step": 4431 }, { - "epoch": 1.33, - "grad_norm": 25.357025146484375, - "learning_rate": 1.111757041194748e-05, - "loss": 1.7075, + "epoch": 0.56, + "grad_norm": 11.017925262451172, + "learning_rate": 1.629251558381793e-05, + "loss": 1.7788, "step": 4432 }, { - "epoch": 1.33, - "grad_norm": 16.777484893798828, - "learning_rate": 1.1115565801343089e-05, - "loss": 1.6018, + "epoch": 0.56, + "grad_norm": 18.080007553100586, + "learning_rate": 1.6291678868761245e-05, + "loss": 3.4252, "step": 4433 }, { - "epoch": 1.33, - "grad_norm": 27.090877532958984, - "learning_rate": 1.1113561190738699e-05, - "loss": 2.4302, + "epoch": 0.56, + "grad_norm": 13.970507621765137, + "learning_rate": 1.629084215370456e-05, + "loss": 1.3873, "step": 4434 }, { - "epoch": 1.33, - "grad_norm": 24.513057708740234, - "learning_rate": 1.111155658013431e-05, - "loss": 2.2252, + "epoch": 0.56, + "grad_norm": 17.634294509887695, + "learning_rate": 1.629000543864787e-05, + "loss": 2.8027, "step": 4435 }, { - "epoch": 1.33, - "grad_norm": 20.253862380981445, - "learning_rate": 1.1109551969529919e-05, - "loss": 1.4782, + "epoch": 0.56, + "grad_norm": 37.78720474243164, + "learning_rate": 1.6289168723591182e-05, + "loss": 1.2082, "step": 4436 }, { - "epoch": 1.33, - "grad_norm": 10.198271751403809, - "learning_rate": 1.1107547358925531e-05, - "loss": 2.0537, + "epoch": 0.56, + "grad_norm": 13.17506217956543, + "learning_rate": 1.6288332008534496e-05, + "loss": 0.7018, "step": 4437 }, { - "epoch": 1.33, - "grad_norm": 16.26251792907715, - "learning_rate": 1.110554274832114e-05, - "loss": 1.6344, + "epoch": 0.56, + "grad_norm": 21.82538414001465, + "learning_rate": 1.628749529347781e-05, + "loss": 1.4864, "step": 4438 }, { - "epoch": 1.33, - "grad_norm": 17.53883934020996, - "learning_rate": 1.110353813771675e-05, - "loss": 1.4399, + "epoch": 0.56, + "grad_norm": 18.28236961364746, + "learning_rate": 1.628665857842112e-05, + "loss": 1.2984, "step": 4439 }, { - "epoch": 1.33, - "grad_norm": 9.351251602172852, - "learning_rate": 1.1101533527112361e-05, - "loss": 1.2029, - "step": 4440 - }, - { - "epoch": 1.33, - "eval_loss": 0.22466596961021423, - "eval_runtime": 43.3151, - "eval_samples_per_second": 34.145, - "eval_steps_per_second": 34.145, + "epoch": 0.56, + "grad_norm": 10.602968215942383, + "learning_rate": 1.6285821863364434e-05, + "loss": 1.3774, "step": 4440 }, { - "epoch": 1.34, - "grad_norm": 18.78108787536621, - "learning_rate": 1.109952891650797e-05, - "loss": 1.8896, + "epoch": 0.56, + "grad_norm": 14.304097175598145, + "learning_rate": 1.6284985148307747e-05, + "loss": 1.4304, "step": 4441 }, { - "epoch": 1.34, - "grad_norm": 14.223852157592773, - "learning_rate": 1.109752430590358e-05, - "loss": 1.8473, + "epoch": 0.56, + "grad_norm": 13.172176361083984, + "learning_rate": 1.6284148433251058e-05, + "loss": 1.0238, "step": 4442 }, { - "epoch": 1.34, - "grad_norm": 29.34437370300293, - "learning_rate": 1.1095519695299188e-05, - "loss": 1.8663, + "epoch": 0.56, + "grad_norm": 16.124061584472656, + "learning_rate": 1.628331171819437e-05, + "loss": 0.9346, "step": 4443 }, { - "epoch": 1.34, - "grad_norm": 11.203985214233398, - "learning_rate": 1.10935150846948e-05, - "loss": 1.4461, + "epoch": 0.56, + "grad_norm": 7.68695592880249, + "learning_rate": 1.6282475003137685e-05, + "loss": 1.2211, "step": 4444 }, { - "epoch": 1.34, - "grad_norm": 13.125014305114746, - "learning_rate": 1.1091510474090408e-05, - "loss": 1.3763, + "epoch": 0.56, + "grad_norm": 18.085830688476562, + "learning_rate": 1.6281638288080995e-05, + "loss": 1.9951, "step": 4445 }, { - "epoch": 1.34, - "grad_norm": 17.99578094482422, - "learning_rate": 1.1089505863486018e-05, - "loss": 2.1525, + "epoch": 0.56, + "grad_norm": 12.31620979309082, + "learning_rate": 1.628080157302431e-05, + "loss": 1.1265, "step": 4446 }, { - "epoch": 1.34, - "grad_norm": 36.31547927856445, - "learning_rate": 1.108750125288163e-05, - "loss": 2.2111, + "epoch": 0.56, + "grad_norm": 9.333024024963379, + "learning_rate": 1.627996485796762e-05, + "loss": 2.1302, "step": 4447 }, { - "epoch": 1.34, - "grad_norm": 13.750075340270996, - "learning_rate": 1.1085496642277238e-05, - "loss": 1.3206, + "epoch": 0.56, + "grad_norm": 10.498042106628418, + "learning_rate": 1.6279128142910933e-05, + "loss": 0.7364, "step": 4448 }, { - "epoch": 1.34, - "grad_norm": 15.520805358886719, - "learning_rate": 1.1083492031672849e-05, - "loss": 3.3127, + "epoch": 0.56, + "grad_norm": 20.063735961914062, + "learning_rate": 1.6278291427854246e-05, + "loss": 3.5866, "step": 4449 }, { - "epoch": 1.34, - "grad_norm": 16.05813217163086, - "learning_rate": 1.1081487421068457e-05, - "loss": 1.5862, + "epoch": 0.56, + "grad_norm": 13.666686058044434, + "learning_rate": 1.6277454712797557e-05, + "loss": 2.8051, "step": 4450 }, { - "epoch": 1.34, - "grad_norm": 18.96892738342285, - "learning_rate": 1.1079482810464069e-05, - "loss": 1.4905, + "epoch": 0.56, + "grad_norm": 13.316627502441406, + "learning_rate": 1.627661799774087e-05, + "loss": 1.6376, "step": 4451 }, { - "epoch": 1.34, - "grad_norm": 11.585920333862305, - "learning_rate": 1.1077478199859679e-05, - "loss": 2.2658, + "epoch": 0.56, + "grad_norm": 11.108987808227539, + "learning_rate": 1.6275781282684184e-05, + "loss": 2.7904, "step": 4452 }, { - "epoch": 1.34, - "grad_norm": 21.121341705322266, - "learning_rate": 1.1075473589255287e-05, - "loss": 0.8119, + "epoch": 0.56, + "grad_norm": 49.902618408203125, + "learning_rate": 1.6274944567627494e-05, + "loss": 2.4332, "step": 4453 }, { - "epoch": 1.34, - "grad_norm": 39.30922317504883, - "learning_rate": 1.1073468978650899e-05, - "loss": 2.4467, + "epoch": 0.56, + "grad_norm": 18.142854690551758, + "learning_rate": 1.6274107852570808e-05, + "loss": 3.3683, "step": 4454 }, { - "epoch": 1.34, - "grad_norm": 15.98094654083252, - "learning_rate": 1.1071464368046507e-05, - "loss": 2.3305, + "epoch": 0.56, + "grad_norm": 31.451210021972656, + "learning_rate": 1.627327113751412e-05, + "loss": 1.4711, "step": 4455 }, { - "epoch": 1.34, - "grad_norm": 9.627312660217285, - "learning_rate": 1.1069459757442117e-05, - "loss": 1.2344, + "epoch": 0.56, + "grad_norm": 12.135986328125, + "learning_rate": 1.627243442245743e-05, + "loss": 2.2472, "step": 4456 }, { - "epoch": 1.34, - "grad_norm": 14.245975494384766, - "learning_rate": 1.1067455146837727e-05, - "loss": 0.9343, + "epoch": 0.56, + "grad_norm": 12.158326148986816, + "learning_rate": 1.6271597707400745e-05, + "loss": 1.5944, "step": 4457 }, { - "epoch": 1.34, - "grad_norm": 29.139923095703125, - "learning_rate": 1.1065450536233338e-05, - "loss": 2.0948, + "epoch": 0.56, + "grad_norm": 39.65235137939453, + "learning_rate": 1.627076099234406e-05, + "loss": 2.0681, "step": 4458 }, { - "epoch": 1.34, - "grad_norm": 23.93746566772461, - "learning_rate": 1.1063445925628948e-05, - "loss": 1.8753, + "epoch": 0.56, + "grad_norm": 19.438718795776367, + "learning_rate": 1.6269924277287373e-05, + "loss": 2.1166, "step": 4459 }, { - "epoch": 1.34, - "grad_norm": 8.223407745361328, - "learning_rate": 1.1061441315024558e-05, - "loss": 1.0049, + "epoch": 0.56, + "grad_norm": 7.061102390289307, + "learning_rate": 1.6269087562230683e-05, + "loss": 0.2029, "step": 4460 }, { - "epoch": 1.34, - "grad_norm": 28.680850982666016, - "learning_rate": 1.1059436704420168e-05, - "loss": 1.8643, + "epoch": 0.56, + "grad_norm": 7.9185919761657715, + "learning_rate": 1.6268250847173997e-05, + "loss": 0.6045, "step": 4461 }, { - "epoch": 1.34, - "grad_norm": 27.831514358520508, - "learning_rate": 1.1057432093815776e-05, - "loss": 1.8748, + "epoch": 0.56, + "grad_norm": 24.92377281188965, + "learning_rate": 1.626741413211731e-05, + "loss": 2.6631, "step": 4462 }, { - "epoch": 1.34, - "grad_norm": 17.656972885131836, - "learning_rate": 1.1055427483211388e-05, - "loss": 1.8081, + "epoch": 0.56, + "grad_norm": 14.203178405761719, + "learning_rate": 1.626657741706062e-05, + "loss": 0.8789, "step": 4463 }, { - "epoch": 1.34, - "grad_norm": 12.41053581237793, - "learning_rate": 1.1053422872606998e-05, - "loss": 1.6312, + "epoch": 0.56, + "grad_norm": 14.638683319091797, + "learning_rate": 1.6265740702003934e-05, + "loss": 2.8277, "step": 4464 }, { - "epoch": 1.34, - "grad_norm": 14.747664451599121, - "learning_rate": 1.1051418262002606e-05, - "loss": 1.8963, + "epoch": 0.56, + "grad_norm": 8.197480201721191, + "learning_rate": 1.6264903986947248e-05, + "loss": 2.014, "step": 4465 }, { - "epoch": 1.34, - "grad_norm": 20.91558074951172, - "learning_rate": 1.1049413651398218e-05, - "loss": 1.0479, + "epoch": 0.56, + "grad_norm": 55.8455924987793, + "learning_rate": 1.6264067271890558e-05, + "loss": 3.2082, "step": 4466 }, { - "epoch": 1.34, - "grad_norm": 18.06478500366211, - "learning_rate": 1.1047409040793827e-05, - "loss": 1.513, + "epoch": 0.56, + "grad_norm": 45.411041259765625, + "learning_rate": 1.626323055683387e-05, + "loss": 2.6712, "step": 4467 }, { - "epoch": 1.34, - "grad_norm": 25.323034286499023, - "learning_rate": 1.1045404430189437e-05, - "loss": 2.1898, + "epoch": 0.56, + "grad_norm": 9.196112632751465, + "learning_rate": 1.6262393841777185e-05, + "loss": 2.336, "step": 4468 }, { - "epoch": 1.34, - "grad_norm": 10.666196823120117, - "learning_rate": 1.1043399819585045e-05, - "loss": 1.6848, + "epoch": 0.56, + "grad_norm": 7.315990924835205, + "learning_rate": 1.62615571267205e-05, + "loss": 1.1812, "step": 4469 }, { - "epoch": 1.34, - "grad_norm": 13.26004695892334, - "learning_rate": 1.1041395208980657e-05, - "loss": 1.3236, + "epoch": 0.56, + "grad_norm": 17.87982749938965, + "learning_rate": 1.626072041166381e-05, + "loss": 1.2415, "step": 4470 }, { - "epoch": 1.34, - "grad_norm": 11.206985473632812, - "learning_rate": 1.1039390598376267e-05, - "loss": 1.2008, + "epoch": 0.56, + "grad_norm": 9.346860885620117, + "learning_rate": 1.6259883696607123e-05, + "loss": 1.159, "step": 4471 }, { - "epoch": 1.34, - "grad_norm": 22.27272605895996, - "learning_rate": 1.1037385987771875e-05, - "loss": 1.5483, + "epoch": 0.56, + "grad_norm": 9.384839057922363, + "learning_rate": 1.6259046981550436e-05, + "loss": 2.3701, "step": 4472 }, { - "epoch": 1.34, - "grad_norm": 13.044824600219727, - "learning_rate": 1.1035381377167487e-05, - "loss": 1.4811, + "epoch": 0.56, + "grad_norm": 16.067934036254883, + "learning_rate": 1.6258210266493747e-05, + "loss": 2.8654, "step": 4473 }, { - "epoch": 1.35, - "grad_norm": 15.387162208557129, - "learning_rate": 1.1033376766563096e-05, - "loss": 1.7151, + "epoch": 0.56, + "grad_norm": 6.0859880447387695, + "learning_rate": 1.625737355143706e-05, + "loss": 2.1036, "step": 4474 }, { - "epoch": 1.35, - "grad_norm": 20.35818862915039, - "learning_rate": 1.1031372155958706e-05, - "loss": 1.6009, + "epoch": 0.56, + "grad_norm": 6.0310750007629395, + "learning_rate": 1.625653683638037e-05, + "loss": 0.6741, "step": 4475 }, { - "epoch": 1.35, - "grad_norm": 12.326359748840332, - "learning_rate": 1.1029367545354314e-05, - "loss": 1.7638, + "epoch": 0.56, + "grad_norm": 42.43614959716797, + "learning_rate": 1.6255700121323684e-05, + "loss": 3.6709, "step": 4476 }, { - "epoch": 1.35, - "grad_norm": 29.4976863861084, - "learning_rate": 1.1027362934749926e-05, - "loss": 1.8569, + "epoch": 0.56, + "grad_norm": 7.449599266052246, + "learning_rate": 1.6254863406266998e-05, + "loss": 2.0419, "step": 4477 }, { - "epoch": 1.35, - "grad_norm": 11.03281307220459, - "learning_rate": 1.1025358324145536e-05, - "loss": 1.0993, + "epoch": 0.56, + "grad_norm": 18.67700958251953, + "learning_rate": 1.6254026691210308e-05, + "loss": 1.4978, "step": 4478 }, { - "epoch": 1.35, - "grad_norm": 20.460525512695312, - "learning_rate": 1.1023353713541144e-05, - "loss": 1.4282, + "epoch": 0.56, + "grad_norm": 14.114124298095703, + "learning_rate": 1.6253189976153622e-05, + "loss": 1.6081, "step": 4479 }, { - "epoch": 1.35, - "grad_norm": 11.63111400604248, - "learning_rate": 1.1021349102936756e-05, - "loss": 2.2856, + "epoch": 0.56, + "grad_norm": 17.91291046142578, + "learning_rate": 1.6252353261096932e-05, + "loss": 2.4313, "step": 4480 }, { - "epoch": 1.35, - "grad_norm": 17.048723220825195, - "learning_rate": 1.1019344492332364e-05, - "loss": 2.037, + "epoch": 0.56, + "grad_norm": 19.789363861083984, + "learning_rate": 1.6251516546040246e-05, + "loss": 2.3497, "step": 4481 }, { - "epoch": 1.35, - "grad_norm": 7.7821502685546875, - "learning_rate": 1.1017339881727975e-05, - "loss": 1.2083, + "epoch": 0.56, + "grad_norm": 5.772512435913086, + "learning_rate": 1.625067983098356e-05, + "loss": 0.6537, "step": 4482 }, { - "epoch": 1.35, - "grad_norm": 16.613143920898438, - "learning_rate": 1.1015335271123586e-05, - "loss": 1.3636, + "epoch": 0.56, + "grad_norm": 11.66680908203125, + "learning_rate": 1.6249843115926873e-05, + "loss": 1.1615, "step": 4483 }, { - "epoch": 1.35, - "grad_norm": 15.978687286376953, - "learning_rate": 1.1013330660519195e-05, - "loss": 1.5447, + "epoch": 0.56, + "grad_norm": 22.826932907104492, + "learning_rate": 1.6249006400870183e-05, + "loss": 1.9935, "step": 4484 }, { - "epoch": 1.35, - "grad_norm": 12.686722755432129, - "learning_rate": 1.1011326049914805e-05, - "loss": 1.308, + "epoch": 0.56, + "grad_norm": 20.87784194946289, + "learning_rate": 1.6248169685813497e-05, + "loss": 0.682, "step": 4485 }, { - "epoch": 1.35, - "grad_norm": 15.303074836730957, - "learning_rate": 1.1009321439310415e-05, - "loss": 2.4245, + "epoch": 0.56, + "grad_norm": 22.138505935668945, + "learning_rate": 1.624733297075681e-05, + "loss": 0.5427, "step": 4486 }, { - "epoch": 1.35, - "grad_norm": 12.841423988342285, - "learning_rate": 1.1007316828706025e-05, - "loss": 2.0611, + "epoch": 0.56, + "grad_norm": 25.351593017578125, + "learning_rate": 1.624649625570012e-05, + "loss": 1.4485, "step": 4487 }, { - "epoch": 1.35, - "grad_norm": 14.324323654174805, - "learning_rate": 1.1005312218101633e-05, - "loss": 1.9151, + "epoch": 0.56, + "grad_norm": 18.775243759155273, + "learning_rate": 1.6245659540643435e-05, + "loss": 1.9783, "step": 4488 }, { - "epoch": 1.35, - "grad_norm": 10.316316604614258, - "learning_rate": 1.1003307607497245e-05, - "loss": 1.8989, + "epoch": 0.56, + "grad_norm": 13.692991256713867, + "learning_rate": 1.6244822825586748e-05, + "loss": 2.1927, "step": 4489 }, { - "epoch": 1.35, - "grad_norm": 11.872175216674805, - "learning_rate": 1.1001302996892855e-05, - "loss": 1.4971, + "epoch": 0.56, + "grad_norm": 45.6162109375, + "learning_rate": 1.6243986110530062e-05, + "loss": 2.8494, "step": 4490 }, { - "epoch": 1.35, - "grad_norm": 13.584701538085938, - "learning_rate": 1.0999298386288464e-05, - "loss": 1.6093, + "epoch": 0.56, + "grad_norm": 18.776546478271484, + "learning_rate": 1.6243149395473372e-05, + "loss": 1.6298, "step": 4491 }, { - "epoch": 1.35, - "grad_norm": 23.555721282958984, - "learning_rate": 1.0997293775684075e-05, - "loss": 1.4708, + "epoch": 0.56, + "grad_norm": 8.89008903503418, + "learning_rate": 1.6242312680416686e-05, + "loss": 0.6049, "step": 4492 }, { - "epoch": 1.35, - "grad_norm": 13.753824234008789, - "learning_rate": 1.0995289165079684e-05, - "loss": 1.4468, + "epoch": 0.56, + "grad_norm": 14.663054466247559, + "learning_rate": 1.624147596536e-05, + "loss": 2.9699, "step": 4493 }, { - "epoch": 1.35, - "grad_norm": 23.030887603759766, - "learning_rate": 1.0993284554475294e-05, - "loss": 1.9327, + "epoch": 0.56, + "grad_norm": 14.046426773071289, + "learning_rate": 1.624063925030331e-05, + "loss": 2.0183, "step": 4494 }, { - "epoch": 1.35, - "grad_norm": 43.81611251831055, - "learning_rate": 1.0991279943870906e-05, - "loss": 2.3426, + "epoch": 0.56, + "grad_norm": 26.707117080688477, + "learning_rate": 1.6239802535246623e-05, + "loss": 1.9285, "step": 4495 }, { - "epoch": 1.35, - "grad_norm": 22.91530418395996, - "learning_rate": 1.0989275333266514e-05, - "loss": 2.1606, + "epoch": 0.56, + "grad_norm": 6.997550964355469, + "learning_rate": 1.6238965820189937e-05, + "loss": 1.8572, "step": 4496 }, { - "epoch": 1.35, - "grad_norm": 30.3094425201416, - "learning_rate": 1.0987270722662124e-05, - "loss": 2.3948, + "epoch": 0.56, + "grad_norm": 7.119721412658691, + "learning_rate": 1.623812910513325e-05, + "loss": 1.4935, "step": 4497 }, { - "epoch": 1.35, - "grad_norm": 90.9547348022461, - "learning_rate": 1.0985266112057732e-05, - "loss": 2.5067, + "epoch": 0.56, + "grad_norm": 27.102155685424805, + "learning_rate": 1.623729239007656e-05, + "loss": 2.3526, "step": 4498 }, { - "epoch": 1.35, - "grad_norm": 11.258960723876953, - "learning_rate": 1.0983261501453344e-05, - "loss": 2.0156, + "epoch": 0.56, + "grad_norm": 11.541977882385254, + "learning_rate": 1.6236455675019875e-05, + "loss": 1.8764, "step": 4499 }, { - "epoch": 1.35, - "grad_norm": 66.39472961425781, - "learning_rate": 1.0981256890848953e-05, - "loss": 2.6879, + "epoch": 0.56, + "grad_norm": 12.703827857971191, + "learning_rate": 1.6235618959963185e-05, + "loss": 3.0699, "step": 4500 }, { - "epoch": 1.35, - "grad_norm": 20.91155242919922, - "learning_rate": 1.0979252280244563e-05, - "loss": 1.7934, + "epoch": 0.56, + "grad_norm": 11.643512725830078, + "learning_rate": 1.62347822449065e-05, + "loss": 1.1002, "step": 4501 }, { - "epoch": 1.35, - "grad_norm": 18.402379989624023, - "learning_rate": 1.0977247669640175e-05, - "loss": 1.3445, + "epoch": 0.56, + "grad_norm": 9.464075088500977, + "learning_rate": 1.6233945529849812e-05, + "loss": 1.2133, "step": 4502 }, { - "epoch": 1.35, - "grad_norm": 15.267261505126953, - "learning_rate": 1.0975243059035783e-05, - "loss": 1.642, + "epoch": 0.57, + "grad_norm": 17.377323150634766, + "learning_rate": 1.6233108814793122e-05, + "loss": 1.7252, "step": 4503 }, { - "epoch": 1.35, - "grad_norm": 50.35695266723633, - "learning_rate": 1.0973238448431393e-05, - "loss": 2.0173, + "epoch": 0.57, + "grad_norm": 14.55684757232666, + "learning_rate": 1.6232272099736436e-05, + "loss": 2.0068, "step": 4504 }, { - "epoch": 1.35, - "grad_norm": 10.630228042602539, - "learning_rate": 1.0971233837827003e-05, - "loss": 1.9372, + "epoch": 0.57, + "grad_norm": 14.520007133483887, + "learning_rate": 1.6231435384679746e-05, + "loss": 1.9361, "step": 4505 }, { - "epoch": 1.35, - "grad_norm": 32.916053771972656, - "learning_rate": 1.0969229227222613e-05, - "loss": 2.2776, + "epoch": 0.57, + "grad_norm": 12.38199520111084, + "learning_rate": 1.623059866962306e-05, + "loss": 1.497, "step": 4506 }, { - "epoch": 1.36, - "grad_norm": 22.993738174438477, - "learning_rate": 1.0967224616618223e-05, - "loss": 2.2662, + "epoch": 0.57, + "grad_norm": 11.052742958068848, + "learning_rate": 1.6229761954566374e-05, + "loss": 1.7926, "step": 4507 }, { - "epoch": 1.36, - "grad_norm": 13.480685234069824, - "learning_rate": 1.0965220006013833e-05, - "loss": 1.4548, + "epoch": 0.57, + "grad_norm": 13.624369621276855, + "learning_rate": 1.6228925239509684e-05, + "loss": 2.5462, "step": 4508 }, { - "epoch": 1.36, - "grad_norm": 22.65932273864746, - "learning_rate": 1.0963215395409443e-05, - "loss": 2.1417, + "epoch": 0.57, + "grad_norm": 12.981353759765625, + "learning_rate": 1.6228088524452997e-05, + "loss": 0.9325, "step": 4509 }, { - "epoch": 1.36, - "grad_norm": 9.409554481506348, - "learning_rate": 1.0961210784805052e-05, - "loss": 1.1043, + "epoch": 0.57, + "grad_norm": 16.13641357421875, + "learning_rate": 1.622725180939631e-05, + "loss": 1.8671, "step": 4510 }, { - "epoch": 1.36, - "grad_norm": 51.40016555786133, - "learning_rate": 1.0959206174200664e-05, - "loss": 2.5073, + "epoch": 0.57, + "grad_norm": 14.619405746459961, + "learning_rate": 1.6226415094339625e-05, + "loss": 1.7138, "step": 4511 }, { - "epoch": 1.36, - "grad_norm": 32.164634704589844, - "learning_rate": 1.0957201563596272e-05, - "loss": 1.7867, + "epoch": 0.57, + "grad_norm": 17.65199089050293, + "learning_rate": 1.6225578379282935e-05, + "loss": 4.2354, "step": 4512 }, { - "epoch": 1.36, - "grad_norm": 29.420494079589844, - "learning_rate": 1.0955196952991882e-05, - "loss": 1.7466, + "epoch": 0.57, + "grad_norm": 26.615278244018555, + "learning_rate": 1.622474166422625e-05, + "loss": 3.0798, "step": 4513 }, { - "epoch": 1.36, - "grad_norm": 10.119473457336426, - "learning_rate": 1.0953192342387494e-05, - "loss": 1.3604, + "epoch": 0.57, + "grad_norm": 9.01013469696045, + "learning_rate": 1.6223904949169562e-05, + "loss": 1.7211, "step": 4514 }, { - "epoch": 1.36, - "grad_norm": 10.170442581176758, - "learning_rate": 1.0951187731783102e-05, - "loss": 1.2242, + "epoch": 0.57, + "grad_norm": 21.100177764892578, + "learning_rate": 1.6223068234112873e-05, + "loss": 4.2199, "step": 4515 }, { - "epoch": 1.36, - "grad_norm": 11.852310180664062, - "learning_rate": 1.0949183121178712e-05, - "loss": 1.2755, + "epoch": 0.57, + "grad_norm": 13.066444396972656, + "learning_rate": 1.6222231519056186e-05, + "loss": 0.5538, "step": 4516 }, { - "epoch": 1.36, - "grad_norm": 14.793402671813965, - "learning_rate": 1.094717851057432e-05, - "loss": 1.752, + "epoch": 0.57, + "grad_norm": 11.702042579650879, + "learning_rate": 1.62213948039995e-05, + "loss": 1.5223, "step": 4517 }, { - "epoch": 1.36, - "grad_norm": 42.632144927978516, - "learning_rate": 1.0945173899969932e-05, - "loss": 1.7613, + "epoch": 0.57, + "grad_norm": 17.563871383666992, + "learning_rate": 1.6220558088942814e-05, + "loss": 1.169, "step": 4518 }, { - "epoch": 1.36, - "grad_norm": 19.947559356689453, - "learning_rate": 1.0943169289365541e-05, - "loss": 1.9094, + "epoch": 0.57, + "grad_norm": 15.5070161819458, + "learning_rate": 1.6219721373886124e-05, + "loss": 1.4809, "step": 4519 }, { - "epoch": 1.36, - "grad_norm": 15.529866218566895, - "learning_rate": 1.0941164678761151e-05, - "loss": 1.3777, + "epoch": 0.57, + "grad_norm": 13.678018569946289, + "learning_rate": 1.6218884658829437e-05, + "loss": 2.9076, "step": 4520 }, { - "epoch": 1.36, - "grad_norm": 21.871538162231445, - "learning_rate": 1.0939160068156763e-05, - "loss": 1.91, + "epoch": 0.57, + "grad_norm": 8.950277328491211, + "learning_rate": 1.621804794377275e-05, + "loss": 2.0884, "step": 4521 }, { - "epoch": 1.36, - "grad_norm": 18.05958366394043, - "learning_rate": 1.0937155457552371e-05, - "loss": 1.6152, + "epoch": 0.57, + "grad_norm": 11.462152481079102, + "learning_rate": 1.621721122871606e-05, + "loss": 0.8893, "step": 4522 }, { - "epoch": 1.36, - "grad_norm": 13.304652214050293, - "learning_rate": 1.0935150846947981e-05, - "loss": 1.4861, + "epoch": 0.57, + "grad_norm": 47.0164680480957, + "learning_rate": 1.6216374513659375e-05, + "loss": 2.5958, "step": 4523 }, { - "epoch": 1.36, - "grad_norm": 72.59398651123047, - "learning_rate": 1.093314623634359e-05, - "loss": 2.0853, + "epoch": 0.57, + "grad_norm": 13.051041603088379, + "learning_rate": 1.621553779860269e-05, + "loss": 2.353, "step": 4524 }, { - "epoch": 1.36, - "grad_norm": 11.113508224487305, - "learning_rate": 1.0931141625739201e-05, - "loss": 1.5427, + "epoch": 0.57, + "grad_norm": 6.730961322784424, + "learning_rate": 1.6214701083546002e-05, + "loss": 0.7855, "step": 4525 }, { - "epoch": 1.36, - "grad_norm": 11.754325866699219, - "learning_rate": 1.0929137015134811e-05, - "loss": 1.8286, + "epoch": 0.57, + "grad_norm": 14.580228805541992, + "learning_rate": 1.6213864368489313e-05, + "loss": 3.6427, "step": 4526 }, { - "epoch": 1.36, - "grad_norm": 16.077978134155273, - "learning_rate": 1.092713240453042e-05, - "loss": 1.3027, + "epoch": 0.57, + "grad_norm": 12.631464004516602, + "learning_rate": 1.6213027653432626e-05, + "loss": 3.1712, "step": 4527 }, { - "epoch": 1.36, - "grad_norm": 14.799029350280762, - "learning_rate": 1.0925127793926032e-05, - "loss": 1.222, + "epoch": 0.57, + "grad_norm": 9.510821342468262, + "learning_rate": 1.6212190938375936e-05, + "loss": 1.8789, "step": 4528 }, { - "epoch": 1.36, - "grad_norm": 12.316436767578125, - "learning_rate": 1.092312318332164e-05, - "loss": 1.5382, + "epoch": 0.57, + "grad_norm": 18.531097412109375, + "learning_rate": 1.621135422331925e-05, + "loss": 2.4732, "step": 4529 }, { - "epoch": 1.36, - "grad_norm": 11.659561157226562, - "learning_rate": 1.092111857271725e-05, - "loss": 1.0118, + "epoch": 0.57, + "grad_norm": 25.47800636291504, + "learning_rate": 1.6210517508262564e-05, + "loss": 2.4834, "step": 4530 }, { - "epoch": 1.36, - "grad_norm": 10.741880416870117, - "learning_rate": 1.091911396211286e-05, - "loss": 1.1201, + "epoch": 0.57, + "grad_norm": 22.9942626953125, + "learning_rate": 1.6209680793205874e-05, + "loss": 1.6036, "step": 4531 }, { - "epoch": 1.36, - "grad_norm": 17.27454376220703, - "learning_rate": 1.091710935150847e-05, - "loss": 1.8521, + "epoch": 0.57, + "grad_norm": 24.319677352905273, + "learning_rate": 1.6208844078149188e-05, + "loss": 1.4908, "step": 4532 }, { - "epoch": 1.36, - "grad_norm": 19.821794509887695, - "learning_rate": 1.091510474090408e-05, - "loss": 1.1864, + "epoch": 0.57, + "grad_norm": 25.627683639526367, + "learning_rate": 1.6208007363092498e-05, + "loss": 1.5463, "step": 4533 }, { - "epoch": 1.36, - "grad_norm": 42.41746520996094, - "learning_rate": 1.091310013029969e-05, - "loss": 1.7798, + "epoch": 0.57, + "grad_norm": 11.917311668395996, + "learning_rate": 1.620717064803581e-05, + "loss": 2.1978, "step": 4534 }, { - "epoch": 1.36, - "grad_norm": 17.059722900390625, - "learning_rate": 1.09110955196953e-05, - "loss": 2.3114, + "epoch": 0.57, + "grad_norm": 11.899550437927246, + "learning_rate": 1.6206333932979125e-05, + "loss": 1.598, "step": 4535 }, { - "epoch": 1.36, - "grad_norm": 38.7147331237793, - "learning_rate": 1.0909090909090909e-05, - "loss": 2.4257, + "epoch": 0.57, + "grad_norm": 15.763212203979492, + "learning_rate": 1.6205497217922435e-05, + "loss": 1.3893, "step": 4536 }, { - "epoch": 1.36, - "grad_norm": 17.778711318969727, - "learning_rate": 1.090708629848652e-05, - "loss": 1.598, + "epoch": 0.57, + "grad_norm": 23.417814254760742, + "learning_rate": 1.620466050286575e-05, + "loss": 1.5707, "step": 4537 }, { - "epoch": 1.36, - "grad_norm": 55.522010803222656, - "learning_rate": 1.090508168788213e-05, - "loss": 1.7511, + "epoch": 0.57, + "grad_norm": 18.423015594482422, + "learning_rate": 1.6203823787809063e-05, + "loss": 2.6211, "step": 4538 }, { - "epoch": 1.36, - "grad_norm": 31.59695053100586, - "learning_rate": 1.090307707727774e-05, - "loss": 1.6838, + "epoch": 0.57, + "grad_norm": 10.28055477142334, + "learning_rate": 1.6202987072752376e-05, + "loss": 1.2414, "step": 4539 }, { - "epoch": 1.37, - "grad_norm": 14.828858375549316, - "learning_rate": 1.0901072466673351e-05, - "loss": 1.3515, + "epoch": 0.57, + "grad_norm": 7.889482498168945, + "learning_rate": 1.6202150357695687e-05, + "loss": 2.6767, "step": 4540 }, { - "epoch": 1.37, - "grad_norm": 18.156906127929688, - "learning_rate": 1.089906785606896e-05, - "loss": 2.1229, + "epoch": 0.57, + "grad_norm": 65.32218933105469, + "learning_rate": 1.6201313642639e-05, + "loss": 2.8384, "step": 4541 }, { - "epoch": 1.37, - "grad_norm": 14.6245698928833, - "learning_rate": 1.089706324546457e-05, - "loss": 1.0952, + "epoch": 0.57, + "grad_norm": 13.456228256225586, + "learning_rate": 1.6200476927582314e-05, + "loss": 1.3431, "step": 4542 }, { - "epoch": 1.37, - "grad_norm": 14.46635913848877, - "learning_rate": 1.0895058634860178e-05, - "loss": 1.6335, + "epoch": 0.57, + "grad_norm": 11.8510160446167, + "learning_rate": 1.6199640212525624e-05, + "loss": 1.0315, "step": 4543 }, { - "epoch": 1.37, - "grad_norm": 11.85185432434082, - "learning_rate": 1.089305402425579e-05, - "loss": 1.6408, + "epoch": 0.57, + "grad_norm": 20.21044921875, + "learning_rate": 1.6198803497468938e-05, + "loss": 2.6707, "step": 4544 }, { - "epoch": 1.37, - "grad_norm": 9.316162109375, - "learning_rate": 1.08910494136514e-05, - "loss": 1.0231, + "epoch": 0.57, + "grad_norm": 14.6653470993042, + "learning_rate": 1.619796678241225e-05, + "loss": 2.2289, "step": 4545 }, { - "epoch": 1.37, - "grad_norm": 16.85337257385254, - "learning_rate": 1.0889044803047008e-05, - "loss": 1.486, + "epoch": 0.57, + "grad_norm": 8.660666465759277, + "learning_rate": 1.6197130067355565e-05, + "loss": 1.8441, "step": 4546 }, { - "epoch": 1.37, - "grad_norm": 13.948755264282227, - "learning_rate": 1.088704019244262e-05, - "loss": 2.3643, + "epoch": 0.57, + "grad_norm": 7.917065143585205, + "learning_rate": 1.6196293352298875e-05, + "loss": 1.0702, "step": 4547 }, { - "epoch": 1.37, - "grad_norm": 22.807424545288086, - "learning_rate": 1.0885035581838228e-05, - "loss": 1.6381, + "epoch": 0.57, + "grad_norm": 33.09801483154297, + "learning_rate": 1.619545663724219e-05, + "loss": 1.7716, "step": 4548 }, { - "epoch": 1.37, - "grad_norm": 17.668928146362305, - "learning_rate": 1.0883030971233838e-05, - "loss": 1.8671, + "epoch": 0.57, + "grad_norm": 9.391871452331543, + "learning_rate": 1.6194619922185503e-05, + "loss": 1.15, "step": 4549 }, { - "epoch": 1.37, - "grad_norm": 61.22587966918945, - "learning_rate": 1.088102636062945e-05, - "loss": 2.4726, + "epoch": 0.57, + "grad_norm": 33.01217269897461, + "learning_rate": 1.6193783207128813e-05, + "loss": 1.6423, "step": 4550 }, { - "epoch": 1.37, - "grad_norm": 15.125883102416992, - "learning_rate": 1.0879021750025058e-05, - "loss": 1.6064, + "epoch": 0.57, + "grad_norm": 12.896798133850098, + "learning_rate": 1.6192946492072127e-05, + "loss": 1.6256, "step": 4551 }, { - "epoch": 1.37, - "grad_norm": 12.281584739685059, - "learning_rate": 1.0877017139420669e-05, - "loss": 1.9036, + "epoch": 0.57, + "grad_norm": 33.08998107910156, + "learning_rate": 1.619210977701544e-05, + "loss": 2.8372, "step": 4552 }, { - "epoch": 1.37, - "grad_norm": 20.28942108154297, - "learning_rate": 1.0875012528816277e-05, - "loss": 2.022, + "epoch": 0.57, + "grad_norm": 6.515081882476807, + "learning_rate": 1.619127306195875e-05, + "loss": 0.5338, "step": 4553 }, { - "epoch": 1.37, - "grad_norm": 12.765894889831543, - "learning_rate": 1.0873007918211889e-05, - "loss": 1.2068, + "epoch": 0.57, + "grad_norm": 6.269851207733154, + "learning_rate": 1.6190436346902064e-05, + "loss": 0.8314, "step": 4554 }, { - "epoch": 1.37, - "grad_norm": 14.457947731018066, - "learning_rate": 1.0871003307607497e-05, - "loss": 1.2275, + "epoch": 0.57, + "grad_norm": 6.227377891540527, + "learning_rate": 1.6189599631845378e-05, + "loss": 2.0353, "step": 4555 }, { - "epoch": 1.37, - "grad_norm": 26.644542694091797, - "learning_rate": 1.0868998697003109e-05, - "loss": 3.3596, + "epoch": 0.57, + "grad_norm": 17.164512634277344, + "learning_rate": 1.6188762916788688e-05, + "loss": 0.7052, "step": 4556 }, { - "epoch": 1.37, - "grad_norm": 17.826154708862305, - "learning_rate": 1.0866994086398719e-05, - "loss": 2.4254, + "epoch": 0.57, + "grad_norm": 24.479812622070312, + "learning_rate": 1.6187926201732002e-05, + "loss": 1.5668, "step": 4557 }, { - "epoch": 1.37, - "grad_norm": 26.188709259033203, - "learning_rate": 1.0864989475794327e-05, - "loss": 1.8766, + "epoch": 0.57, + "grad_norm": 6.707143783569336, + "learning_rate": 1.6187089486675312e-05, + "loss": 0.2343, "step": 4558 }, { - "epoch": 1.37, - "grad_norm": 15.313033103942871, - "learning_rate": 1.0862984865189939e-05, - "loss": 1.5112, + "epoch": 0.57, + "grad_norm": 12.978679656982422, + "learning_rate": 1.6186252771618626e-05, + "loss": 0.7059, "step": 4559 }, { - "epoch": 1.37, - "grad_norm": 15.900823593139648, - "learning_rate": 1.0860980254585548e-05, - "loss": 2.0939, + "epoch": 0.57, + "grad_norm": 14.660187721252441, + "learning_rate": 1.618541605656194e-05, + "loss": 2.3506, "step": 4560 }, { - "epoch": 1.37, - "eval_loss": 0.21247684955596924, - "eval_runtime": 43.2712, - "eval_samples_per_second": 34.18, - "eval_steps_per_second": 34.18, - "step": 4560 + "epoch": 0.57, + "grad_norm": 8.758925437927246, + "learning_rate": 1.618457934150525e-05, + "loss": 1.3554, + "step": 4561 }, { - "epoch": 1.37, - "grad_norm": 9.430264472961426, - "learning_rate": 1.0858975643981158e-05, - "loss": 1.7693, - "step": 4561 - }, - { - "epoch": 1.37, - "grad_norm": 9.332898139953613, - "learning_rate": 1.0856971033376766e-05, - "loss": 1.048, + "epoch": 0.57, + "grad_norm": 9.19585132598877, + "learning_rate": 1.6183742626448563e-05, + "loss": 2.1781, "step": 4562 }, { - "epoch": 1.37, - "grad_norm": 15.586241722106934, - "learning_rate": 1.0854966422772378e-05, - "loss": 1.3119, + "epoch": 0.57, + "grad_norm": 15.945395469665527, + "learning_rate": 1.6182905911391877e-05, + "loss": 0.7306, "step": 4563 }, { - "epoch": 1.37, - "grad_norm": 23.395587921142578, - "learning_rate": 1.0852961812167988e-05, - "loss": 2.1297, + "epoch": 0.57, + "grad_norm": 10.57514762878418, + "learning_rate": 1.6182069196335187e-05, + "loss": 1.2656, "step": 4564 }, { - "epoch": 1.37, - "grad_norm": 11.959819793701172, - "learning_rate": 1.0850957201563596e-05, - "loss": 1.2409, + "epoch": 0.57, + "grad_norm": 18.79050064086914, + "learning_rate": 1.61812324812785e-05, + "loss": 1.7017, "step": 4565 }, { - "epoch": 1.37, - "grad_norm": 9.007245063781738, - "learning_rate": 1.0848952590959208e-05, - "loss": 1.714, + "epoch": 0.57, + "grad_norm": 15.656498908996582, + "learning_rate": 1.6180395766221814e-05, + "loss": 2.3983, "step": 4566 }, { - "epoch": 1.37, - "grad_norm": 56.485538482666016, - "learning_rate": 1.0846947980354816e-05, - "loss": 2.3396, + "epoch": 0.57, + "grad_norm": 12.713788986206055, + "learning_rate": 1.6179559051165128e-05, + "loss": 3.1377, "step": 4567 }, { - "epoch": 1.37, - "grad_norm": 12.010300636291504, - "learning_rate": 1.0844943369750427e-05, - "loss": 2.0262, + "epoch": 0.57, + "grad_norm": 11.3866548538208, + "learning_rate": 1.617872233610844e-05, + "loss": 2.89, "step": 4568 }, { - "epoch": 1.37, - "grad_norm": 37.940059661865234, - "learning_rate": 1.0842938759146038e-05, - "loss": 1.7123, + "epoch": 0.57, + "grad_norm": 15.300727844238281, + "learning_rate": 1.6177885621051752e-05, + "loss": 2.2796, "step": 4569 }, { - "epoch": 1.37, - "grad_norm": 13.00949764251709, - "learning_rate": 1.0840934148541647e-05, - "loss": 1.3661, + "epoch": 0.57, + "grad_norm": 13.088519096374512, + "learning_rate": 1.6177048905995066e-05, + "loss": 1.4001, "step": 4570 }, { - "epoch": 1.37, - "grad_norm": 15.618173599243164, - "learning_rate": 1.0838929537937257e-05, - "loss": 1.9199, + "epoch": 0.57, + "grad_norm": 12.312095642089844, + "learning_rate": 1.6176212190938376e-05, + "loss": 1.031, "step": 4571 }, { - "epoch": 1.37, - "grad_norm": 22.606422424316406, - "learning_rate": 1.0836924927332865e-05, - "loss": 1.8386, + "epoch": 0.57, + "grad_norm": 25.807453155517578, + "learning_rate": 1.617537547588169e-05, + "loss": 1.5651, "step": 4572 }, { - "epoch": 1.37, - "grad_norm": 13.844141960144043, - "learning_rate": 1.0834920316728477e-05, - "loss": 1.3548, + "epoch": 0.57, + "grad_norm": 7.2591400146484375, + "learning_rate": 1.6174538760825003e-05, + "loss": 0.7109, "step": 4573 }, { - "epoch": 1.38, - "grad_norm": 11.279787063598633, - "learning_rate": 1.0832915706124085e-05, - "loss": 1.5721, + "epoch": 0.57, + "grad_norm": 13.769720077514648, + "learning_rate": 1.6173702045768317e-05, + "loss": 1.0747, "step": 4574 }, { - "epoch": 1.38, - "grad_norm": 13.838190078735352, - "learning_rate": 1.0830911095519695e-05, - "loss": 1.6001, + "epoch": 0.57, + "grad_norm": 11.24695110321045, + "learning_rate": 1.6172865330711627e-05, + "loss": 1.2303, "step": 4575 }, { - "epoch": 1.38, - "grad_norm": 18.21175765991211, - "learning_rate": 1.0828906484915307e-05, - "loss": 1.9498, + "epoch": 0.57, + "grad_norm": 31.83026123046875, + "learning_rate": 1.617202861565494e-05, + "loss": 2.0739, "step": 4576 }, { - "epoch": 1.38, - "grad_norm": 11.330121994018555, - "learning_rate": 1.0826901874310916e-05, - "loss": 1.3935, + "epoch": 0.57, + "grad_norm": 15.03792667388916, + "learning_rate": 1.6171191900598254e-05, + "loss": 0.9974, "step": 4577 }, { - "epoch": 1.38, - "grad_norm": 18.719507217407227, - "learning_rate": 1.0824897263706526e-05, - "loss": 1.4156, + "epoch": 0.57, + "grad_norm": 15.796013832092285, + "learning_rate": 1.6170355185541565e-05, + "loss": 2.6044, "step": 4578 }, { - "epoch": 1.38, - "grad_norm": 49.91222381591797, - "learning_rate": 1.0822892653102136e-05, - "loss": 2.8363, + "epoch": 0.57, + "grad_norm": 14.602738380432129, + "learning_rate": 1.616951847048488e-05, + "loss": 2.3995, "step": 4579 }, { - "epoch": 1.38, - "grad_norm": 32.154563903808594, - "learning_rate": 1.0820888042497746e-05, - "loss": 2.5501, + "epoch": 0.57, + "grad_norm": 77.08251190185547, + "learning_rate": 1.6168681755428192e-05, + "loss": 2.5041, "step": 4580 }, { - "epoch": 1.38, - "grad_norm": 8.45114803314209, - "learning_rate": 1.0818883431893356e-05, - "loss": 1.6866, + "epoch": 0.57, + "grad_norm": 8.562460899353027, + "learning_rate": 1.6167845040371502e-05, + "loss": 0.6393, "step": 4581 }, { - "epoch": 1.38, - "grad_norm": 44.169677734375, - "learning_rate": 1.0816878821288966e-05, - "loss": 2.4302, + "epoch": 0.58, + "grad_norm": 7.505651473999023, + "learning_rate": 1.6167008325314816e-05, + "loss": 0.7842, "step": 4582 }, { - "epoch": 1.38, - "grad_norm": 25.283023834228516, - "learning_rate": 1.0814874210684576e-05, - "loss": 2.2415, + "epoch": 0.58, + "grad_norm": 6.744200229644775, + "learning_rate": 1.616617161025813e-05, + "loss": 1.9606, "step": 4583 }, { - "epoch": 1.38, - "grad_norm": 16.97034454345703, - "learning_rate": 1.0812869600080184e-05, - "loss": 1.8726, + "epoch": 0.58, + "grad_norm": 17.124282836914062, + "learning_rate": 1.616533489520144e-05, + "loss": 2.2951, "step": 4584 }, { - "epoch": 1.38, - "grad_norm": 9.19655990600586, - "learning_rate": 1.0810864989475796e-05, - "loss": 1.1808, + "epoch": 0.58, + "grad_norm": 12.59799861907959, + "learning_rate": 1.6164498180144753e-05, + "loss": 2.3428, "step": 4585 }, { - "epoch": 1.38, - "grad_norm": 18.277463912963867, - "learning_rate": 1.0808860378871405e-05, - "loss": 1.8823, + "epoch": 0.58, + "grad_norm": 8.041672706604004, + "learning_rate": 1.6163661465088064e-05, + "loss": 0.4739, "step": 4586 }, { - "epoch": 1.38, - "grad_norm": 21.819673538208008, - "learning_rate": 1.0806855768267015e-05, - "loss": 2.0347, + "epoch": 0.58, + "grad_norm": 6.622467517852783, + "learning_rate": 1.6162824750031377e-05, + "loss": 0.7039, "step": 4587 }, { - "epoch": 1.38, - "grad_norm": 9.92550277709961, - "learning_rate": 1.0804851157662627e-05, - "loss": 1.032, + "epoch": 0.58, + "grad_norm": 12.706607818603516, + "learning_rate": 1.616198803497469e-05, + "loss": 1.9807, "step": 4588 }, { - "epoch": 1.38, - "grad_norm": 22.52004623413086, - "learning_rate": 1.0802846547058235e-05, - "loss": 2.1637, + "epoch": 0.58, + "grad_norm": 3.131267547607422, + "learning_rate": 1.6161151319918e-05, + "loss": 0.1042, "step": 4589 }, { - "epoch": 1.38, - "grad_norm": 18.34803581237793, - "learning_rate": 1.0800841936453845e-05, - "loss": 1.9793, + "epoch": 0.58, + "grad_norm": 10.65312385559082, + "learning_rate": 1.6160314604861315e-05, + "loss": 1.5446, "step": 4590 }, { - "epoch": 1.38, - "grad_norm": 24.83062171936035, - "learning_rate": 1.0798837325849453e-05, - "loss": 2.2503, + "epoch": 0.58, + "grad_norm": 3.5985212326049805, + "learning_rate": 1.615947788980463e-05, + "loss": 0.1319, "step": 4591 }, { - "epoch": 1.38, - "grad_norm": 31.985370635986328, - "learning_rate": 1.0796832715245065e-05, - "loss": 1.9019, + "epoch": 0.58, + "grad_norm": 16.998870849609375, + "learning_rate": 1.615864117474794e-05, + "loss": 2.8475, "step": 4592 }, { - "epoch": 1.38, - "grad_norm": 11.606969833374023, - "learning_rate": 1.0794828104640674e-05, - "loss": 1.5377, + "epoch": 0.58, + "grad_norm": 14.827058792114258, + "learning_rate": 1.6157804459691252e-05, + "loss": 1.9056, "step": 4593 }, { - "epoch": 1.38, - "grad_norm": 14.753523826599121, - "learning_rate": 1.0792823494036284e-05, - "loss": 1.6545, + "epoch": 0.58, + "grad_norm": 12.889955520629883, + "learning_rate": 1.6156967744634566e-05, + "loss": 3.2301, "step": 4594 }, { - "epoch": 1.38, - "grad_norm": 19.58027458190918, - "learning_rate": 1.0790818883431895e-05, - "loss": 1.7343, + "epoch": 0.58, + "grad_norm": 46.53408432006836, + "learning_rate": 1.615613102957788e-05, + "loss": 1.6417, "step": 4595 }, { - "epoch": 1.38, - "grad_norm": 17.902088165283203, - "learning_rate": 1.0788814272827504e-05, - "loss": 1.763, + "epoch": 0.58, + "grad_norm": 10.040948867797852, + "learning_rate": 1.615529431452119e-05, + "loss": 1.7649, "step": 4596 }, { - "epoch": 1.38, - "grad_norm": 10.415716171264648, - "learning_rate": 1.0786809662223114e-05, - "loss": 1.6063, + "epoch": 0.58, + "grad_norm": 6.867565631866455, + "learning_rate": 1.6154457599464504e-05, + "loss": 1.5661, "step": 4597 }, { - "epoch": 1.38, - "grad_norm": 13.281739234924316, - "learning_rate": 1.0784805051618722e-05, - "loss": 1.8627, + "epoch": 0.58, + "grad_norm": 13.134184837341309, + "learning_rate": 1.6153620884407817e-05, + "loss": 1.8915, "step": 4598 }, { - "epoch": 1.38, - "grad_norm": 10.292632102966309, - "learning_rate": 1.0782800441014334e-05, - "loss": 1.7306, + "epoch": 0.58, + "grad_norm": 15.050349235534668, + "learning_rate": 1.6152784169351128e-05, + "loss": 1.4449, "step": 4599 }, { - "epoch": 1.38, - "grad_norm": 12.653688430786133, - "learning_rate": 1.0780795830409944e-05, - "loss": 1.3753, + "epoch": 0.58, + "grad_norm": 17.474966049194336, + "learning_rate": 1.615194745429444e-05, + "loss": 1.6006, "step": 4600 }, { - "epoch": 1.38, - "grad_norm": 26.442541122436523, - "learning_rate": 1.0778791219805553e-05, - "loss": 2.2257, + "epoch": 0.58, + "grad_norm": 9.59934139251709, + "learning_rate": 1.6151110739237755e-05, + "loss": 0.7479, "step": 4601 }, { - "epoch": 1.38, - "grad_norm": 18.668859481811523, - "learning_rate": 1.0776786609201164e-05, - "loss": 1.8788, + "epoch": 0.58, + "grad_norm": 14.621644973754883, + "learning_rate": 1.615027402418107e-05, + "loss": 2.0671, "step": 4602 }, { - "epoch": 1.38, - "grad_norm": 19.046268463134766, - "learning_rate": 1.0774781998596773e-05, - "loss": 2.1165, + "epoch": 0.58, + "grad_norm": 14.647985458374023, + "learning_rate": 1.614943730912438e-05, + "loss": 0.7475, "step": 4603 }, { - "epoch": 1.38, - "grad_norm": 11.989215850830078, - "learning_rate": 1.0772777387992383e-05, - "loss": 1.2069, + "epoch": 0.58, + "grad_norm": 17.761341094970703, + "learning_rate": 1.6148600594067692e-05, + "loss": 1.6466, "step": 4604 }, { - "epoch": 1.38, - "grad_norm": 16.762723922729492, - "learning_rate": 1.0770772777387993e-05, - "loss": 1.3351, + "epoch": 0.58, + "grad_norm": 14.85982894897461, + "learning_rate": 1.6147763879011006e-05, + "loss": 1.6539, "step": 4605 }, { - "epoch": 1.38, - "grad_norm": 21.549854278564453, - "learning_rate": 1.0768768166783603e-05, - "loss": 1.8815, + "epoch": 0.58, + "grad_norm": 16.099637985229492, + "learning_rate": 1.6146927163954316e-05, + "loss": 1.3589, "step": 4606 }, { - "epoch": 1.39, - "grad_norm": 46.833518981933594, - "learning_rate": 1.0766763556179213e-05, - "loss": 2.6215, + "epoch": 0.58, + "grad_norm": 12.23636531829834, + "learning_rate": 1.614609044889763e-05, + "loss": 1.4119, "step": 4607 }, { - "epoch": 1.39, - "grad_norm": 13.283019065856934, - "learning_rate": 1.0764758945574823e-05, - "loss": 1.9159, + "epoch": 0.58, + "grad_norm": 12.770011901855469, + "learning_rate": 1.6145253733840944e-05, + "loss": 0.4493, "step": 4608 }, { - "epoch": 1.39, - "grad_norm": 41.622859954833984, - "learning_rate": 1.0762754334970433e-05, - "loss": 1.4153, + "epoch": 0.58, + "grad_norm": 132.66159057617188, + "learning_rate": 1.6144417018784254e-05, + "loss": 3.0702, "step": 4609 }, { - "epoch": 1.39, - "grad_norm": 9.129386901855469, - "learning_rate": 1.0760749724366042e-05, - "loss": 1.0581, + "epoch": 0.58, + "grad_norm": 13.396702766418457, + "learning_rate": 1.6143580303727568e-05, + "loss": 1.1574, "step": 4610 }, { - "epoch": 1.39, - "grad_norm": 63.40047836303711, - "learning_rate": 1.0758745113761653e-05, - "loss": 1.7917, + "epoch": 0.58, + "grad_norm": 26.069164276123047, + "learning_rate": 1.6142743588670878e-05, + "loss": 2.2689, "step": 4611 }, { - "epoch": 1.39, - "grad_norm": 119.53295135498047, - "learning_rate": 1.0756740503157263e-05, - "loss": 1.7074, + "epoch": 0.58, + "grad_norm": 11.59582233428955, + "learning_rate": 1.614190687361419e-05, + "loss": 2.3476, "step": 4612 }, { - "epoch": 1.39, - "grad_norm": 15.246512413024902, - "learning_rate": 1.0754735892552872e-05, - "loss": 1.3584, + "epoch": 0.58, + "grad_norm": 20.005428314208984, + "learning_rate": 1.6141070158557505e-05, + "loss": 2.1947, "step": 4613 }, { - "epoch": 1.39, - "grad_norm": 12.469449043273926, - "learning_rate": 1.0752731281948484e-05, - "loss": 1.8839, + "epoch": 0.58, + "grad_norm": 14.68831729888916, + "learning_rate": 1.6140233443500815e-05, + "loss": 1.5459, "step": 4614 }, { - "epoch": 1.39, - "grad_norm": 38.270023345947266, - "learning_rate": 1.0750726671344092e-05, - "loss": 1.4275, + "epoch": 0.58, + "grad_norm": 5.946023464202881, + "learning_rate": 1.613939672844413e-05, + "loss": 0.2806, "step": 4615 }, { - "epoch": 1.39, - "grad_norm": 16.700550079345703, - "learning_rate": 1.0748722060739702e-05, - "loss": 1.771, + "epoch": 0.58, + "grad_norm": 11.920265197753906, + "learning_rate": 1.6138560013387443e-05, + "loss": 0.4182, "step": 4616 }, { - "epoch": 1.39, - "grad_norm": 23.500558853149414, - "learning_rate": 1.074671745013531e-05, - "loss": 2.2971, + "epoch": 0.58, + "grad_norm": 9.490656852722168, + "learning_rate": 1.6137723298330753e-05, + "loss": 0.8312, "step": 4617 }, { - "epoch": 1.39, - "grad_norm": 17.58616065979004, - "learning_rate": 1.0744712839530922e-05, - "loss": 1.05, + "epoch": 0.58, + "grad_norm": 17.456974029541016, + "learning_rate": 1.6136886583274067e-05, + "loss": 1.4119, "step": 4618 }, { - "epoch": 1.39, - "grad_norm": 12.4642972946167, - "learning_rate": 1.0742708228926532e-05, - "loss": 1.1186, + "epoch": 0.58, + "grad_norm": 29.047473907470703, + "learning_rate": 1.613604986821738e-05, + "loss": 1.9479, "step": 4619 }, { - "epoch": 1.39, - "grad_norm": 12.4574556350708, - "learning_rate": 1.074070361832214e-05, - "loss": 1.4973, + "epoch": 0.58, + "grad_norm": 2.6419622898101807, + "learning_rate": 1.613521315316069e-05, + "loss": 0.0435, "step": 4620 }, { - "epoch": 1.39, - "grad_norm": 10.377580642700195, - "learning_rate": 1.0738699007717753e-05, - "loss": 1.8076, + "epoch": 0.58, + "grad_norm": 14.935125350952148, + "learning_rate": 1.6134376438104004e-05, + "loss": 2.1664, "step": 4621 }, { - "epoch": 1.39, - "grad_norm": 19.34781265258789, - "learning_rate": 1.0736694397113361e-05, - "loss": 2.2722, + "epoch": 0.58, + "grad_norm": 8.660603523254395, + "learning_rate": 1.6133539723047318e-05, + "loss": 2.3279, "step": 4622 }, { - "epoch": 1.39, - "grad_norm": 16.955293655395508, - "learning_rate": 1.0734689786508971e-05, - "loss": 1.7219, + "epoch": 0.58, + "grad_norm": 23.47085189819336, + "learning_rate": 1.613270300799063e-05, + "loss": 2.4712, "step": 4623 }, { - "epoch": 1.39, - "grad_norm": 18.286012649536133, - "learning_rate": 1.0732685175904583e-05, - "loss": 2.9251, + "epoch": 0.58, + "grad_norm": 15.253812789916992, + "learning_rate": 1.613186629293394e-05, + "loss": 1.809, "step": 4624 }, { - "epoch": 1.39, - "grad_norm": 19.848609924316406, - "learning_rate": 1.0730680565300191e-05, - "loss": 2.2197, + "epoch": 0.58, + "grad_norm": 19.041500091552734, + "learning_rate": 1.6131029577877255e-05, + "loss": 1.2805, "step": 4625 }, { - "epoch": 1.39, - "grad_norm": 10.013169288635254, - "learning_rate": 1.0728675954695801e-05, - "loss": 0.7125, + "epoch": 0.58, + "grad_norm": 17.887468338012695, + "learning_rate": 1.613019286282057e-05, + "loss": 2.9352, "step": 4626 }, { - "epoch": 1.39, - "grad_norm": 20.401023864746094, - "learning_rate": 1.0726671344091411e-05, - "loss": 1.5097, + "epoch": 0.58, + "grad_norm": 26.64823341369629, + "learning_rate": 1.612935614776388e-05, + "loss": 1.194, "step": 4627 }, { - "epoch": 1.39, - "grad_norm": 20.424652099609375, - "learning_rate": 1.0724666733487021e-05, - "loss": 1.6132, + "epoch": 0.58, + "grad_norm": 15.227701187133789, + "learning_rate": 1.6128519432707193e-05, + "loss": 1.1081, "step": 4628 }, { - "epoch": 1.39, - "grad_norm": 17.268680572509766, - "learning_rate": 1.072266212288263e-05, - "loss": 1.589, + "epoch": 0.58, + "grad_norm": 9.078343391418457, + "learning_rate": 1.6127682717650507e-05, + "loss": 1.4022, "step": 4629 }, { - "epoch": 1.39, - "grad_norm": 10.245696067810059, - "learning_rate": 1.0720657512278242e-05, - "loss": 2.4915, + "epoch": 0.58, + "grad_norm": 35.389652252197266, + "learning_rate": 1.612684600259382e-05, + "loss": 1.8525, "step": 4630 }, { - "epoch": 1.39, - "grad_norm": 21.644811630249023, - "learning_rate": 1.0718652901673852e-05, - "loss": 2.4812, + "epoch": 0.58, + "grad_norm": 80.05748748779297, + "learning_rate": 1.612600928753713e-05, + "loss": 3.0423, "step": 4631 }, { - "epoch": 1.39, - "grad_norm": 19.70071792602539, - "learning_rate": 1.071664829106946e-05, - "loss": 1.8817, + "epoch": 0.58, + "grad_norm": 8.841204643249512, + "learning_rate": 1.6125172572480444e-05, + "loss": 0.655, "step": 4632 }, { - "epoch": 1.39, - "grad_norm": 22.212900161743164, - "learning_rate": 1.0714643680465072e-05, - "loss": 1.9495, + "epoch": 0.58, + "grad_norm": 14.465465545654297, + "learning_rate": 1.6124335857423758e-05, + "loss": 2.3567, "step": 4633 }, { - "epoch": 1.39, - "grad_norm": 11.387062072753906, - "learning_rate": 1.071263906986068e-05, - "loss": 1.3702, + "epoch": 0.58, + "grad_norm": 30.510494232177734, + "learning_rate": 1.6123499142367068e-05, + "loss": 1.9619, "step": 4634 }, { - "epoch": 1.39, - "grad_norm": 10.107064247131348, - "learning_rate": 1.071063445925629e-05, - "loss": 1.0777, + "epoch": 0.58, + "grad_norm": 14.832023620605469, + "learning_rate": 1.612266242731038e-05, + "loss": 1.2945, "step": 4635 }, { - "epoch": 1.39, - "grad_norm": 34.227264404296875, - "learning_rate": 1.0708629848651899e-05, - "loss": 1.9404, + "epoch": 0.58, + "grad_norm": 14.375158309936523, + "learning_rate": 1.6121825712253695e-05, + "loss": 1.9831, "step": 4636 }, { - "epoch": 1.39, - "grad_norm": 12.135993003845215, - "learning_rate": 1.070662523804751e-05, - "loss": 1.2095, + "epoch": 0.58, + "grad_norm": 12.27586555480957, + "learning_rate": 1.6120988997197006e-05, + "loss": 2.1559, "step": 4637 }, { - "epoch": 1.39, - "grad_norm": 15.475272178649902, - "learning_rate": 1.070462062744312e-05, - "loss": 1.4376, + "epoch": 0.58, + "grad_norm": 22.290096282958984, + "learning_rate": 1.612015228214032e-05, + "loss": 2.4757, "step": 4638 }, { - "epoch": 1.39, - "grad_norm": 29.245624542236328, - "learning_rate": 1.0702616016838729e-05, - "loss": 2.0634, + "epoch": 0.58, + "grad_norm": 51.71296310424805, + "learning_rate": 1.611931556708363e-05, + "loss": 2.2293, "step": 4639 }, { - "epoch": 1.4, - "grad_norm": 33.319969177246094, - "learning_rate": 1.070061140623434e-05, - "loss": 1.772, + "epoch": 0.58, + "grad_norm": 22.068788528442383, + "learning_rate": 1.6118478852026943e-05, + "loss": 0.9596, "step": 4640 }, { - "epoch": 1.4, - "grad_norm": 15.358686447143555, - "learning_rate": 1.0698606795629949e-05, - "loss": 1.5593, + "epoch": 0.58, + "grad_norm": 11.041895866394043, + "learning_rate": 1.6117642136970257e-05, + "loss": 2.1294, "step": 4641 }, { - "epoch": 1.4, - "grad_norm": 13.498563766479492, - "learning_rate": 1.069660218502556e-05, - "loss": 1.8836, + "epoch": 0.58, + "grad_norm": 9.11545467376709, + "learning_rate": 1.6116805421913567e-05, + "loss": 1.3963, "step": 4642 }, { - "epoch": 1.4, - "grad_norm": 11.653413772583008, - "learning_rate": 1.0694597574421171e-05, - "loss": 1.3999, + "epoch": 0.58, + "grad_norm": 15.038471221923828, + "learning_rate": 1.611596870685688e-05, + "loss": 1.2528, "step": 4643 }, { - "epoch": 1.4, - "grad_norm": 7.912506103515625, - "learning_rate": 1.069259296381678e-05, - "loss": 1.0855, + "epoch": 0.58, + "grad_norm": 12.920796394348145, + "learning_rate": 1.6115131991800194e-05, + "loss": 1.743, "step": 4644 }, { - "epoch": 1.4, - "grad_norm": 18.94805908203125, - "learning_rate": 1.069058835321239e-05, - "loss": 2.4709, + "epoch": 0.58, + "grad_norm": 10.468306541442871, + "learning_rate": 1.6114295276743505e-05, + "loss": 1.2616, "step": 4645 }, { - "epoch": 1.4, - "grad_norm": 14.374813079833984, - "learning_rate": 1.0688583742607998e-05, - "loss": 1.4454, + "epoch": 0.58, + "grad_norm": 15.705430030822754, + "learning_rate": 1.6113458561686818e-05, + "loss": 1.9102, "step": 4646 }, { - "epoch": 1.4, - "grad_norm": 77.0107650756836, - "learning_rate": 1.068657913200361e-05, - "loss": 2.156, + "epoch": 0.58, + "grad_norm": 15.315054893493652, + "learning_rate": 1.6112621846630132e-05, + "loss": 4.8984, "step": 4647 }, { - "epoch": 1.4, - "grad_norm": 14.53071403503418, - "learning_rate": 1.0684574521399218e-05, - "loss": 1.3087, + "epoch": 0.58, + "grad_norm": 27.60223388671875, + "learning_rate": 1.6111785131573442e-05, + "loss": 2.6091, "step": 4648 }, { - "epoch": 1.4, - "grad_norm": 18.24622917175293, - "learning_rate": 1.0682569910794828e-05, - "loss": 2.3464, + "epoch": 0.58, + "grad_norm": 7.672779083251953, + "learning_rate": 1.6110948416516756e-05, + "loss": 1.8401, "step": 4649 }, { - "epoch": 1.4, - "grad_norm": 7.772927761077881, - "learning_rate": 1.068056530019044e-05, - "loss": 1.3107, + "epoch": 0.58, + "grad_norm": 21.329416275024414, + "learning_rate": 1.611011170146007e-05, + "loss": 1.8097, "step": 4650 }, { - "epoch": 1.4, - "grad_norm": 4.152658462524414, - "learning_rate": 1.0678560689586048e-05, - "loss": 0.467, + "epoch": 0.58, + "grad_norm": 14.985925674438477, + "learning_rate": 1.6109274986403383e-05, + "loss": 2.3634, "step": 4651 }, { - "epoch": 1.4, - "grad_norm": 14.69251537322998, - "learning_rate": 1.0676556078981658e-05, - "loss": 1.5746, + "epoch": 0.58, + "grad_norm": 40.59479522705078, + "learning_rate": 1.6108438271346693e-05, + "loss": 2.189, "step": 4652 }, { - "epoch": 1.4, - "grad_norm": 15.464540481567383, - "learning_rate": 1.0674551468377268e-05, - "loss": 1.5305, + "epoch": 0.58, + "grad_norm": 12.507532119750977, + "learning_rate": 1.6107601556290007e-05, + "loss": 2.6886, "step": 4653 }, { - "epoch": 1.4, - "grad_norm": 11.303486824035645, - "learning_rate": 1.0672546857772879e-05, - "loss": 1.8223, + "epoch": 0.58, + "grad_norm": 36.30336380004883, + "learning_rate": 1.610676484123332e-05, + "loss": 0.8976, "step": 4654 }, { - "epoch": 1.4, - "grad_norm": 23.572040557861328, - "learning_rate": 1.0670542247168489e-05, - "loss": 1.9877, + "epoch": 0.58, + "grad_norm": 25.99757194519043, + "learning_rate": 1.610592812617663e-05, + "loss": 2.6275, "step": 4655 }, { - "epoch": 1.4, - "grad_norm": 22.017297744750977, - "learning_rate": 1.0668537636564099e-05, - "loss": 1.2985, + "epoch": 0.58, + "grad_norm": 10.68349552154541, + "learning_rate": 1.6105091411119945e-05, + "loss": 3.1259, "step": 4656 }, { - "epoch": 1.4, - "grad_norm": 13.40971851348877, - "learning_rate": 1.0666533025959709e-05, - "loss": 1.3887, + "epoch": 0.58, + "grad_norm": 20.675979614257812, + "learning_rate": 1.6104254696063258e-05, + "loss": 1.5156, "step": 4657 }, { - "epoch": 1.4, - "grad_norm": 17.30711555480957, - "learning_rate": 1.0664528415355317e-05, - "loss": 1.6859, + "epoch": 0.58, + "grad_norm": 13.17327880859375, + "learning_rate": 1.6103417981006572e-05, + "loss": 0.7102, "step": 4658 }, { - "epoch": 1.4, - "grad_norm": 8.817755699157715, - "learning_rate": 1.0662523804750929e-05, - "loss": 1.308, + "epoch": 0.58, + "grad_norm": 22.457372665405273, + "learning_rate": 1.6102581265949882e-05, + "loss": 1.4038, "step": 4659 }, { - "epoch": 1.4, - "grad_norm": 29.949804306030273, - "learning_rate": 1.0660519194146537e-05, - "loss": 1.2951, + "epoch": 0.58, + "grad_norm": 12.7407865524292, + "learning_rate": 1.6101744550893196e-05, + "loss": 2.5345, "step": 4660 }, { - "epoch": 1.4, - "grad_norm": 83.96745300292969, - "learning_rate": 1.0658514583542147e-05, - "loss": 2.6652, + "epoch": 0.58, + "grad_norm": 35.302730560302734, + "learning_rate": 1.610090783583651e-05, + "loss": 2.2454, "step": 4661 }, { - "epoch": 1.4, - "grad_norm": 9.81541919708252, - "learning_rate": 1.065650997293776e-05, - "loss": 0.9763, + "epoch": 0.59, + "grad_norm": 36.84288024902344, + "learning_rate": 1.610007112077982e-05, + "loss": 3.0446, "step": 4662 }, { - "epoch": 1.4, - "grad_norm": 24.451271057128906, - "learning_rate": 1.0654505362333368e-05, - "loss": 2.2991, + "epoch": 0.59, + "grad_norm": 11.649720191955566, + "learning_rate": 1.6099234405723133e-05, + "loss": 0.5907, "step": 4663 }, { - "epoch": 1.4, - "grad_norm": 21.38994789123535, - "learning_rate": 1.0652500751728978e-05, - "loss": 1.4465, + "epoch": 0.59, + "grad_norm": 13.00683307647705, + "learning_rate": 1.6098397690666444e-05, + "loss": 3.9629, "step": 4664 }, { - "epoch": 1.4, - "grad_norm": 9.186866760253906, - "learning_rate": 1.0650496141124586e-05, - "loss": 0.5958, + "epoch": 0.59, + "grad_norm": 22.276634216308594, + "learning_rate": 1.6097560975609757e-05, + "loss": 2.0421, "step": 4665 }, { - "epoch": 1.4, - "grad_norm": 18.036312103271484, - "learning_rate": 1.0648491530520198e-05, - "loss": 1.7996, + "epoch": 0.59, + "grad_norm": 3.471829891204834, + "learning_rate": 1.609672426055307e-05, + "loss": 0.071, "step": 4666 }, { - "epoch": 1.4, - "grad_norm": 13.11405086517334, - "learning_rate": 1.0646486919915808e-05, - "loss": 0.8623, + "epoch": 0.59, + "grad_norm": 16.312870025634766, + "learning_rate": 1.609588754549638e-05, + "loss": 1.2167, "step": 4667 }, { - "epoch": 1.4, - "grad_norm": 15.945577621459961, - "learning_rate": 1.0644482309311416e-05, - "loss": 1.5742, + "epoch": 0.59, + "grad_norm": 28.203134536743164, + "learning_rate": 1.6095050830439695e-05, + "loss": 1.3825, "step": 4668 }, { - "epoch": 1.4, - "grad_norm": 10.415094375610352, - "learning_rate": 1.0642477698707028e-05, - "loss": 1.0011, + "epoch": 0.59, + "grad_norm": 7.57172155380249, + "learning_rate": 1.6094214115383005e-05, + "loss": 0.6187, "step": 4669 }, { - "epoch": 1.4, - "grad_norm": 13.777579307556152, - "learning_rate": 1.0640473088102636e-05, - "loss": 1.6847, + "epoch": 0.59, + "grad_norm": 12.809304237365723, + "learning_rate": 1.609337740032632e-05, + "loss": 1.9076, "step": 4670 }, { - "epoch": 1.4, - "grad_norm": 29.36578369140625, - "learning_rate": 1.0638468477498247e-05, - "loss": 1.2423, + "epoch": 0.59, + "grad_norm": 14.868522644042969, + "learning_rate": 1.6092540685269632e-05, + "loss": 1.2166, "step": 4671 }, { - "epoch": 1.4, - "grad_norm": 26.05026626586914, - "learning_rate": 1.0636463866893855e-05, - "loss": 1.976, + "epoch": 0.59, + "grad_norm": 41.13257598876953, + "learning_rate": 1.6091703970212946e-05, + "loss": 2.9149, "step": 4672 }, { - "epoch": 1.4, - "grad_norm": 26.071765899658203, - "learning_rate": 1.0634459256289467e-05, - "loss": 2.2839, + "epoch": 0.59, + "grad_norm": 14.411210060119629, + "learning_rate": 1.6090867255156256e-05, + "loss": 1.8933, "step": 4673 }, { - "epoch": 1.41, - "grad_norm": 16.8984317779541, - "learning_rate": 1.0632454645685077e-05, - "loss": 1.3912, + "epoch": 0.59, + "grad_norm": 22.125534057617188, + "learning_rate": 1.609003054009957e-05, + "loss": 2.9775, "step": 4674 }, { - "epoch": 1.41, - "grad_norm": 65.57967376708984, - "learning_rate": 1.0630450035080687e-05, - "loss": 2.2416, + "epoch": 0.59, + "grad_norm": 16.977636337280273, + "learning_rate": 1.6089193825042884e-05, + "loss": 2.0797, "step": 4675 }, { - "epoch": 1.41, - "grad_norm": 12.922877311706543, - "learning_rate": 1.0628445424476297e-05, - "loss": 1.5951, + "epoch": 0.59, + "grad_norm": 15.84194564819336, + "learning_rate": 1.6088357109986194e-05, + "loss": 1.6354, "step": 4676 }, { - "epoch": 1.41, - "grad_norm": 15.637014389038086, - "learning_rate": 1.0626440813871905e-05, - "loss": 1.9517, + "epoch": 0.59, + "grad_norm": 6.054462432861328, + "learning_rate": 1.6087520394929507e-05, + "loss": 0.5514, "step": 4677 }, { - "epoch": 1.41, - "grad_norm": 12.687530517578125, - "learning_rate": 1.0624436203267517e-05, - "loss": 1.3298, + "epoch": 0.59, + "grad_norm": 6.776900291442871, + "learning_rate": 1.608668367987282e-05, + "loss": 0.9739, "step": 4678 }, { - "epoch": 1.41, - "grad_norm": 47.006107330322266, - "learning_rate": 1.0622431592663126e-05, - "loss": 2.6819, + "epoch": 0.59, + "grad_norm": 13.240543365478516, + "learning_rate": 1.6085846964816135e-05, + "loss": 1.1257, "step": 4679 }, { - "epoch": 1.41, - "grad_norm": 12.535234451293945, - "learning_rate": 1.0620426982058736e-05, - "loss": 1.4554, - "step": 4680 - }, - { - "epoch": 1.41, - "eval_loss": 0.2101414054632187, - "eval_runtime": 43.0905, - "eval_samples_per_second": 34.323, - "eval_steps_per_second": 34.323, + "epoch": 0.59, + "grad_norm": 16.568603515625, + "learning_rate": 1.6085010249759445e-05, + "loss": 2.5752, "step": 4680 }, { - "epoch": 1.41, - "grad_norm": 9.197644233703613, - "learning_rate": 1.0618422371454347e-05, - "loss": 1.1242, + "epoch": 0.59, + "grad_norm": 11.308704376220703, + "learning_rate": 1.608417353470276e-05, + "loss": 1.13, "step": 4681 }, { - "epoch": 1.41, - "grad_norm": 11.781989097595215, - "learning_rate": 1.0616417760849956e-05, - "loss": 1.9253, + "epoch": 0.59, + "grad_norm": 21.552059173583984, + "learning_rate": 1.6083336819646072e-05, + "loss": 2.0728, "step": 4682 }, { - "epoch": 1.41, - "grad_norm": 41.59600830078125, - "learning_rate": 1.0614413150245566e-05, - "loss": 2.3303, + "epoch": 0.59, + "grad_norm": 10.485605239868164, + "learning_rate": 1.6082500104589383e-05, + "loss": 2.5726, "step": 4683 }, { - "epoch": 1.41, - "grad_norm": 13.190827369689941, - "learning_rate": 1.0612408539641174e-05, - "loss": 0.8268, + "epoch": 0.59, + "grad_norm": 18.043210983276367, + "learning_rate": 1.6081663389532696e-05, + "loss": 1.7581, "step": 4684 }, { - "epoch": 1.41, - "grad_norm": 57.965003967285156, - "learning_rate": 1.0610403929036786e-05, - "loss": 2.375, + "epoch": 0.59, + "grad_norm": 20.532176971435547, + "learning_rate": 1.608082667447601e-05, + "loss": 2.0524, "step": 4685 }, { - "epoch": 1.41, - "grad_norm": 21.654773712158203, - "learning_rate": 1.0608399318432396e-05, - "loss": 2.0238, + "epoch": 0.59, + "grad_norm": 19.0378360748291, + "learning_rate": 1.6079989959419324e-05, + "loss": 1.6556, "step": 4686 }, { - "epoch": 1.41, - "grad_norm": 21.622922897338867, - "learning_rate": 1.0606394707828005e-05, - "loss": 1.8048, + "epoch": 0.59, + "grad_norm": 14.122875213623047, + "learning_rate": 1.6079153244362634e-05, + "loss": 2.5804, "step": 4687 }, { - "epoch": 1.41, - "grad_norm": 21.0383358001709, - "learning_rate": 1.0604390097223616e-05, - "loss": 2.2016, + "epoch": 0.59, + "grad_norm": 13.868156433105469, + "learning_rate": 1.6078316529305947e-05, + "loss": 1.3452, "step": 4688 }, { - "epoch": 1.41, - "grad_norm": 25.455896377563477, - "learning_rate": 1.0602385486619225e-05, - "loss": 1.9829, + "epoch": 0.59, + "grad_norm": 6.920721054077148, + "learning_rate": 1.6077479814249258e-05, + "loss": 0.7653, "step": 4689 }, { - "epoch": 1.41, - "grad_norm": 27.83086395263672, - "learning_rate": 1.0600380876014835e-05, - "loss": 1.637, + "epoch": 0.59, + "grad_norm": 46.12936782836914, + "learning_rate": 1.607664309919257e-05, + "loss": 0.5357, "step": 4690 }, { - "epoch": 1.41, - "grad_norm": 14.252532005310059, - "learning_rate": 1.0598376265410443e-05, - "loss": 0.7723, + "epoch": 0.59, + "grad_norm": 9.634956359863281, + "learning_rate": 1.6075806384135885e-05, + "loss": 1.4596, "step": 4691 }, { - "epoch": 1.41, - "grad_norm": 16.23126220703125, - "learning_rate": 1.0596371654806055e-05, - "loss": 1.5121, + "epoch": 0.59, + "grad_norm": 4.42297887802124, + "learning_rate": 1.6074969669079195e-05, + "loss": 0.235, "step": 4692 }, { - "epoch": 1.41, - "grad_norm": 26.036483764648438, - "learning_rate": 1.0594367044201665e-05, - "loss": 2.4891, + "epoch": 0.59, + "grad_norm": 13.291658401489258, + "learning_rate": 1.607413295402251e-05, + "loss": 2.0819, "step": 4693 }, { - "epoch": 1.41, - "grad_norm": 11.481901168823242, - "learning_rate": 1.0592362433597273e-05, - "loss": 1.8866, + "epoch": 0.59, + "grad_norm": 12.573540687561035, + "learning_rate": 1.6073296238965823e-05, + "loss": 2.758, "step": 4694 }, { - "epoch": 1.41, - "grad_norm": 15.858734130859375, - "learning_rate": 1.0590357822992885e-05, - "loss": 1.6499, + "epoch": 0.59, + "grad_norm": 23.449174880981445, + "learning_rate": 1.6072459523909133e-05, + "loss": 1.1348, "step": 4695 }, { - "epoch": 1.41, - "grad_norm": 42.61335754394531, - "learning_rate": 1.0588353212388494e-05, - "loss": 2.7187, + "epoch": 0.59, + "grad_norm": 4.471280097961426, + "learning_rate": 1.6071622808852446e-05, + "loss": 0.6692, "step": 4696 }, { - "epoch": 1.41, - "grad_norm": 18.403488159179688, - "learning_rate": 1.0586348601784104e-05, - "loss": 1.5052, + "epoch": 0.59, + "grad_norm": 12.927807807922363, + "learning_rate": 1.6070786093795757e-05, + "loss": 1.8158, "step": 4697 }, { - "epoch": 1.41, - "grad_norm": 12.110722541809082, - "learning_rate": 1.0584343991179715e-05, - "loss": 1.499, + "epoch": 0.59, + "grad_norm": 22.802858352661133, + "learning_rate": 1.606994937873907e-05, + "loss": 3.4538, "step": 4698 }, { - "epoch": 1.41, - "grad_norm": 22.188568115234375, - "learning_rate": 1.0582339380575324e-05, - "loss": 2.2641, + "epoch": 0.59, + "grad_norm": 14.071301460266113, + "learning_rate": 1.6069112663682384e-05, + "loss": 1.1313, "step": 4699 }, { - "epoch": 1.41, - "grad_norm": 22.298145294189453, - "learning_rate": 1.0580334769970934e-05, - "loss": 1.9698, + "epoch": 0.59, + "grad_norm": 32.92894744873047, + "learning_rate": 1.6068275948625698e-05, + "loss": 3.067, "step": 4700 }, { - "epoch": 1.41, - "grad_norm": 16.0262508392334, - "learning_rate": 1.0578330159366544e-05, - "loss": 1.4256, + "epoch": 0.59, + "grad_norm": 10.579645156860352, + "learning_rate": 1.6067439233569008e-05, + "loss": 1.6233, "step": 4701 }, { - "epoch": 1.41, - "grad_norm": 7.636526584625244, - "learning_rate": 1.0576325548762154e-05, - "loss": 0.9309, + "epoch": 0.59, + "grad_norm": 44.29985809326172, + "learning_rate": 1.606660251851232e-05, + "loss": 1.7801, "step": 4702 }, { - "epoch": 1.41, - "grad_norm": 16.564451217651367, - "learning_rate": 1.0574320938157762e-05, - "loss": 0.9893, + "epoch": 0.59, + "grad_norm": 15.755998611450195, + "learning_rate": 1.6065765803455635e-05, + "loss": 2.3343, "step": 4703 }, { - "epoch": 1.41, - "grad_norm": 12.611454963684082, - "learning_rate": 1.0572316327553374e-05, - "loss": 1.1581, + "epoch": 0.59, + "grad_norm": 10.83920669555664, + "learning_rate": 1.6064929088398946e-05, + "loss": 2.1506, "step": 4704 }, { - "epoch": 1.41, - "grad_norm": 19.108123779296875, - "learning_rate": 1.0570311716948984e-05, - "loss": 2.3804, + "epoch": 0.59, + "grad_norm": 37.44891357421875, + "learning_rate": 1.606409237334226e-05, + "loss": 3.1074, "step": 4705 }, { - "epoch": 1.41, - "grad_norm": 28.391921997070312, - "learning_rate": 1.0568307106344593e-05, - "loss": 1.5563, + "epoch": 0.59, + "grad_norm": 16.268556594848633, + "learning_rate": 1.6063255658285573e-05, + "loss": 2.9378, "step": 4706 }, { - "epoch": 1.42, - "grad_norm": 14.404617309570312, - "learning_rate": 1.0566302495740205e-05, - "loss": 1.6261, + "epoch": 0.59, + "grad_norm": 7.230185031890869, + "learning_rate": 1.6062418943228886e-05, + "loss": 2.6047, "step": 4707 }, { - "epoch": 1.42, - "grad_norm": 16.0683650970459, - "learning_rate": 1.0564297885135813e-05, - "loss": 1.5423, + "epoch": 0.59, + "grad_norm": 42.817569732666016, + "learning_rate": 1.6061582228172197e-05, + "loss": 2.9179, "step": 4708 }, { - "epoch": 1.42, - "grad_norm": 15.672560691833496, - "learning_rate": 1.0562293274531423e-05, - "loss": 1.2338, + "epoch": 0.59, + "grad_norm": 11.923335075378418, + "learning_rate": 1.606074551311551e-05, + "loss": 1.2606, "step": 4709 }, { - "epoch": 1.42, - "grad_norm": 63.09310531616211, - "learning_rate": 1.0560288663927035e-05, - "loss": 3.1458, + "epoch": 0.59, + "grad_norm": 18.071735382080078, + "learning_rate": 1.6059908798058824e-05, + "loss": 1.5856, "step": 4710 }, { - "epoch": 1.42, - "grad_norm": 31.956239700317383, - "learning_rate": 1.0558284053322643e-05, - "loss": 1.1543, + "epoch": 0.59, + "grad_norm": 24.506731033325195, + "learning_rate": 1.6059072083002134e-05, + "loss": 2.0624, "step": 4711 }, { - "epoch": 1.42, - "grad_norm": 18.227100372314453, - "learning_rate": 1.0556279442718253e-05, - "loss": 1.5385, + "epoch": 0.59, + "grad_norm": 17.046676635742188, + "learning_rate": 1.6058235367945448e-05, + "loss": 1.8085, "step": 4712 }, { - "epoch": 1.42, - "grad_norm": 17.809656143188477, - "learning_rate": 1.0554274832113862e-05, - "loss": 1.6656, + "epoch": 0.59, + "grad_norm": 10.442744255065918, + "learning_rate": 1.605739865288876e-05, + "loss": 2.6301, "step": 4713 }, { - "epoch": 1.42, - "grad_norm": 14.160889625549316, - "learning_rate": 1.0552270221509473e-05, - "loss": 1.4603, + "epoch": 0.59, + "grad_norm": 11.123990058898926, + "learning_rate": 1.6056561937832075e-05, + "loss": 2.6547, "step": 4714 }, { - "epoch": 1.42, - "grad_norm": 19.26714324951172, - "learning_rate": 1.0550265610905082e-05, - "loss": 1.4625, + "epoch": 0.59, + "grad_norm": 7.235197067260742, + "learning_rate": 1.6055725222775385e-05, + "loss": 0.9485, "step": 4715 }, { - "epoch": 1.42, - "grad_norm": 14.98919677734375, - "learning_rate": 1.0548261000300692e-05, - "loss": 1.7723, + "epoch": 0.59, + "grad_norm": 14.619832038879395, + "learning_rate": 1.60548885077187e-05, + "loss": 1.2491, "step": 4716 }, { - "epoch": 1.42, - "grad_norm": 15.679314613342285, - "learning_rate": 1.0546256389696304e-05, - "loss": 2.7869, + "epoch": 0.59, + "grad_norm": 22.21745491027832, + "learning_rate": 1.605405179266201e-05, + "loss": 2.5148, "step": 4717 }, { - "epoch": 1.42, - "grad_norm": 21.745820999145508, - "learning_rate": 1.0544251779091912e-05, - "loss": 1.7639, + "epoch": 0.59, + "grad_norm": 25.608308792114258, + "learning_rate": 1.6053215077605323e-05, + "loss": 3.2723, "step": 4718 }, { - "epoch": 1.42, - "grad_norm": 36.71981430053711, - "learning_rate": 1.0542247168487522e-05, - "loss": 1.5607, + "epoch": 0.59, + "grad_norm": 12.341256141662598, + "learning_rate": 1.6052378362548637e-05, + "loss": 2.5153, "step": 4719 }, { - "epoch": 1.42, - "grad_norm": 37.882877349853516, - "learning_rate": 1.054024255788313e-05, - "loss": 2.3179, + "epoch": 0.59, + "grad_norm": 8.019450187683105, + "learning_rate": 1.6051541647491947e-05, + "loss": 2.0929, "step": 4720 }, { - "epoch": 1.42, - "grad_norm": 13.184647560119629, - "learning_rate": 1.0538237947278742e-05, - "loss": 2.0276, + "epoch": 0.59, + "grad_norm": 36.901058197021484, + "learning_rate": 1.605070493243526e-05, + "loss": 3.1348, "step": 4721 }, { - "epoch": 1.42, - "grad_norm": 7.884564399719238, - "learning_rate": 1.053623333667435e-05, - "loss": 2.1323, + "epoch": 0.59, + "grad_norm": 15.31069278717041, + "learning_rate": 1.604986821737857e-05, + "loss": 2.0974, "step": 4722 }, { - "epoch": 1.42, - "grad_norm": 13.331478118896484, - "learning_rate": 1.053422872606996e-05, - "loss": 1.2308, + "epoch": 0.59, + "grad_norm": 12.79121208190918, + "learning_rate": 1.6049031502321885e-05, + "loss": 1.4689, "step": 4723 }, { - "epoch": 1.42, - "grad_norm": 10.356979370117188, - "learning_rate": 1.0532224115465573e-05, - "loss": 1.2406, + "epoch": 0.59, + "grad_norm": 10.151163101196289, + "learning_rate": 1.6048194787265198e-05, + "loss": 3.6915, "step": 4724 }, { - "epoch": 1.42, - "grad_norm": 27.083091735839844, - "learning_rate": 1.0530219504861181e-05, - "loss": 1.3356, + "epoch": 0.59, + "grad_norm": 15.297480583190918, + "learning_rate": 1.604735807220851e-05, + "loss": 2.0763, "step": 4725 }, { - "epoch": 1.42, - "grad_norm": 14.349416732788086, - "learning_rate": 1.0528214894256791e-05, - "loss": 1.337, + "epoch": 0.59, + "grad_norm": 10.322936058044434, + "learning_rate": 1.6046521357151822e-05, + "loss": 0.5036, "step": 4726 }, { - "epoch": 1.42, - "grad_norm": 13.042084693908691, - "learning_rate": 1.0526210283652401e-05, - "loss": 1.3127, + "epoch": 0.59, + "grad_norm": 38.53228759765625, + "learning_rate": 1.6045684642095136e-05, + "loss": 3.0067, "step": 4727 }, { - "epoch": 1.42, - "grad_norm": 39.22003936767578, - "learning_rate": 1.0524205673048011e-05, - "loss": 1.7421, + "epoch": 0.59, + "grad_norm": 8.970709800720215, + "learning_rate": 1.604484792703845e-05, + "loss": 0.9686, "step": 4728 }, { - "epoch": 1.42, - "grad_norm": 7.622201919555664, - "learning_rate": 1.0522201062443623e-05, - "loss": 0.9831, + "epoch": 0.59, + "grad_norm": 8.248995780944824, + "learning_rate": 1.604401121198176e-05, + "loss": 0.9467, "step": 4729 }, { - "epoch": 1.42, - "grad_norm": 31.560455322265625, - "learning_rate": 1.0520196451839231e-05, - "loss": 2.0777, + "epoch": 0.59, + "grad_norm": 5.467845439910889, + "learning_rate": 1.6043174496925073e-05, + "loss": 0.8413, "step": 4730 }, { - "epoch": 1.42, - "grad_norm": 13.57126235961914, - "learning_rate": 1.0518191841234841e-05, - "loss": 1.4528, + "epoch": 0.59, + "grad_norm": 12.535849571228027, + "learning_rate": 1.6042337781868387e-05, + "loss": 1.2482, "step": 4731 }, { - "epoch": 1.42, - "grad_norm": 25.839723587036133, - "learning_rate": 1.051618723063045e-05, - "loss": 1.3498, + "epoch": 0.59, + "grad_norm": 21.349803924560547, + "learning_rate": 1.6041501066811697e-05, + "loss": 3.1314, "step": 4732 }, { - "epoch": 1.42, - "grad_norm": 8.843565940856934, - "learning_rate": 1.0514182620026062e-05, - "loss": 2.0511, + "epoch": 0.59, + "grad_norm": 15.141033172607422, + "learning_rate": 1.604066435175501e-05, + "loss": 3.0181, "step": 4733 }, { - "epoch": 1.42, - "grad_norm": 13.93571949005127, - "learning_rate": 1.051217800942167e-05, - "loss": 1.4334, + "epoch": 0.59, + "grad_norm": 8.8250732421875, + "learning_rate": 1.6039827636698324e-05, + "loss": 1.0692, "step": 4734 }, { - "epoch": 1.42, - "grad_norm": 32.184539794921875, - "learning_rate": 1.051017339881728e-05, - "loss": 2.1138, + "epoch": 0.59, + "grad_norm": 9.506110191345215, + "learning_rate": 1.6038990921641638e-05, + "loss": 1.1641, "step": 4735 }, { - "epoch": 1.42, - "grad_norm": 12.731632232666016, - "learning_rate": 1.0508168788212892e-05, - "loss": 1.506, + "epoch": 0.59, + "grad_norm": 12.257560729980469, + "learning_rate": 1.603815420658495e-05, + "loss": 1.8662, "step": 4736 }, { - "epoch": 1.42, - "grad_norm": 9.572550773620605, - "learning_rate": 1.05061641776085e-05, - "loss": 2.4816, + "epoch": 0.59, + "grad_norm": 24.33102035522461, + "learning_rate": 1.6037317491528262e-05, + "loss": 1.547, "step": 4737 }, { - "epoch": 1.42, - "grad_norm": 13.963497161865234, - "learning_rate": 1.050415956700411e-05, - "loss": 1.4354, + "epoch": 0.59, + "grad_norm": 15.50571346282959, + "learning_rate": 1.6036480776471576e-05, + "loss": 1.1362, "step": 4738 }, { - "epoch": 1.42, - "grad_norm": 19.97682762145996, - "learning_rate": 1.0502154956399719e-05, - "loss": 1.7492, + "epoch": 0.59, + "grad_norm": 6.502542495727539, + "learning_rate": 1.6035644061414886e-05, + "loss": 2.4382, "step": 4739 }, { - "epoch": 1.43, - "grad_norm": 11.23585033416748, - "learning_rate": 1.050015034579533e-05, - "loss": 1.396, + "epoch": 0.59, + "grad_norm": 23.697622299194336, + "learning_rate": 1.60348073463582e-05, + "loss": 0.6613, "step": 4740 }, { - "epoch": 1.43, - "grad_norm": 21.9536075592041, - "learning_rate": 1.049814573519094e-05, - "loss": 1.1373, + "epoch": 0.59, + "grad_norm": 25.582416534423828, + "learning_rate": 1.6033970631301513e-05, + "loss": 1.6498, "step": 4741 }, { - "epoch": 1.43, - "grad_norm": 15.670050621032715, - "learning_rate": 1.0496141124586549e-05, - "loss": 1.9185, + "epoch": 0.6, + "grad_norm": 38.71111297607422, + "learning_rate": 1.6033133916244824e-05, + "loss": 2.4338, "step": 4742 }, { - "epoch": 1.43, - "grad_norm": 18.927589416503906, - "learning_rate": 1.049413651398216e-05, - "loss": 1.7097, + "epoch": 0.6, + "grad_norm": 7.77809476852417, + "learning_rate": 1.6032297201188137e-05, + "loss": 0.7501, "step": 4743 }, { - "epoch": 1.43, - "grad_norm": 14.530732154846191, - "learning_rate": 1.049213190337777e-05, - "loss": 1.1204, + "epoch": 0.6, + "grad_norm": 9.764083862304688, + "learning_rate": 1.603146048613145e-05, + "loss": 2.6929, "step": 4744 }, { - "epoch": 1.43, - "grad_norm": 29.094755172729492, - "learning_rate": 1.049012729277338e-05, - "loss": 1.787, + "epoch": 0.6, + "grad_norm": 28.068134307861328, + "learning_rate": 1.603062377107476e-05, + "loss": 1.5478, "step": 4745 }, { - "epoch": 1.43, - "grad_norm": 25.101224899291992, - "learning_rate": 1.048812268216899e-05, - "loss": 2.8473, + "epoch": 0.6, + "grad_norm": 12.991608619689941, + "learning_rate": 1.6029787056018075e-05, + "loss": 1.7992, "step": 4746 }, { - "epoch": 1.43, - "grad_norm": 17.750200271606445, - "learning_rate": 1.04861180715646e-05, - "loss": 1.8474, + "epoch": 0.6, + "grad_norm": 25.247655868530273, + "learning_rate": 1.602895034096139e-05, + "loss": 3.3891, "step": 4747 }, { - "epoch": 1.43, - "grad_norm": 22.220870971679688, - "learning_rate": 1.048411346096021e-05, - "loss": 1.8063, + "epoch": 0.6, + "grad_norm": 18.69204330444336, + "learning_rate": 1.60281136259047e-05, + "loss": 2.7039, "step": 4748 }, { - "epoch": 1.43, - "grad_norm": 7.666170120239258, - "learning_rate": 1.048210885035582e-05, - "loss": 1.4058, + "epoch": 0.6, + "grad_norm": 17.054393768310547, + "learning_rate": 1.6027276910848012e-05, + "loss": 2.4671, "step": 4749 }, { - "epoch": 1.43, - "grad_norm": 28.113630294799805, - "learning_rate": 1.048010423975143e-05, - "loss": 2.1896, + "epoch": 0.6, + "grad_norm": 13.61137580871582, + "learning_rate": 1.6026440195791323e-05, + "loss": 1.3915, "step": 4750 }, { - "epoch": 1.43, - "grad_norm": 13.658616065979004, - "learning_rate": 1.0478099629147038e-05, - "loss": 1.7455, + "epoch": 0.6, + "grad_norm": 118.35962677001953, + "learning_rate": 1.6025603480734636e-05, + "loss": 2.3395, "step": 4751 }, { - "epoch": 1.43, - "grad_norm": 11.242774963378906, - "learning_rate": 1.047609501854265e-05, - "loss": 1.9807, + "epoch": 0.6, + "grad_norm": 17.12057876586914, + "learning_rate": 1.602476676567795e-05, + "loss": 0.4615, "step": 4752 }, { - "epoch": 1.43, - "grad_norm": 11.421576499938965, - "learning_rate": 1.047409040793826e-05, - "loss": 1.5406, + "epoch": 0.6, + "grad_norm": 8.947701454162598, + "learning_rate": 1.602393005062126e-05, + "loss": 0.7204, "step": 4753 }, { - "epoch": 1.43, - "grad_norm": 24.668371200561523, - "learning_rate": 1.0472085797333868e-05, - "loss": 2.2752, + "epoch": 0.6, + "grad_norm": 10.49992561340332, + "learning_rate": 1.6023093335564574e-05, + "loss": 0.4165, "step": 4754 }, { - "epoch": 1.43, - "grad_norm": 24.07050132751465, - "learning_rate": 1.047008118672948e-05, - "loss": 1.4799, + "epoch": 0.6, + "grad_norm": 23.460378646850586, + "learning_rate": 1.6022256620507887e-05, + "loss": 2.3077, "step": 4755 }, { - "epoch": 1.43, - "grad_norm": 38.27885055541992, - "learning_rate": 1.0468076576125088e-05, - "loss": 1.5369, + "epoch": 0.6, + "grad_norm": 17.311359405517578, + "learning_rate": 1.60214199054512e-05, + "loss": 2.3508, "step": 4756 }, { - "epoch": 1.43, - "grad_norm": 20.224210739135742, - "learning_rate": 1.0466071965520699e-05, - "loss": 1.9254, + "epoch": 0.6, + "grad_norm": 9.50903034210205, + "learning_rate": 1.602058319039451e-05, + "loss": 3.0923, "step": 4757 }, { - "epoch": 1.43, - "grad_norm": 10.304624557495117, - "learning_rate": 1.0464067354916307e-05, - "loss": 1.5751, + "epoch": 0.6, + "grad_norm": 25.622928619384766, + "learning_rate": 1.6019746475337825e-05, + "loss": 2.3855, "step": 4758 }, { - "epoch": 1.43, - "grad_norm": 23.387136459350586, - "learning_rate": 1.0462062744311919e-05, - "loss": 1.2816, + "epoch": 0.6, + "grad_norm": 11.12957763671875, + "learning_rate": 1.601890976028114e-05, + "loss": 0.692, "step": 4759 }, { - "epoch": 1.43, - "grad_norm": 8.415315628051758, - "learning_rate": 1.0460058133707529e-05, - "loss": 1.2024, + "epoch": 0.6, + "grad_norm": 15.091212272644043, + "learning_rate": 1.601807304522445e-05, + "loss": 1.551, "step": 4760 }, { - "epoch": 1.43, - "grad_norm": 15.556396484375, - "learning_rate": 1.0458053523103137e-05, - "loss": 1.7196, + "epoch": 0.6, + "grad_norm": 10.973529815673828, + "learning_rate": 1.6017236330167763e-05, + "loss": 2.5152, "step": 4761 }, { - "epoch": 1.43, - "grad_norm": 70.80048370361328, - "learning_rate": 1.0456048912498749e-05, - "loss": 1.6606, + "epoch": 0.6, + "grad_norm": 10.444022178649902, + "learning_rate": 1.6016399615111076e-05, + "loss": 0.7957, "step": 4762 }, { - "epoch": 1.43, - "grad_norm": 8.032504081726074, - "learning_rate": 1.0454044301894357e-05, - "loss": 1.4636, + "epoch": 0.6, + "grad_norm": 16.062118530273438, + "learning_rate": 1.601556290005439e-05, + "loss": 1.7941, "step": 4763 }, { - "epoch": 1.43, - "grad_norm": 13.399280548095703, - "learning_rate": 1.0452039691289967e-05, - "loss": 2.0113, + "epoch": 0.6, + "grad_norm": 10.109869956970215, + "learning_rate": 1.60147261849977e-05, + "loss": 3.889, "step": 4764 }, { - "epoch": 1.43, - "grad_norm": 10.88759708404541, - "learning_rate": 1.0450035080685576e-05, - "loss": 1.7287, + "epoch": 0.6, + "grad_norm": 21.51214027404785, + "learning_rate": 1.6013889469941014e-05, + "loss": 2.4326, "step": 4765 }, { - "epoch": 1.43, - "grad_norm": 46.82052230834961, - "learning_rate": 1.0448030470081188e-05, - "loss": 2.7536, + "epoch": 0.6, + "grad_norm": 23.204986572265625, + "learning_rate": 1.6013052754884327e-05, + "loss": 2.5666, "step": 4766 }, { - "epoch": 1.43, - "grad_norm": 19.954078674316406, - "learning_rate": 1.0446025859476798e-05, - "loss": 1.6187, + "epoch": 0.6, + "grad_norm": 9.954808235168457, + "learning_rate": 1.6012216039827638e-05, + "loss": 1.3047, "step": 4767 }, { - "epoch": 1.43, - "grad_norm": 46.83697509765625, - "learning_rate": 1.0444021248872406e-05, - "loss": 2.2134, + "epoch": 0.6, + "grad_norm": 13.861639976501465, + "learning_rate": 1.601137932477095e-05, + "loss": 3.0222, "step": 4768 }, { - "epoch": 1.43, - "grad_norm": 29.12272834777832, - "learning_rate": 1.0442016638268018e-05, - "loss": 2.3703, + "epoch": 0.6, + "grad_norm": 16.373212814331055, + "learning_rate": 1.6010542609714265e-05, + "loss": 1.7138, "step": 4769 }, { - "epoch": 1.43, - "grad_norm": 37.321956634521484, - "learning_rate": 1.0440012027663626e-05, - "loss": 1.2239, + "epoch": 0.6, + "grad_norm": 33.26377868652344, + "learning_rate": 1.6009705894657575e-05, + "loss": 4.864, "step": 4770 }, { - "epoch": 1.43, - "grad_norm": 18.08650779724121, - "learning_rate": 1.0438007417059236e-05, - "loss": 1.4429, + "epoch": 0.6, + "grad_norm": 21.3576602935791, + "learning_rate": 1.600886917960089e-05, + "loss": 1.6901, "step": 4771 }, { - "epoch": 1.43, - "grad_norm": 15.265766143798828, - "learning_rate": 1.0436002806454848e-05, - "loss": 1.2304, + "epoch": 0.6, + "grad_norm": 15.211508750915527, + "learning_rate": 1.6008032464544202e-05, + "loss": 2.8331, "step": 4772 }, { - "epoch": 1.44, - "grad_norm": 16.230012893676758, - "learning_rate": 1.0433998195850457e-05, - "loss": 1.2576, + "epoch": 0.6, + "grad_norm": 12.891839981079102, + "learning_rate": 1.6007195749487513e-05, + "loss": 1.9634, "step": 4773 }, { - "epoch": 1.44, - "grad_norm": 27.102861404418945, - "learning_rate": 1.0431993585246067e-05, - "loss": 2.3902, + "epoch": 0.6, + "grad_norm": 25.414566040039062, + "learning_rate": 1.6006359034430826e-05, + "loss": 3.7175, "step": 4774 }, { - "epoch": 1.44, - "grad_norm": 9.05881404876709, - "learning_rate": 1.0429988974641677e-05, - "loss": 0.8901, + "epoch": 0.6, + "grad_norm": 6.307236194610596, + "learning_rate": 1.6005522319374137e-05, + "loss": 3.0202, "step": 4775 }, { - "epoch": 1.44, - "grad_norm": 24.15192985534668, - "learning_rate": 1.0427984364037287e-05, - "loss": 1.4729, + "epoch": 0.6, + "grad_norm": 14.140206336975098, + "learning_rate": 1.600468560431745e-05, + "loss": 1.3212, "step": 4776 }, { - "epoch": 1.44, - "grad_norm": 11.738104820251465, - "learning_rate": 1.0425979753432895e-05, - "loss": 1.5979, + "epoch": 0.6, + "grad_norm": 4.296204566955566, + "learning_rate": 1.6003848889260764e-05, + "loss": 0.3131, "step": 4777 }, { - "epoch": 1.44, - "grad_norm": 17.82452392578125, - "learning_rate": 1.0423975142828507e-05, - "loss": 1.2219, + "epoch": 0.6, + "grad_norm": 41.11643981933594, + "learning_rate": 1.6003012174204074e-05, + "loss": 3.8694, "step": 4778 }, { - "epoch": 1.44, - "grad_norm": 10.79785442352295, - "learning_rate": 1.0421970532224117e-05, - "loss": 1.1237, + "epoch": 0.6, + "grad_norm": 8.5755033493042, + "learning_rate": 1.6002175459147388e-05, + "loss": 0.5878, "step": 4779 }, { - "epoch": 1.44, - "grad_norm": 66.31954193115234, - "learning_rate": 1.0419965921619725e-05, - "loss": 1.1348, + "epoch": 0.6, + "grad_norm": 15.202603340148926, + "learning_rate": 1.60013387440907e-05, + "loss": 2.3258, "step": 4780 }, { - "epoch": 1.44, - "grad_norm": 14.166802406311035, - "learning_rate": 1.0417961311015337e-05, - "loss": 1.5118, + "epoch": 0.6, + "grad_norm": 36.033565521240234, + "learning_rate": 1.6000502029034012e-05, + "loss": 4.0675, "step": 4781 }, { - "epoch": 1.44, - "grad_norm": 19.842845916748047, - "learning_rate": 1.0415956700410946e-05, - "loss": 1.22, + "epoch": 0.6, + "grad_norm": 14.063431739807129, + "learning_rate": 1.5999665313977325e-05, + "loss": 2.0848, "step": 4782 }, { - "epoch": 1.44, - "grad_norm": 19.89260482788086, - "learning_rate": 1.0413952089806556e-05, - "loss": 1.4466, + "epoch": 0.6, + "grad_norm": 19.439945220947266, + "learning_rate": 1.599882859892064e-05, + "loss": 1.6212, "step": 4783 }, { - "epoch": 1.44, - "grad_norm": 8.727401733398438, - "learning_rate": 1.0411947479202167e-05, - "loss": 1.2331, + "epoch": 0.6, + "grad_norm": 8.303755760192871, + "learning_rate": 1.5997991883863953e-05, + "loss": 1.6547, "step": 4784 }, { - "epoch": 1.44, - "grad_norm": 16.119182586669922, - "learning_rate": 1.0409942868597776e-05, - "loss": 2.1112, + "epoch": 0.6, + "grad_norm": 8.920239448547363, + "learning_rate": 1.5997155168807263e-05, + "loss": 3.9113, "step": 4785 }, { - "epoch": 1.44, - "grad_norm": 8.426682472229004, - "learning_rate": 1.0407938257993386e-05, - "loss": 0.7954, + "epoch": 0.6, + "grad_norm": 19.899768829345703, + "learning_rate": 1.5996318453750577e-05, + "loss": 2.0214, "step": 4786 }, { - "epoch": 1.44, - "grad_norm": 11.057311058044434, - "learning_rate": 1.0405933647388994e-05, - "loss": 1.4438, + "epoch": 0.6, + "grad_norm": 7.666860580444336, + "learning_rate": 1.599548173869389e-05, + "loss": 1.6604, "step": 4787 }, { - "epoch": 1.44, - "grad_norm": 11.847160339355469, - "learning_rate": 1.0403929036784606e-05, - "loss": 1.6443, + "epoch": 0.6, + "grad_norm": 23.639904022216797, + "learning_rate": 1.59946450236372e-05, + "loss": 1.0322, "step": 4788 }, { - "epoch": 1.44, - "grad_norm": 52.34111022949219, - "learning_rate": 1.0401924426180214e-05, - "loss": 1.8, + "epoch": 0.6, + "grad_norm": 31.30626106262207, + "learning_rate": 1.5993808308580514e-05, + "loss": 0.8971, "step": 4789 }, { - "epoch": 1.44, - "grad_norm": 12.100299835205078, - "learning_rate": 1.0399919815575825e-05, - "loss": 0.9117, + "epoch": 0.6, + "grad_norm": 25.765701293945312, + "learning_rate": 1.5992971593523828e-05, + "loss": 1.9441, "step": 4790 }, { - "epoch": 1.44, - "grad_norm": 25.9334774017334, - "learning_rate": 1.0397915204971436e-05, - "loss": 1.6836, + "epoch": 0.6, + "grad_norm": 10.381741523742676, + "learning_rate": 1.599213487846714e-05, + "loss": 0.5661, "step": 4791 }, { - "epoch": 1.44, - "grad_norm": 16.446218490600586, - "learning_rate": 1.0395910594367045e-05, - "loss": 1.581, + "epoch": 0.6, + "grad_norm": 13.025772094726562, + "learning_rate": 1.5991298163410452e-05, + "loss": 1.9931, "step": 4792 }, { - "epoch": 1.44, - "grad_norm": 16.541593551635742, - "learning_rate": 1.0393905983762655e-05, - "loss": 1.6899, + "epoch": 0.6, + "grad_norm": 13.313878059387207, + "learning_rate": 1.5990461448353765e-05, + "loss": 2.152, "step": 4793 }, { - "epoch": 1.44, - "grad_norm": 14.012060165405273, - "learning_rate": 1.0391901373158265e-05, - "loss": 1.3736, + "epoch": 0.6, + "grad_norm": 12.876245498657227, + "learning_rate": 1.598962473329708e-05, + "loss": 1.0786, "step": 4794 }, { - "epoch": 1.44, - "grad_norm": 18.858366012573242, - "learning_rate": 1.0389896762553875e-05, - "loss": 1.2005, + "epoch": 0.6, + "grad_norm": 12.202618598937988, + "learning_rate": 1.598878801824039e-05, + "loss": 1.5297, "step": 4795 }, { - "epoch": 1.44, - "grad_norm": 11.462150573730469, - "learning_rate": 1.0387892151949485e-05, - "loss": 1.0013, + "epoch": 0.6, + "grad_norm": 6.869583606719971, + "learning_rate": 1.5987951303183703e-05, + "loss": 0.3643, "step": 4796 }, { - "epoch": 1.44, - "grad_norm": 13.834492683410645, - "learning_rate": 1.0385887541345095e-05, - "loss": 0.8818, + "epoch": 0.6, + "grad_norm": 17.388463973999023, + "learning_rate": 1.5987114588127017e-05, + "loss": 2.3765, "step": 4797 }, { - "epoch": 1.44, - "grad_norm": 44.42348861694336, - "learning_rate": 1.0383882930740705e-05, - "loss": 2.2629, + "epoch": 0.6, + "grad_norm": 22.159502029418945, + "learning_rate": 1.5986277873070327e-05, + "loss": 2.6772, "step": 4798 }, { - "epoch": 1.44, - "grad_norm": 21.56435775756836, - "learning_rate": 1.0381878320136314e-05, - "loss": 2.646, + "epoch": 0.6, + "grad_norm": 10.427502632141113, + "learning_rate": 1.598544115801364e-05, + "loss": 2.6432, "step": 4799 }, { - "epoch": 1.44, - "grad_norm": 12.599276542663574, - "learning_rate": 1.0379873709531925e-05, - "loss": 1.2467, + "epoch": 0.6, + "grad_norm": 15.760404586791992, + "learning_rate": 1.5984604442956954e-05, + "loss": 0.9616, "step": 4800 }, { - "epoch": 1.44, - "eval_loss": 0.20445984601974487, - "eval_runtime": 43.0646, - "eval_samples_per_second": 34.344, - "eval_steps_per_second": 34.344, + "epoch": 0.6, + "eval_loss": 0.11780498921871185, + "eval_runtime": 93.8932, + "eval_samples_per_second": 37.724, + "eval_steps_per_second": 37.724, "step": 4800 }, { - "epoch": 1.44, - "grad_norm": 21.611980438232422, - "learning_rate": 1.0377869098927534e-05, - "loss": 1.8799, + "epoch": 0.6, + "grad_norm": 13.106200218200684, + "learning_rate": 1.5983767727900264e-05, + "loss": 2.8001, "step": 4801 }, { - "epoch": 1.44, - "grad_norm": 26.111543655395508, - "learning_rate": 1.0375864488323144e-05, - "loss": 1.8642, + "epoch": 0.6, + "grad_norm": 19.5357723236084, + "learning_rate": 1.5982931012843578e-05, + "loss": 3.5563, "step": 4802 }, { - "epoch": 1.44, - "grad_norm": 19.41090965270996, - "learning_rate": 1.0373859877718756e-05, - "loss": 1.8098, + "epoch": 0.6, + "grad_norm": 11.600570678710938, + "learning_rate": 1.598209429778689e-05, + "loss": 1.7516, "step": 4803 }, { - "epoch": 1.44, - "grad_norm": 23.62296485900879, - "learning_rate": 1.0371855267114364e-05, - "loss": 2.0232, + "epoch": 0.6, + "grad_norm": 63.57748031616211, + "learning_rate": 1.5981257582730202e-05, + "loss": 1.5366, "step": 4804 }, { - "epoch": 1.44, - "grad_norm": 25.114355087280273, - "learning_rate": 1.0369850656509974e-05, - "loss": 1.348, + "epoch": 0.6, + "grad_norm": 7.862771511077881, + "learning_rate": 1.5980420867673516e-05, + "loss": 1.0459, "step": 4805 }, { - "epoch": 1.44, - "grad_norm": 15.635001182556152, - "learning_rate": 1.0367846045905583e-05, - "loss": 1.4803, + "epoch": 0.6, + "grad_norm": 28.971946716308594, + "learning_rate": 1.5979584152616826e-05, + "loss": 2.5234, "step": 4806 }, { - "epoch": 1.45, - "grad_norm": 12.778668403625488, - "learning_rate": 1.0365841435301194e-05, - "loss": 1.837, + "epoch": 0.6, + "grad_norm": 16.047687530517578, + "learning_rate": 1.597874743756014e-05, + "loss": 1.0227, "step": 4807 }, { - "epoch": 1.45, - "grad_norm": 19.424400329589844, - "learning_rate": 1.0363836824696803e-05, - "loss": 1.6641, + "epoch": 0.6, + "grad_norm": 13.568920135498047, + "learning_rate": 1.5977910722503453e-05, + "loss": 2.0609, "step": 4808 }, { - "epoch": 1.45, - "grad_norm": 9.217851638793945, - "learning_rate": 1.0361832214092413e-05, - "loss": 0.925, + "epoch": 0.6, + "grad_norm": 11.14881420135498, + "learning_rate": 1.5977074007446763e-05, + "loss": 1.15, "step": 4809 }, { - "epoch": 1.45, - "grad_norm": 26.674348831176758, - "learning_rate": 1.0359827603488025e-05, - "loss": 1.5977, + "epoch": 0.6, + "grad_norm": 13.92988109588623, + "learning_rate": 1.5976237292390077e-05, + "loss": 3.2552, "step": 4810 }, { - "epoch": 1.45, - "grad_norm": 13.42351245880127, - "learning_rate": 1.0357822992883633e-05, - "loss": 1.3927, + "epoch": 0.6, + "grad_norm": 10.029170036315918, + "learning_rate": 1.597540057733339e-05, + "loss": 1.4787, "step": 4811 }, { - "epoch": 1.45, - "grad_norm": 36.85655975341797, - "learning_rate": 1.0355818382279243e-05, - "loss": 1.8373, + "epoch": 0.6, + "grad_norm": 16.906906127929688, + "learning_rate": 1.5974563862276704e-05, + "loss": 1.2747, "step": 4812 }, { - "epoch": 1.45, - "grad_norm": 49.87083435058594, - "learning_rate": 1.0353813771674851e-05, - "loss": 1.6954, + "epoch": 0.6, + "grad_norm": 28.421478271484375, + "learning_rate": 1.5973727147220015e-05, + "loss": 2.1904, "step": 4813 }, { - "epoch": 1.45, - "grad_norm": 30.802719116210938, - "learning_rate": 1.0351809161070463e-05, - "loss": 2.0961, + "epoch": 0.6, + "grad_norm": 7.58961296081543, + "learning_rate": 1.5972890432163328e-05, + "loss": 1.2808, "step": 4814 }, { - "epoch": 1.45, - "grad_norm": 22.26266860961914, - "learning_rate": 1.0349804550466073e-05, - "loss": 1.3555, + "epoch": 0.6, + "grad_norm": 19.453166961669922, + "learning_rate": 1.5972053717106642e-05, + "loss": 2.7334, "step": 4815 }, { - "epoch": 1.45, - "grad_norm": 18.78736114501953, - "learning_rate": 1.0347799939861682e-05, - "loss": 1.4008, + "epoch": 0.6, + "grad_norm": 12.650189399719238, + "learning_rate": 1.5971217002049952e-05, + "loss": 2.8295, "step": 4816 }, { - "epoch": 1.45, - "grad_norm": 10.189375877380371, - "learning_rate": 1.0345795329257293e-05, - "loss": 0.6236, + "epoch": 0.6, + "grad_norm": 14.469470977783203, + "learning_rate": 1.5970380286993266e-05, + "loss": 1.7788, "step": 4817 }, { - "epoch": 1.45, - "grad_norm": 14.57485580444336, - "learning_rate": 1.0343790718652902e-05, - "loss": 1.3009, + "epoch": 0.6, + "grad_norm": 10.258919715881348, + "learning_rate": 1.596954357193658e-05, + "loss": 0.8568, "step": 4818 }, { - "epoch": 1.45, - "grad_norm": 16.516605377197266, - "learning_rate": 1.0341786108048512e-05, - "loss": 1.3581, + "epoch": 0.6, + "grad_norm": 11.894331932067871, + "learning_rate": 1.596870685687989e-05, + "loss": 1.9034, "step": 4819 }, { - "epoch": 1.45, - "grad_norm": 12.826788902282715, - "learning_rate": 1.0339781497444122e-05, - "loss": 1.9941, + "epoch": 0.6, + "grad_norm": 8.998530387878418, + "learning_rate": 1.5967870141823203e-05, + "loss": 1.0493, "step": 4820 }, { - "epoch": 1.45, - "grad_norm": 43.45224380493164, - "learning_rate": 1.0337776886839732e-05, - "loss": 1.8419, + "epoch": 0.61, + "grad_norm": 14.476089477539062, + "learning_rate": 1.5967033426766517e-05, + "loss": 1.9384, "step": 4821 }, { - "epoch": 1.45, - "grad_norm": 33.118247985839844, - "learning_rate": 1.0335772276235342e-05, - "loss": 1.7156, + "epoch": 0.61, + "grad_norm": 23.7738094329834, + "learning_rate": 1.596619671170983e-05, + "loss": 0.8177, "step": 4822 }, { - "epoch": 1.45, - "grad_norm": 29.060731887817383, - "learning_rate": 1.0333767665630952e-05, - "loss": 1.5526, + "epoch": 0.61, + "grad_norm": 5.879161357879639, + "learning_rate": 1.596535999665314e-05, + "loss": 1.3281, "step": 4823 }, { - "epoch": 1.45, - "grad_norm": 30.88375473022461, - "learning_rate": 1.0331763055026562e-05, - "loss": 1.9801, + "epoch": 0.61, + "grad_norm": 6.781850814819336, + "learning_rate": 1.5964523281596455e-05, + "loss": 2.4006, "step": 4824 }, { - "epoch": 1.45, - "grad_norm": 26.168588638305664, - "learning_rate": 1.032975844442217e-05, - "loss": 1.63, + "epoch": 0.61, + "grad_norm": 13.156648635864258, + "learning_rate": 1.5963686566539768e-05, + "loss": 1.1909, "step": 4825 }, { - "epoch": 1.45, - "grad_norm": 20.537208557128906, - "learning_rate": 1.0327753833817783e-05, - "loss": 1.582, + "epoch": 0.61, + "grad_norm": 51.71833801269531, + "learning_rate": 1.596284985148308e-05, + "loss": 2.3185, "step": 4826 }, { - "epoch": 1.45, - "grad_norm": 15.51192855834961, - "learning_rate": 1.0325749223213393e-05, - "loss": 2.2813, + "epoch": 0.61, + "grad_norm": 15.952625274658203, + "learning_rate": 1.5962013136426392e-05, + "loss": 2.0639, "step": 4827 }, { - "epoch": 1.45, - "grad_norm": 18.50126838684082, - "learning_rate": 1.0323744612609001e-05, - "loss": 1.5379, + "epoch": 0.61, + "grad_norm": 13.1610107421875, + "learning_rate": 1.5961176421369702e-05, + "loss": 1.4908, "step": 4828 }, { - "epoch": 1.45, - "grad_norm": 19.359989166259766, - "learning_rate": 1.0321740002004613e-05, - "loss": 1.4196, + "epoch": 0.61, + "grad_norm": 7.100407600402832, + "learning_rate": 1.5960339706313016e-05, + "loss": 0.9432, "step": 4829 }, { - "epoch": 1.45, - "grad_norm": 71.47821807861328, - "learning_rate": 1.0319735391400221e-05, - "loss": 2.627, + "epoch": 0.61, + "grad_norm": 11.82643985748291, + "learning_rate": 1.595950299125633e-05, + "loss": 1.1546, "step": 4830 }, { - "epoch": 1.45, - "grad_norm": 8.43783950805664, - "learning_rate": 1.0317730780795831e-05, - "loss": 1.1434, + "epoch": 0.61, + "grad_norm": 38.60078811645508, + "learning_rate": 1.595866627619964e-05, + "loss": 2.5733, "step": 4831 }, { - "epoch": 1.45, - "grad_norm": 49.501930236816406, - "learning_rate": 1.031572617019144e-05, - "loss": 2.0673, + "epoch": 0.61, + "grad_norm": 16.149267196655273, + "learning_rate": 1.5957829561142954e-05, + "loss": 2.1974, "step": 4832 }, { - "epoch": 1.45, - "grad_norm": 11.1619234085083, - "learning_rate": 1.0313721559587051e-05, - "loss": 1.8033, + "epoch": 0.61, + "grad_norm": 15.542160987854004, + "learning_rate": 1.5956992846086264e-05, + "loss": 2.3165, "step": 4833 }, { - "epoch": 1.45, - "grad_norm": 7.6200408935546875, - "learning_rate": 1.0311716948982662e-05, - "loss": 0.8928, + "epoch": 0.61, + "grad_norm": 11.998748779296875, + "learning_rate": 1.5956156131029578e-05, + "loss": 2.2605, "step": 4834 }, { - "epoch": 1.45, - "grad_norm": 19.349905014038086, - "learning_rate": 1.030971233837827e-05, - "loss": 1.6486, + "epoch": 0.61, + "grad_norm": 9.115984916687012, + "learning_rate": 1.595531941597289e-05, + "loss": 1.1619, "step": 4835 }, { - "epoch": 1.45, - "grad_norm": 8.616739273071289, - "learning_rate": 1.0307707727773882e-05, - "loss": 1.2562, + "epoch": 0.61, + "grad_norm": 33.47035217285156, + "learning_rate": 1.5954482700916205e-05, + "loss": 2.7648, "step": 4836 }, { - "epoch": 1.45, - "grad_norm": 30.071773529052734, - "learning_rate": 1.030570311716949e-05, - "loss": 1.4047, + "epoch": 0.61, + "grad_norm": 8.653242111206055, + "learning_rate": 1.5953645985859515e-05, + "loss": 3.0944, "step": 4837 }, { - "epoch": 1.45, - "grad_norm": 12.02312183380127, - "learning_rate": 1.03036985065651e-05, - "loss": 1.5015, + "epoch": 0.61, + "grad_norm": 6.583014965057373, + "learning_rate": 1.595280927080283e-05, + "loss": 2.3757, "step": 4838 }, { - "epoch": 1.45, - "grad_norm": 13.285801887512207, - "learning_rate": 1.0301693895960712e-05, - "loss": 1.4643, + "epoch": 0.61, + "grad_norm": 45.90184020996094, + "learning_rate": 1.5951972555746142e-05, + "loss": 2.0398, "step": 4839 }, { - "epoch": 1.46, - "grad_norm": 20.811077117919922, - "learning_rate": 1.029968928535632e-05, - "loss": 1.8669, + "epoch": 0.61, + "grad_norm": 7.804754734039307, + "learning_rate": 1.5951135840689453e-05, + "loss": 2.4884, "step": 4840 }, { - "epoch": 1.46, - "grad_norm": 21.803550720214844, - "learning_rate": 1.029768467475193e-05, - "loss": 2.0773, + "epoch": 0.61, + "grad_norm": 4.901381492614746, + "learning_rate": 1.5950299125632766e-05, + "loss": 0.7774, "step": 4841 }, { - "epoch": 1.46, - "grad_norm": 35.751792907714844, - "learning_rate": 1.0295680064147539e-05, - "loss": 2.3893, + "epoch": 0.61, + "grad_norm": 20.78880500793457, + "learning_rate": 1.594946241057608e-05, + "loss": 1.8401, "step": 4842 }, { - "epoch": 1.46, - "grad_norm": 14.943572044372559, - "learning_rate": 1.029367545354315e-05, - "loss": 1.9117, + "epoch": 0.61, + "grad_norm": 12.154851913452148, + "learning_rate": 1.5948625695519394e-05, + "loss": 2.0679, "step": 4843 }, { - "epoch": 1.46, - "grad_norm": 19.128864288330078, - "learning_rate": 1.0291670842938759e-05, - "loss": 1.9138, + "epoch": 0.61, + "grad_norm": 16.952661514282227, + "learning_rate": 1.5947788980462704e-05, + "loss": 1.7988, "step": 4844 }, { - "epoch": 1.46, - "grad_norm": 17.63079833984375, - "learning_rate": 1.028966623233437e-05, - "loss": 1.2719, + "epoch": 0.61, + "grad_norm": 6.197364330291748, + "learning_rate": 1.5946952265406018e-05, + "loss": 1.1393, "step": 4845 }, { - "epoch": 1.46, - "grad_norm": 40.6061897277832, - "learning_rate": 1.028766162172998e-05, - "loss": 1.8636, + "epoch": 0.61, + "grad_norm": 61.77532196044922, + "learning_rate": 1.594611555034933e-05, + "loss": 3.0282, "step": 4846 }, { - "epoch": 1.46, - "grad_norm": 11.293282508850098, - "learning_rate": 1.028565701112559e-05, - "loss": 1.1617, + "epoch": 0.61, + "grad_norm": 38.84717559814453, + "learning_rate": 1.594527883529264e-05, + "loss": 0.968, "step": 4847 }, { - "epoch": 1.46, - "grad_norm": 7.563941955566406, - "learning_rate": 1.0283652400521201e-05, - "loss": 1.266, + "epoch": 0.61, + "grad_norm": 10.309555053710938, + "learning_rate": 1.5944442120235955e-05, + "loss": 2.0344, "step": 4848 }, { - "epoch": 1.46, - "grad_norm": 37.318321228027344, - "learning_rate": 1.028164778991681e-05, - "loss": 4.1312, + "epoch": 0.61, + "grad_norm": 13.139561653137207, + "learning_rate": 1.594360540517927e-05, + "loss": 3.2425, "step": 4849 }, { - "epoch": 1.46, - "grad_norm": 15.308874130249023, - "learning_rate": 1.027964317931242e-05, - "loss": 1.563, + "epoch": 0.61, + "grad_norm": 16.187753677368164, + "learning_rate": 1.5942768690122582e-05, + "loss": 2.3351, "step": 4850 }, { - "epoch": 1.46, - "grad_norm": 12.4258451461792, - "learning_rate": 1.0277638568708028e-05, - "loss": 1.7674, + "epoch": 0.61, + "grad_norm": 9.564990997314453, + "learning_rate": 1.5941931975065893e-05, + "loss": 1.5682, "step": 4851 }, { - "epoch": 1.46, - "grad_norm": 12.700484275817871, - "learning_rate": 1.027563395810364e-05, - "loss": 1.2176, + "epoch": 0.61, + "grad_norm": 19.982685089111328, + "learning_rate": 1.5941095260009206e-05, + "loss": 1.553, "step": 4852 }, { - "epoch": 1.46, - "grad_norm": 26.99094581604004, - "learning_rate": 1.027362934749925e-05, - "loss": 1.5912, + "epoch": 0.61, + "grad_norm": 25.18014907836914, + "learning_rate": 1.5940258544952517e-05, + "loss": 3.4746, "step": 4853 }, { - "epoch": 1.46, - "grad_norm": 52.29587936401367, - "learning_rate": 1.0271624736894858e-05, - "loss": 2.6716, + "epoch": 0.61, + "grad_norm": 20.708629608154297, + "learning_rate": 1.593942182989583e-05, + "loss": 1.016, "step": 4854 }, { - "epoch": 1.46, - "grad_norm": 28.816272735595703, - "learning_rate": 1.026962012629047e-05, - "loss": 2.7605, + "epoch": 0.61, + "grad_norm": 11.345169067382812, + "learning_rate": 1.5938585114839144e-05, + "loss": 1.6501, "step": 4855 }, { - "epoch": 1.46, - "grad_norm": 28.791126251220703, - "learning_rate": 1.0267615515686078e-05, - "loss": 1.8248, + "epoch": 0.61, + "grad_norm": 2.4988794326782227, + "learning_rate": 1.5937748399782454e-05, + "loss": 0.2278, "step": 4856 }, { - "epoch": 1.46, - "grad_norm": 17.190807342529297, - "learning_rate": 1.0265610905081688e-05, - "loss": 1.6728, + "epoch": 0.61, + "grad_norm": 11.859201431274414, + "learning_rate": 1.5936911684725768e-05, + "loss": 1.9389, "step": 4857 }, { - "epoch": 1.46, - "grad_norm": 17.33432960510254, - "learning_rate": 1.02636062944773e-05, - "loss": 1.427, + "epoch": 0.61, + "grad_norm": 6.510630130767822, + "learning_rate": 1.593607496966908e-05, + "loss": 1.506, "step": 4858 }, { - "epoch": 1.46, - "grad_norm": 18.81448745727539, - "learning_rate": 1.0261601683872909e-05, - "loss": 1.4079, + "epoch": 0.61, + "grad_norm": 14.640543937683105, + "learning_rate": 1.593523825461239e-05, + "loss": 0.8049, "step": 4859 }, { - "epoch": 1.46, - "grad_norm": 26.18571662902832, - "learning_rate": 1.0259597073268519e-05, - "loss": 1.3066, + "epoch": 0.61, + "grad_norm": 24.115324020385742, + "learning_rate": 1.5934401539555705e-05, + "loss": 2.1795, "step": 4860 }, { - "epoch": 1.46, - "grad_norm": 22.120250701904297, - "learning_rate": 1.0257592462664127e-05, - "loss": 1.8271, + "epoch": 0.61, + "grad_norm": 23.66679573059082, + "learning_rate": 1.5933564824499016e-05, + "loss": 1.728, "step": 4861 }, { - "epoch": 1.46, - "grad_norm": 19.113744735717773, - "learning_rate": 1.0255587852059739e-05, - "loss": 0.9674, + "epoch": 0.61, + "grad_norm": 14.935659408569336, + "learning_rate": 1.593272810944233e-05, + "loss": 1.4521, "step": 4862 }, { - "epoch": 1.46, - "grad_norm": 15.18375301361084, - "learning_rate": 1.0253583241455347e-05, - "loss": 1.3229, + "epoch": 0.61, + "grad_norm": 16.17740821838379, + "learning_rate": 1.5931891394385643e-05, + "loss": 1.7848, "step": 4863 }, { - "epoch": 1.46, - "grad_norm": 14.799633979797363, - "learning_rate": 1.0251578630850957e-05, - "loss": 1.1113, + "epoch": 0.61, + "grad_norm": 17.31148910522461, + "learning_rate": 1.5931054679328957e-05, + "loss": 0.8621, "step": 4864 }, { - "epoch": 1.46, - "grad_norm": 13.80958366394043, - "learning_rate": 1.0249574020246569e-05, - "loss": 1.6729, + "epoch": 0.61, + "grad_norm": 13.355034828186035, + "learning_rate": 1.5930217964272267e-05, + "loss": 2.0393, "step": 4865 }, { - "epoch": 1.46, - "grad_norm": 38.37821578979492, - "learning_rate": 1.0247569409642177e-05, - "loss": 1.5533, + "epoch": 0.61, + "grad_norm": 8.642890930175781, + "learning_rate": 1.592938124921558e-05, + "loss": 3.2727, "step": 4866 }, { - "epoch": 1.46, - "grad_norm": 28.763893127441406, - "learning_rate": 1.0245564799037788e-05, - "loss": 1.699, + "epoch": 0.61, + "grad_norm": 26.613035202026367, + "learning_rate": 1.5928544534158894e-05, + "loss": 1.9293, "step": 4867 }, { - "epoch": 1.46, - "grad_norm": 20.876445770263672, - "learning_rate": 1.0243560188433398e-05, - "loss": 1.3333, + "epoch": 0.61, + "grad_norm": 38.60970687866211, + "learning_rate": 1.5927707819102204e-05, + "loss": 3.1967, "step": 4868 }, { - "epoch": 1.46, - "grad_norm": 52.730857849121094, - "learning_rate": 1.0241555577829008e-05, - "loss": 1.4987, + "epoch": 0.61, + "grad_norm": 14.778573989868164, + "learning_rate": 1.5926871104045518e-05, + "loss": 0.7463, "step": 4869 }, { - "epoch": 1.46, - "grad_norm": 13.893447875976562, - "learning_rate": 1.0239550967224618e-05, - "loss": 1.6037, + "epoch": 0.61, + "grad_norm": 22.723390579223633, + "learning_rate": 1.592603438898883e-05, + "loss": 2.1331, "step": 4870 }, { - "epoch": 1.46, - "grad_norm": 28.038808822631836, - "learning_rate": 1.0237546356620228e-05, - "loss": 3.0782, + "epoch": 0.61, + "grad_norm": 14.271210670471191, + "learning_rate": 1.5925197673932145e-05, + "loss": 2.4401, "step": 4871 }, { - "epoch": 1.46, - "grad_norm": 26.272857666015625, - "learning_rate": 1.0235541746015838e-05, - "loss": 1.7294, + "epoch": 0.61, + "grad_norm": 13.761286735534668, + "learning_rate": 1.5924360958875456e-05, + "loss": 1.13, "step": 4872 }, { - "epoch": 1.47, - "grad_norm": 87.54938507080078, - "learning_rate": 1.0233537135411446e-05, - "loss": 2.8123, + "epoch": 0.61, + "grad_norm": 14.678537368774414, + "learning_rate": 1.592352424381877e-05, + "loss": 1.6376, "step": 4873 }, { - "epoch": 1.47, - "grad_norm": 25.847455978393555, - "learning_rate": 1.0231532524807058e-05, - "loss": 1.3259, + "epoch": 0.61, + "grad_norm": 20.866357803344727, + "learning_rate": 1.5922687528762083e-05, + "loss": 1.9563, "step": 4874 }, { - "epoch": 1.47, - "grad_norm": 10.328527450561523, - "learning_rate": 1.0229527914202666e-05, - "loss": 0.9254, + "epoch": 0.61, + "grad_norm": 8.459383964538574, + "learning_rate": 1.5921850813705393e-05, + "loss": 0.4451, "step": 4875 }, { - "epoch": 1.47, - "grad_norm": 19.659297943115234, - "learning_rate": 1.0227523303598277e-05, - "loss": 1.1272, + "epoch": 0.61, + "grad_norm": 11.555710792541504, + "learning_rate": 1.5921014098648707e-05, + "loss": 2.7204, "step": 4876 }, { - "epoch": 1.47, - "grad_norm": 25.877544403076172, - "learning_rate": 1.0225518692993888e-05, - "loss": 1.8762, + "epoch": 0.61, + "grad_norm": 5.4839253425598145, + "learning_rate": 1.592017738359202e-05, + "loss": 0.143, "step": 4877 }, { - "epoch": 1.47, - "grad_norm": 22.409603118896484, - "learning_rate": 1.0223514082389497e-05, - "loss": 2.1701, + "epoch": 0.61, + "grad_norm": 15.43350887298584, + "learning_rate": 1.5919340668535334e-05, + "loss": 3.0729, "step": 4878 }, { - "epoch": 1.47, - "grad_norm": 48.738155364990234, - "learning_rate": 1.0221509471785107e-05, - "loss": 2.1954, + "epoch": 0.61, + "grad_norm": 8.133819580078125, + "learning_rate": 1.5918503953478644e-05, + "loss": 1.1949, "step": 4879 }, { - "epoch": 1.47, - "grad_norm": 13.183887481689453, - "learning_rate": 1.0219504861180715e-05, - "loss": 1.5571, + "epoch": 0.61, + "grad_norm": 17.95276641845703, + "learning_rate": 1.5917667238421958e-05, + "loss": 3.1655, "step": 4880 }, { - "epoch": 1.47, - "grad_norm": 18.5463809967041, - "learning_rate": 1.0217500250576327e-05, - "loss": 1.5617, + "epoch": 0.61, + "grad_norm": 15.375577926635742, + "learning_rate": 1.5916830523365268e-05, + "loss": 2.6146, "step": 4881 }, { - "epoch": 1.47, - "grad_norm": 22.341135025024414, - "learning_rate": 1.0215495639971937e-05, - "loss": 1.0747, + "epoch": 0.61, + "grad_norm": 24.520431518554688, + "learning_rate": 1.5915993808308582e-05, + "loss": 2.0791, "step": 4882 }, { - "epoch": 1.47, - "grad_norm": 10.601929664611816, - "learning_rate": 1.0213491029367545e-05, - "loss": 1.539, + "epoch": 0.61, + "grad_norm": 8.88187313079834, + "learning_rate": 1.5915157093251896e-05, + "loss": 1.4884, "step": 4883 }, { - "epoch": 1.47, - "grad_norm": 12.355496406555176, - "learning_rate": 1.0211486418763157e-05, - "loss": 1.3754, + "epoch": 0.61, + "grad_norm": 10.687667846679688, + "learning_rate": 1.5914320378195206e-05, + "loss": 0.5388, "step": 4884 }, { - "epoch": 1.47, - "grad_norm": 11.556707382202148, - "learning_rate": 1.0209481808158766e-05, - "loss": 1.5293, + "epoch": 0.61, + "grad_norm": 10.427061080932617, + "learning_rate": 1.591348366313852e-05, + "loss": 1.0136, "step": 4885 }, { - "epoch": 1.47, - "grad_norm": 19.798812866210938, - "learning_rate": 1.0207477197554376e-05, - "loss": 0.9688, + "epoch": 0.61, + "grad_norm": 11.740730285644531, + "learning_rate": 1.591264694808183e-05, + "loss": 1.8335, "step": 4886 }, { - "epoch": 1.47, - "grad_norm": 13.712343215942383, - "learning_rate": 1.0205472586949984e-05, - "loss": 1.5041, + "epoch": 0.61, + "grad_norm": 13.261872291564941, + "learning_rate": 1.5911810233025143e-05, + "loss": 2.3499, "step": 4887 }, { - "epoch": 1.47, - "grad_norm": 10.897645950317383, - "learning_rate": 1.0203467976345596e-05, - "loss": 1.0905, + "epoch": 0.61, + "grad_norm": 28.052698135375977, + "learning_rate": 1.5910973517968457e-05, + "loss": 2.8952, "step": 4888 }, { - "epoch": 1.47, - "grad_norm": 28.161396026611328, - "learning_rate": 1.0201463365741206e-05, - "loss": 1.7225, + "epoch": 0.61, + "grad_norm": 7.432820796966553, + "learning_rate": 1.5910136802911767e-05, + "loss": 1.5737, "step": 4889 }, { - "epoch": 1.47, - "grad_norm": 34.16568374633789, - "learning_rate": 1.0199458755136814e-05, - "loss": 2.0862, + "epoch": 0.61, + "grad_norm": 6.619985103607178, + "learning_rate": 1.590930008785508e-05, + "loss": 0.5595, "step": 4890 }, { - "epoch": 1.47, - "grad_norm": 19.398710250854492, - "learning_rate": 1.0197454144532426e-05, - "loss": 1.5471, + "epoch": 0.61, + "grad_norm": 11.153505325317383, + "learning_rate": 1.5908463372798395e-05, + "loss": 2.432, "step": 4891 }, { - "epoch": 1.47, - "grad_norm": 24.547252655029297, - "learning_rate": 1.0195449533928035e-05, - "loss": 3.0927, + "epoch": 0.61, + "grad_norm": 11.692172050476074, + "learning_rate": 1.5907626657741708e-05, + "loss": 0.3043, "step": 4892 }, { - "epoch": 1.47, - "grad_norm": 11.176749229431152, - "learning_rate": 1.0193444923323645e-05, - "loss": 1.1639, + "epoch": 0.61, + "grad_norm": 8.810521125793457, + "learning_rate": 1.590678994268502e-05, + "loss": 1.3017, "step": 4893 }, { - "epoch": 1.47, - "grad_norm": 50.76742172241211, - "learning_rate": 1.0191440312719255e-05, - "loss": 2.6678, + "epoch": 0.61, + "grad_norm": 12.059951782226562, + "learning_rate": 1.5905953227628332e-05, + "loss": 1.153, "step": 4894 }, { - "epoch": 1.47, - "grad_norm": 9.156387329101562, - "learning_rate": 1.0189435702114865e-05, - "loss": 0.9565, + "epoch": 0.61, + "grad_norm": 9.708504676818848, + "learning_rate": 1.5905116512571646e-05, + "loss": 1.6417, "step": 4895 }, { - "epoch": 1.47, - "grad_norm": 23.076967239379883, - "learning_rate": 1.0187431091510475e-05, - "loss": 1.6497, + "epoch": 0.61, + "grad_norm": 7.3110833168029785, + "learning_rate": 1.5904279797514956e-05, + "loss": 0.9502, "step": 4896 }, { - "epoch": 1.47, - "grad_norm": 16.60446548461914, - "learning_rate": 1.0185426480906085e-05, - "loss": 1.5919, + "epoch": 0.61, + "grad_norm": 4.893747329711914, + "learning_rate": 1.590344308245827e-05, + "loss": 0.491, "step": 4897 }, { - "epoch": 1.47, - "grad_norm": 12.027603149414062, - "learning_rate": 1.0183421870301695e-05, - "loss": 2.2724, + "epoch": 0.61, + "grad_norm": 9.931058883666992, + "learning_rate": 1.5902606367401583e-05, + "loss": 1.3111, "step": 4898 }, { - "epoch": 1.47, - "grad_norm": 18.373804092407227, - "learning_rate": 1.0181417259697303e-05, - "loss": 0.8864, + "epoch": 0.61, + "grad_norm": 10.12717056274414, + "learning_rate": 1.5901769652344897e-05, + "loss": 1.4998, "step": 4899 }, { - "epoch": 1.47, - "grad_norm": 33.86744689941406, - "learning_rate": 1.0179412649092915e-05, - "loss": 2.4096, + "epoch": 0.61, + "grad_norm": 9.866714477539062, + "learning_rate": 1.5900932937288207e-05, + "loss": 2.3386, "step": 4900 }, { - "epoch": 1.47, - "grad_norm": 19.00617218017578, - "learning_rate": 1.0177408038488525e-05, - "loss": 1.4766, + "epoch": 0.62, + "grad_norm": 5.476425647735596, + "learning_rate": 1.590009622223152e-05, + "loss": 1.6813, "step": 4901 }, { - "epoch": 1.47, - "grad_norm": 10.875265121459961, - "learning_rate": 1.0175403427884134e-05, - "loss": 1.6248, + "epoch": 0.62, + "grad_norm": 9.888223648071289, + "learning_rate": 1.5899259507174835e-05, + "loss": 2.1962, "step": 4902 }, { - "epoch": 1.47, - "grad_norm": 18.423274993896484, - "learning_rate": 1.0173398817279745e-05, - "loss": 2.1238, + "epoch": 0.62, + "grad_norm": 17.479103088378906, + "learning_rate": 1.5898422792118145e-05, + "loss": 2.7827, "step": 4903 }, { - "epoch": 1.47, - "grad_norm": 14.276087760925293, - "learning_rate": 1.0171394206675354e-05, - "loss": 1.471, + "epoch": 0.62, + "grad_norm": 23.746028900146484, + "learning_rate": 1.589758607706146e-05, + "loss": 2.0941, "step": 4904 }, { - "epoch": 1.47, - "grad_norm": 8.41922664642334, - "learning_rate": 1.0169389596070964e-05, - "loss": 0.8771, + "epoch": 0.62, + "grad_norm": 8.534255981445312, + "learning_rate": 1.5896749362004772e-05, + "loss": 1.9096, "step": 4905 }, { - "epoch": 1.48, - "grad_norm": 17.14394760131836, - "learning_rate": 1.0167384985466572e-05, - "loss": 1.0387, + "epoch": 0.62, + "grad_norm": 8.96469783782959, + "learning_rate": 1.5895912646948082e-05, + "loss": 1.7232, "step": 4906 }, { - "epoch": 1.48, - "grad_norm": 20.45330047607422, - "learning_rate": 1.0165380374862184e-05, - "loss": 1.9674, + "epoch": 0.62, + "grad_norm": 17.049060821533203, + "learning_rate": 1.5895075931891396e-05, + "loss": 1.6946, "step": 4907 }, { - "epoch": 1.48, - "grad_norm": 6.636265754699707, - "learning_rate": 1.0163375764257794e-05, - "loss": 0.3816, + "epoch": 0.62, + "grad_norm": 38.09623718261719, + "learning_rate": 1.589423921683471e-05, + "loss": 4.0612, "step": 4908 }, { - "epoch": 1.48, - "grad_norm": 15.284008979797363, - "learning_rate": 1.0161371153653403e-05, - "loss": 1.6652, + "epoch": 0.62, + "grad_norm": 12.76835823059082, + "learning_rate": 1.589340250177802e-05, + "loss": 3.0702, "step": 4909 }, { - "epoch": 1.48, - "grad_norm": 16.42159080505371, - "learning_rate": 1.0159366543049014e-05, - "loss": 2.3129, + "epoch": 0.62, + "grad_norm": 79.78744506835938, + "learning_rate": 1.5892565786721334e-05, + "loss": 2.443, "step": 4910 }, { - "epoch": 1.48, - "grad_norm": 19.10989761352539, - "learning_rate": 1.0157361932444623e-05, - "loss": 1.8581, + "epoch": 0.62, + "grad_norm": 102.937744140625, + "learning_rate": 1.5891729071664647e-05, + "loss": 1.7817, "step": 4911 }, { - "epoch": 1.48, - "grad_norm": 30.188232421875, - "learning_rate": 1.0155357321840233e-05, - "loss": 1.5959, + "epoch": 0.62, + "grad_norm": 12.582338333129883, + "learning_rate": 1.5890892356607957e-05, + "loss": 1.7731, "step": 4912 }, { - "epoch": 1.48, - "grad_norm": 12.307696342468262, - "learning_rate": 1.0153352711235845e-05, - "loss": 1.6547, + "epoch": 0.62, + "grad_norm": 14.361742973327637, + "learning_rate": 1.589005564155127e-05, + "loss": 1.8241, "step": 4913 }, { - "epoch": 1.48, - "grad_norm": 12.962136268615723, - "learning_rate": 1.0151348100631453e-05, - "loss": 1.8137, + "epoch": 0.62, + "grad_norm": 13.8981294631958, + "learning_rate": 1.588921892649458e-05, + "loss": 0.2828, "step": 4914 }, { - "epoch": 1.48, - "grad_norm": 10.3368558883667, - "learning_rate": 1.0149343490027063e-05, - "loss": 0.9236, + "epoch": 0.62, + "grad_norm": 73.4322280883789, + "learning_rate": 1.5888382211437895e-05, + "loss": 3.2836, "step": 4915 }, { - "epoch": 1.48, - "grad_norm": 20.91614532470703, - "learning_rate": 1.0147338879422673e-05, - "loss": 1.7155, + "epoch": 0.62, + "grad_norm": 18.202783584594727, + "learning_rate": 1.588754549638121e-05, + "loss": 1.3835, "step": 4916 }, { - "epoch": 1.48, - "grad_norm": 38.6691780090332, - "learning_rate": 1.0145334268818283e-05, - "loss": 2.1508, + "epoch": 0.62, + "grad_norm": 15.362526893615723, + "learning_rate": 1.588670878132452e-05, + "loss": 2.5685, "step": 4917 }, { - "epoch": 1.48, - "grad_norm": 9.685796737670898, - "learning_rate": 1.0143329658213892e-05, - "loss": 1.7707, + "epoch": 0.62, + "grad_norm": 24.820188522338867, + "learning_rate": 1.5885872066267833e-05, + "loss": 2.3165, "step": 4918 }, { - "epoch": 1.48, - "grad_norm": 14.471162796020508, - "learning_rate": 1.0141325047609503e-05, - "loss": 1.5946, + "epoch": 0.62, + "grad_norm": 110.93339538574219, + "learning_rate": 1.5885035351211146e-05, + "loss": 3.0482, "step": 4919 }, { - "epoch": 1.48, - "grad_norm": 18.22927474975586, - "learning_rate": 1.0139320437005114e-05, - "loss": 1.2375, - "step": 4920 - }, - { - "epoch": 1.48, - "eval_loss": 0.2065747231245041, - "eval_runtime": 43.2855, - "eval_samples_per_second": 34.169, - "eval_steps_per_second": 34.169, + "epoch": 0.62, + "grad_norm": 9.03571891784668, + "learning_rate": 1.588419863615446e-05, + "loss": 1.4118, "step": 4920 }, { - "epoch": 1.48, - "grad_norm": 51.773658752441406, - "learning_rate": 1.0137315826400722e-05, - "loss": 0.986, + "epoch": 0.62, + "grad_norm": 49.11772155761719, + "learning_rate": 1.588336192109777e-05, + "loss": 1.68, "step": 4921 }, { - "epoch": 1.48, - "grad_norm": 105.13095092773438, - "learning_rate": 1.0135311215796334e-05, - "loss": 2.8983, + "epoch": 0.62, + "grad_norm": 9.715534210205078, + "learning_rate": 1.5882525206041084e-05, + "loss": 0.9393, "step": 4922 }, { - "epoch": 1.48, - "grad_norm": 14.783707618713379, - "learning_rate": 1.0133306605191942e-05, - "loss": 1.9219, + "epoch": 0.62, + "grad_norm": 11.484861373901367, + "learning_rate": 1.5881688490984397e-05, + "loss": 2.2139, "step": 4923 }, { - "epoch": 1.48, - "grad_norm": 8.431492805480957, - "learning_rate": 1.0131301994587552e-05, - "loss": 0.9575, + "epoch": 0.62, + "grad_norm": 6.814791202545166, + "learning_rate": 1.5880851775927708e-05, + "loss": 1.5921, "step": 4924 }, { - "epoch": 1.48, - "grad_norm": 14.085335731506348, - "learning_rate": 1.0129297383983164e-05, - "loss": 1.2251, + "epoch": 0.62, + "grad_norm": 21.735166549682617, + "learning_rate": 1.588001506087102e-05, + "loss": 1.5475, "step": 4925 }, { - "epoch": 1.48, - "grad_norm": 17.102575302124023, - "learning_rate": 1.0127292773378772e-05, - "loss": 1.4003, + "epoch": 0.62, + "grad_norm": 21.243627548217773, + "learning_rate": 1.5879178345814335e-05, + "loss": 2.0279, "step": 4926 }, { - "epoch": 1.48, - "grad_norm": 13.873455047607422, - "learning_rate": 1.0125288162774382e-05, - "loss": 1.3603, + "epoch": 0.62, + "grad_norm": 11.919015884399414, + "learning_rate": 1.587834163075765e-05, + "loss": 1.765, "step": 4927 }, { - "epoch": 1.48, - "grad_norm": 8.495047569274902, - "learning_rate": 1.012328355216999e-05, - "loss": 1.8705, + "epoch": 0.62, + "grad_norm": 33.100257873535156, + "learning_rate": 1.587750491570096e-05, + "loss": 1.8073, "step": 4928 }, { - "epoch": 1.48, - "grad_norm": 12.480233192443848, - "learning_rate": 1.0121278941565603e-05, - "loss": 1.6846, + "epoch": 0.62, + "grad_norm": 11.556069374084473, + "learning_rate": 1.5876668200644273e-05, + "loss": 1.0835, "step": 4929 }, { - "epoch": 1.48, - "grad_norm": 9.334129333496094, - "learning_rate": 1.0119274330961211e-05, - "loss": 1.924, + "epoch": 0.62, + "grad_norm": 26.59724998474121, + "learning_rate": 1.5875831485587586e-05, + "loss": 4.0394, "step": 4930 }, { - "epoch": 1.48, - "grad_norm": 12.200242042541504, - "learning_rate": 1.0117269720356821e-05, - "loss": 1.4727, + "epoch": 0.62, + "grad_norm": 23.04380989074707, + "learning_rate": 1.5874994770530896e-05, + "loss": 2.3475, "step": 4931 }, { - "epoch": 1.48, - "grad_norm": 10.241484642028809, - "learning_rate": 1.0115265109752433e-05, - "loss": 1.2527, + "epoch": 0.62, + "grad_norm": 14.799032211303711, + "learning_rate": 1.587415805547421e-05, + "loss": 1.123, "step": 4932 }, { - "epoch": 1.48, - "grad_norm": 18.393444061279297, - "learning_rate": 1.0113260499148041e-05, - "loss": 1.9137, + "epoch": 0.62, + "grad_norm": 3.6003031730651855, + "learning_rate": 1.5873321340417524e-05, + "loss": 0.2666, "step": 4933 }, { - "epoch": 1.48, - "grad_norm": 30.996809005737305, - "learning_rate": 1.0111255888543651e-05, - "loss": 1.9626, + "epoch": 0.62, + "grad_norm": 18.916271209716797, + "learning_rate": 1.5872484625360834e-05, + "loss": 3.6269, "step": 4934 }, { - "epoch": 1.48, - "grad_norm": 22.243946075439453, - "learning_rate": 1.010925127793926e-05, - "loss": 1.4565, + "epoch": 0.62, + "grad_norm": 10.74314022064209, + "learning_rate": 1.5871647910304148e-05, + "loss": 1.4515, "step": 4935 }, { - "epoch": 1.48, - "grad_norm": 10.407564163208008, - "learning_rate": 1.0107246667334871e-05, - "loss": 1.0015, + "epoch": 0.62, + "grad_norm": 24.510046005249023, + "learning_rate": 1.587081119524746e-05, + "loss": 3.0281, "step": 4936 }, { - "epoch": 1.48, - "grad_norm": 17.139793395996094, - "learning_rate": 1.010524205673048e-05, - "loss": 1.3928, + "epoch": 0.62, + "grad_norm": 7.19443416595459, + "learning_rate": 1.586997448019077e-05, + "loss": 0.3961, "step": 4937 }, { - "epoch": 1.48, - "grad_norm": 34.3438606262207, - "learning_rate": 1.010323744612609e-05, - "loss": 1.5863, + "epoch": 0.62, + "grad_norm": 8.771178245544434, + "learning_rate": 1.5869137765134085e-05, + "loss": 2.8149, "step": 4938 }, { - "epoch": 1.48, - "grad_norm": 14.947382926940918, - "learning_rate": 1.0101232835521702e-05, - "loss": 1.3544, + "epoch": 0.62, + "grad_norm": 7.240764141082764, + "learning_rate": 1.5868301050077395e-05, + "loss": 1.5047, "step": 4939 }, { - "epoch": 1.49, - "grad_norm": 11.249651908874512, - "learning_rate": 1.009922822491731e-05, - "loss": 1.3051, + "epoch": 0.62, + "grad_norm": 43.71602249145508, + "learning_rate": 1.586746433502071e-05, + "loss": 2.9675, "step": 4940 }, { - "epoch": 1.49, - "grad_norm": 16.109973907470703, - "learning_rate": 1.009722361431292e-05, - "loss": 2.0421, + "epoch": 0.62, + "grad_norm": 31.66825294494629, + "learning_rate": 1.5866627619964023e-05, + "loss": 4.119, "step": 4941 }, { - "epoch": 1.49, - "grad_norm": 14.892985343933105, - "learning_rate": 1.009521900370853e-05, - "loss": 1.1327, + "epoch": 0.62, + "grad_norm": 10.924080848693848, + "learning_rate": 1.5865790904907333e-05, + "loss": 1.1588, "step": 4942 }, { - "epoch": 1.49, - "grad_norm": 27.952659606933594, - "learning_rate": 1.009321439310414e-05, - "loss": 2.3613, + "epoch": 0.62, + "grad_norm": 7.768029689788818, + "learning_rate": 1.5864954189850647e-05, + "loss": 1.9436, "step": 4943 }, { - "epoch": 1.49, - "grad_norm": 27.172393798828125, - "learning_rate": 1.009120978249975e-05, - "loss": 2.1473, + "epoch": 0.62, + "grad_norm": 18.65514373779297, + "learning_rate": 1.586411747479396e-05, + "loss": 3.5025, "step": 4944 }, { - "epoch": 1.49, - "grad_norm": 9.170857429504395, - "learning_rate": 1.008920517189536e-05, - "loss": 1.2239, + "epoch": 0.62, + "grad_norm": 17.466522216796875, + "learning_rate": 1.586328075973727e-05, + "loss": 2.2984, "step": 4945 }, { - "epoch": 1.49, - "grad_norm": 11.424710273742676, - "learning_rate": 1.008720056129097e-05, - "loss": 0.8955, + "epoch": 0.62, + "grad_norm": 5.136081695556641, + "learning_rate": 1.5862444044680584e-05, + "loss": 0.314, "step": 4946 }, { - "epoch": 1.49, - "grad_norm": 13.28514289855957, - "learning_rate": 1.0085195950686579e-05, - "loss": 1.6111, + "epoch": 0.62, + "grad_norm": 9.889122009277344, + "learning_rate": 1.5861607329623898e-05, + "loss": 1.0442, "step": 4947 }, { - "epoch": 1.49, - "grad_norm": 32.51649475097656, - "learning_rate": 1.008319134008219e-05, - "loss": 1.9871, + "epoch": 0.62, + "grad_norm": 30.280052185058594, + "learning_rate": 1.586077061456721e-05, + "loss": 1.293, "step": 4948 }, { - "epoch": 1.49, - "grad_norm": 15.511556625366211, - "learning_rate": 1.00811867294778e-05, - "loss": 1.37, + "epoch": 0.62, + "grad_norm": 25.252389907836914, + "learning_rate": 1.5859933899510522e-05, + "loss": 3.7356, "step": 4949 }, { - "epoch": 1.49, - "grad_norm": 16.7358341217041, - "learning_rate": 1.007918211887341e-05, - "loss": 1.6897, + "epoch": 0.62, + "grad_norm": 20.295284271240234, + "learning_rate": 1.5859097184453835e-05, + "loss": 1.943, "step": 4950 }, { - "epoch": 1.49, - "grad_norm": 13.070718765258789, - "learning_rate": 1.0077177508269021e-05, - "loss": 0.666, + "epoch": 0.62, + "grad_norm": 12.399702072143555, + "learning_rate": 1.585826046939715e-05, + "loss": 1.38, "step": 4951 }, { - "epoch": 1.49, - "grad_norm": 22.743227005004883, - "learning_rate": 1.007517289766463e-05, - "loss": 1.8756, + "epoch": 0.62, + "grad_norm": 16.72775650024414, + "learning_rate": 1.585742375434046e-05, + "loss": 1.3178, "step": 4952 }, { - "epoch": 1.49, - "grad_norm": 20.257579803466797, - "learning_rate": 1.007316828706024e-05, - "loss": 1.9218, + "epoch": 0.62, + "grad_norm": 10.180647850036621, + "learning_rate": 1.5856587039283773e-05, + "loss": 0.878, "step": 4953 }, { - "epoch": 1.49, - "grad_norm": 32.02715301513672, - "learning_rate": 1.0071163676455848e-05, - "loss": 1.2685, + "epoch": 0.62, + "grad_norm": 30.216543197631836, + "learning_rate": 1.5855750324227087e-05, + "loss": 2.0368, "step": 4954 }, { - "epoch": 1.49, - "grad_norm": 15.203670501708984, - "learning_rate": 1.006915906585146e-05, - "loss": 1.5685, + "epoch": 0.62, + "grad_norm": 17.116146087646484, + "learning_rate": 1.58549136091704e-05, + "loss": 1.4108, "step": 4955 }, { - "epoch": 1.49, - "grad_norm": 7.677938461303711, - "learning_rate": 1.006715445524707e-05, - "loss": 1.0445, + "epoch": 0.62, + "grad_norm": 6.7107768058776855, + "learning_rate": 1.585407689411371e-05, + "loss": 1.7707, "step": 4956 }, { - "epoch": 1.49, - "grad_norm": 24.447038650512695, - "learning_rate": 1.0065149844642678e-05, - "loss": 1.7323, + "epoch": 0.62, + "grad_norm": 10.454689025878906, + "learning_rate": 1.5853240179057024e-05, + "loss": 1.6508, "step": 4957 }, { - "epoch": 1.49, - "grad_norm": 12.981391906738281, - "learning_rate": 1.006314523403829e-05, - "loss": 1.9296, + "epoch": 0.62, + "grad_norm": 9.240266799926758, + "learning_rate": 1.5852403464000338e-05, + "loss": 1.6171, "step": 4958 }, { - "epoch": 1.49, - "grad_norm": 38.580570220947266, - "learning_rate": 1.0061140623433898e-05, - "loss": 1.909, + "epoch": 0.62, + "grad_norm": 34.222286224365234, + "learning_rate": 1.5851566748943648e-05, + "loss": 1.233, "step": 4959 }, { - "epoch": 1.49, - "grad_norm": 11.870725631713867, - "learning_rate": 1.0059136012829508e-05, - "loss": 1.3958, + "epoch": 0.62, + "grad_norm": 13.324649810791016, + "learning_rate": 1.5850730033886962e-05, + "loss": 1.1105, "step": 4960 }, { - "epoch": 1.49, - "grad_norm": 7.389049530029297, - "learning_rate": 1.0057131402225117e-05, - "loss": 0.8741, + "epoch": 0.62, + "grad_norm": 15.83174991607666, + "learning_rate": 1.5849893318830275e-05, + "loss": 1.5953, "step": 4961 }, { - "epoch": 1.49, - "grad_norm": 12.032258033752441, - "learning_rate": 1.0055126791620729e-05, - "loss": 1.4206, + "epoch": 0.62, + "grad_norm": 16.952129364013672, + "learning_rate": 1.5849056603773586e-05, + "loss": 2.568, "step": 4962 }, { - "epoch": 1.49, - "grad_norm": 22.42498207092285, - "learning_rate": 1.0053122181016339e-05, - "loss": 2.5246, + "epoch": 0.62, + "grad_norm": 6.830140590667725, + "learning_rate": 1.58482198887169e-05, + "loss": 1.5096, "step": 4963 }, { - "epoch": 1.49, - "grad_norm": 12.639444351196289, - "learning_rate": 1.0051117570411949e-05, - "loss": 1.8031, + "epoch": 0.62, + "grad_norm": 9.433883666992188, + "learning_rate": 1.584738317366021e-05, + "loss": 2.3211, "step": 4964 }, { - "epoch": 1.49, - "grad_norm": 33.977386474609375, - "learning_rate": 1.0049112959807559e-05, - "loss": 1.595, + "epoch": 0.62, + "grad_norm": 33.27182388305664, + "learning_rate": 1.5846546458603523e-05, + "loss": 4.0639, "step": 4965 }, { - "epoch": 1.49, - "grad_norm": 13.614667892456055, - "learning_rate": 1.0047108349203167e-05, - "loss": 0.975, + "epoch": 0.62, + "grad_norm": 15.093009948730469, + "learning_rate": 1.5845709743546837e-05, + "loss": 1.7505, "step": 4966 }, { - "epoch": 1.49, - "grad_norm": 18.450183868408203, - "learning_rate": 1.0045103738598779e-05, - "loss": 2.5177, + "epoch": 0.62, + "grad_norm": 5.886056900024414, + "learning_rate": 1.5844873028490147e-05, + "loss": 0.6074, "step": 4967 }, { - "epoch": 1.49, - "grad_norm": 15.293846130371094, - "learning_rate": 1.0043099127994389e-05, - "loss": 1.472, + "epoch": 0.62, + "grad_norm": 22.552860260009766, + "learning_rate": 1.584403631343346e-05, + "loss": 1.9228, "step": 4968 }, { - "epoch": 1.49, - "grad_norm": 16.424936294555664, - "learning_rate": 1.0041094517389997e-05, - "loss": 1.477, + "epoch": 0.62, + "grad_norm": 13.729313850402832, + "learning_rate": 1.5843199598376774e-05, + "loss": 2.583, "step": 4969 }, { - "epoch": 1.49, - "grad_norm": 13.09062385559082, - "learning_rate": 1.003908990678561e-05, - "loss": 1.414, + "epoch": 0.62, + "grad_norm": 8.290722846984863, + "learning_rate": 1.5842362883320085e-05, + "loss": 0.8759, "step": 4970 }, { - "epoch": 1.49, - "grad_norm": 21.331588745117188, - "learning_rate": 1.0037085296181218e-05, - "loss": 2.6522, + "epoch": 0.62, + "grad_norm": 10.95855712890625, + "learning_rate": 1.58415261682634e-05, + "loss": 1.096, "step": 4971 }, { - "epoch": 1.49, - "grad_norm": 44.23732376098633, - "learning_rate": 1.0035080685576828e-05, - "loss": 1.6369, + "epoch": 0.62, + "grad_norm": 17.684823989868164, + "learning_rate": 1.5840689453206712e-05, + "loss": 2.0879, "step": 4972 }, { - "epoch": 1.5, - "grad_norm": 13.58181095123291, - "learning_rate": 1.0033076074972436e-05, - "loss": 1.6649, + "epoch": 0.62, + "grad_norm": 7.833134174346924, + "learning_rate": 1.5839852738150022e-05, + "loss": 1.6442, "step": 4973 }, { - "epoch": 1.5, - "grad_norm": 25.254730224609375, - "learning_rate": 1.0031071464368048e-05, - "loss": 2.0687, + "epoch": 0.62, + "grad_norm": 22.88947296142578, + "learning_rate": 1.5839016023093336e-05, + "loss": 2.8722, "step": 4974 }, { - "epoch": 1.5, - "grad_norm": 10.236112594604492, - "learning_rate": 1.0029066853763658e-05, - "loss": 1.3942, + "epoch": 0.62, + "grad_norm": 14.389993667602539, + "learning_rate": 1.583817930803665e-05, + "loss": 1.0751, "step": 4975 }, { - "epoch": 1.5, - "grad_norm": 13.936944961547852, - "learning_rate": 1.0027062243159266e-05, - "loss": 2.3225, + "epoch": 0.62, + "grad_norm": 6.895519733428955, + "learning_rate": 1.5837342592979963e-05, + "loss": 0.6522, "step": 4976 }, { - "epoch": 1.5, - "grad_norm": 11.376901626586914, - "learning_rate": 1.0025057632554878e-05, - "loss": 1.3294, + "epoch": 0.62, + "grad_norm": 9.349720001220703, + "learning_rate": 1.5836505877923273e-05, + "loss": 0.8121, "step": 4977 }, { - "epoch": 1.5, - "grad_norm": 15.195333480834961, - "learning_rate": 1.0023053021950487e-05, - "loss": 2.3572, + "epoch": 0.62, + "grad_norm": 16.215843200683594, + "learning_rate": 1.5835669162866587e-05, + "loss": 1.9194, "step": 4978 }, { - "epoch": 1.5, - "grad_norm": 35.94542694091797, - "learning_rate": 1.0021048411346097e-05, - "loss": 3.3368, + "epoch": 0.62, + "grad_norm": 10.783334732055664, + "learning_rate": 1.58348324478099e-05, + "loss": 0.9586, "step": 4979 }, { - "epoch": 1.5, - "grad_norm": 21.131452560424805, - "learning_rate": 1.0019043800741705e-05, - "loss": 1.618, + "epoch": 0.62, + "grad_norm": 18.04204750061035, + "learning_rate": 1.583399573275321e-05, + "loss": 1.4249, "step": 4980 }, { - "epoch": 1.5, - "grad_norm": 22.635162353515625, - "learning_rate": 1.0017039190137317e-05, - "loss": 2.0817, + "epoch": 0.63, + "grad_norm": 16.274057388305664, + "learning_rate": 1.5833159017696525e-05, + "loss": 0.9971, "step": 4981 }, { - "epoch": 1.5, - "grad_norm": 22.40682601928711, - "learning_rate": 1.0015034579532927e-05, - "loss": 1.2649, + "epoch": 0.63, + "grad_norm": 12.840714454650879, + "learning_rate": 1.583232230263984e-05, + "loss": 1.802, "step": 4982 }, { - "epoch": 1.5, - "grad_norm": 7.878605365753174, - "learning_rate": 1.0013029968928535e-05, - "loss": 1.3573, + "epoch": 0.63, + "grad_norm": 9.13722038269043, + "learning_rate": 1.5831485587583152e-05, + "loss": 1.0917, "step": 4983 }, { - "epoch": 1.5, - "grad_norm": 19.825239181518555, - "learning_rate": 1.0011025358324147e-05, - "loss": 1.9824, + "epoch": 0.63, + "grad_norm": 19.6453857421875, + "learning_rate": 1.5830648872526462e-05, + "loss": 3.3396, "step": 4984 }, { - "epoch": 1.5, - "grad_norm": 14.39383602142334, - "learning_rate": 1.0009020747719755e-05, - "loss": 1.8926, + "epoch": 0.63, + "grad_norm": 13.792831420898438, + "learning_rate": 1.5829812157469776e-05, + "loss": 1.3786, "step": 4985 }, { - "epoch": 1.5, - "grad_norm": 21.175561904907227, - "learning_rate": 1.0007016137115366e-05, - "loss": 1.114, + "epoch": 0.63, + "grad_norm": 9.151402473449707, + "learning_rate": 1.582897544241309e-05, + "loss": 0.5139, "step": 4986 }, { - "epoch": 1.5, - "grad_norm": 17.892271041870117, - "learning_rate": 1.0005011526510977e-05, - "loss": 1.2682, + "epoch": 0.63, + "grad_norm": 13.77298355102539, + "learning_rate": 1.58281387273564e-05, + "loss": 2.9223, "step": 4987 }, { - "epoch": 1.5, - "grad_norm": 14.41032600402832, - "learning_rate": 1.0003006915906586e-05, - "loss": 1.3076, + "epoch": 0.63, + "grad_norm": 13.152591705322266, + "learning_rate": 1.5827302012299713e-05, + "loss": 1.7311, "step": 4988 }, { - "epoch": 1.5, - "grad_norm": 10.802579879760742, - "learning_rate": 1.0001002305302196e-05, - "loss": 1.1787, + "epoch": 0.63, + "grad_norm": 13.780109405517578, + "learning_rate": 1.5826465297243027e-05, + "loss": 0.9852, "step": 4989 }, { - "epoch": 1.5, - "grad_norm": 16.22612762451172, - "learning_rate": 9.998997694697806e-06, - "loss": 1.8036, + "epoch": 0.63, + "grad_norm": 18.66058921813965, + "learning_rate": 1.5825628582186337e-05, + "loss": 3.3515, "step": 4990 }, { - "epoch": 1.5, - "grad_norm": 9.207633018493652, - "learning_rate": 9.996993084093416e-06, - "loss": 1.2887, + "epoch": 0.63, + "grad_norm": 16.108327865600586, + "learning_rate": 1.582479186712965e-05, + "loss": 0.7064, "step": 4991 }, { - "epoch": 1.5, - "grad_norm": 109.87674713134766, - "learning_rate": 9.994988473489026e-06, - "loss": 2.7811, + "epoch": 0.63, + "grad_norm": 33.89423751831055, + "learning_rate": 1.582395515207296e-05, + "loss": 2.1219, "step": 4992 }, { - "epoch": 1.5, - "grad_norm": 10.483904838562012, - "learning_rate": 9.992983862884636e-06, - "loss": 2.3548, + "epoch": 0.63, + "grad_norm": 20.117361068725586, + "learning_rate": 1.5823118437016275e-05, + "loss": 2.2392, "step": 4993 }, { - "epoch": 1.5, - "grad_norm": 8.791914939880371, - "learning_rate": 9.990979252280245e-06, - "loss": 1.0692, + "epoch": 0.63, + "grad_norm": 2.8925938606262207, + "learning_rate": 1.582228172195959e-05, + "loss": 0.053, "step": 4994 }, { - "epoch": 1.5, - "grad_norm": 12.697139739990234, - "learning_rate": 9.988974641675855e-06, - "loss": 1.2917, + "epoch": 0.63, + "grad_norm": 11.56701946258545, + "learning_rate": 1.58214450069029e-05, + "loss": 1.919, "step": 4995 }, { - "epoch": 1.5, - "grad_norm": 15.190261840820312, - "learning_rate": 9.986970031071466e-06, - "loss": 1.7266, + "epoch": 0.63, + "grad_norm": 12.960175514221191, + "learning_rate": 1.5820608291846212e-05, + "loss": 1.4824, "step": 4996 }, { - "epoch": 1.5, - "grad_norm": 13.911056518554688, - "learning_rate": 9.984965420467075e-06, - "loss": 1.2805, + "epoch": 0.63, + "grad_norm": 28.917999267578125, + "learning_rate": 1.5819771576789526e-05, + "loss": 2.58, "step": 4997 }, { - "epoch": 1.5, - "grad_norm": 10.715677261352539, - "learning_rate": 9.982960809862685e-06, - "loss": 2.3035, + "epoch": 0.63, + "grad_norm": 15.101088523864746, + "learning_rate": 1.5818934861732836e-05, + "loss": 1.1439, "step": 4998 }, { - "epoch": 1.5, - "grad_norm": 23.649269104003906, - "learning_rate": 9.980956199258295e-06, - "loss": 1.6413, + "epoch": 0.63, + "grad_norm": 17.131656646728516, + "learning_rate": 1.581809814667615e-05, + "loss": 3.1353, "step": 4999 }, { - "epoch": 1.5, - "grad_norm": 9.821858406066895, - "learning_rate": 9.978951588653905e-06, - "loss": 1.1268, + "epoch": 0.63, + "grad_norm": 27.308841705322266, + "learning_rate": 1.5817261431619464e-05, + "loss": 1.8908, "step": 5000 }, { - "epoch": 1.5, - "grad_norm": 11.555136680603027, - "learning_rate": 9.976946978049515e-06, - "loss": 1.2356, + "epoch": 0.63, + "grad_norm": 10.716662406921387, + "learning_rate": 1.5816424716562774e-05, + "loss": 1.1753, "step": 5001 }, { - "epoch": 1.5, - "grad_norm": 15.840431213378906, - "learning_rate": 9.974942367445125e-06, - "loss": 1.7111, + "epoch": 0.63, + "grad_norm": 17.947704315185547, + "learning_rate": 1.5815588001506088e-05, + "loss": 1.5808, "step": 5002 }, { - "epoch": 1.5, - "grad_norm": 13.110334396362305, - "learning_rate": 9.972937756840735e-06, - "loss": 1.3668, + "epoch": 0.63, + "grad_norm": 9.132237434387207, + "learning_rate": 1.58147512864494e-05, + "loss": 0.5576, "step": 5003 }, { - "epoch": 1.5, - "grad_norm": 15.277511596679688, - "learning_rate": 9.970933146236345e-06, - "loss": 1.524, + "epoch": 0.63, + "grad_norm": 6.144209861755371, + "learning_rate": 1.5813914571392715e-05, + "loss": 0.64, "step": 5004 }, { - "epoch": 1.5, - "grad_norm": 22.332592010498047, - "learning_rate": 9.968928535631954e-06, - "loss": 1.8393, + "epoch": 0.63, + "grad_norm": 32.854312896728516, + "learning_rate": 1.5813077856336025e-05, + "loss": 1.5851, "step": 5005 }, { - "epoch": 1.51, - "grad_norm": 10.41042709350586, - "learning_rate": 9.966923925027564e-06, - "loss": 0.9121, + "epoch": 0.63, + "grad_norm": 19.065019607543945, + "learning_rate": 1.581224114127934e-05, + "loss": 1.7912, "step": 5006 }, { - "epoch": 1.51, - "grad_norm": 17.307802200317383, - "learning_rate": 9.964919314423174e-06, - "loss": 1.2376, + "epoch": 0.63, + "grad_norm": 10.907378196716309, + "learning_rate": 1.5811404426222652e-05, + "loss": 2.303, "step": 5007 }, { - "epoch": 1.51, - "grad_norm": 28.895647048950195, - "learning_rate": 9.962914703818784e-06, - "loss": 2.9159, + "epoch": 0.63, + "grad_norm": 88.3462142944336, + "learning_rate": 1.5810567711165963e-05, + "loss": 3.0653, "step": 5008 }, { - "epoch": 1.51, - "grad_norm": 54.32232666015625, - "learning_rate": 9.960910093214394e-06, - "loss": 2.1614, + "epoch": 0.63, + "grad_norm": 14.676846504211426, + "learning_rate": 1.5809730996109276e-05, + "loss": 2.3928, "step": 5009 }, { - "epoch": 1.51, - "grad_norm": 13.556760787963867, - "learning_rate": 9.958905482610004e-06, - "loss": 0.914, + "epoch": 0.63, + "grad_norm": 15.923056602478027, + "learning_rate": 1.580889428105259e-05, + "loss": 0.9603, "step": 5010 }, { - "epoch": 1.51, - "grad_norm": 16.66999053955078, - "learning_rate": 9.956900872005614e-06, - "loss": 1.303, + "epoch": 0.63, + "grad_norm": 10.484317779541016, + "learning_rate": 1.5808057565995904e-05, + "loss": 3.553, "step": 5011 }, { - "epoch": 1.51, - "grad_norm": 23.729116439819336, - "learning_rate": 9.954896261401223e-06, - "loss": 1.1319, + "epoch": 0.63, + "grad_norm": 8.377695083618164, + "learning_rate": 1.5807220850939214e-05, + "loss": 2.7543, "step": 5012 }, { - "epoch": 1.51, - "grad_norm": 14.170336723327637, - "learning_rate": 9.952891650796833e-06, - "loss": 1.2095, + "epoch": 0.63, + "grad_norm": 12.684469223022461, + "learning_rate": 1.5806384135882528e-05, + "loss": 1.6304, "step": 5013 }, { - "epoch": 1.51, - "grad_norm": 8.873191833496094, - "learning_rate": 9.950887040192443e-06, - "loss": 1.0473, + "epoch": 0.63, + "grad_norm": 33.287166595458984, + "learning_rate": 1.580554742082584e-05, + "loss": 2.6802, "step": 5014 }, { - "epoch": 1.51, - "grad_norm": 19.656044006347656, - "learning_rate": 9.948882429588053e-06, - "loss": 1.9538, + "epoch": 0.63, + "grad_norm": 38.08937454223633, + "learning_rate": 1.580471070576915e-05, + "loss": 0.912, "step": 5015 }, { - "epoch": 1.51, - "grad_norm": 7.630486011505127, - "learning_rate": 9.946877818983663e-06, - "loss": 1.2515, + "epoch": 0.63, + "grad_norm": 13.854416847229004, + "learning_rate": 1.5803873990712465e-05, + "loss": 1.3063, "step": 5016 }, { - "epoch": 1.51, - "grad_norm": 15.626066207885742, - "learning_rate": 9.944873208379273e-06, - "loss": 1.7024, + "epoch": 0.63, + "grad_norm": 11.320171356201172, + "learning_rate": 1.5803037275655775e-05, + "loss": 1.699, "step": 5017 }, { - "epoch": 1.51, - "grad_norm": 11.820615768432617, - "learning_rate": 9.942868597774883e-06, - "loss": 1.5467, + "epoch": 0.63, + "grad_norm": 6.064815044403076, + "learning_rate": 1.580220056059909e-05, + "loss": 0.5363, "step": 5018 }, { - "epoch": 1.51, - "grad_norm": 26.351261138916016, - "learning_rate": 9.940863987170493e-06, - "loss": 1.0137, + "epoch": 0.63, + "grad_norm": 33.121761322021484, + "learning_rate": 1.5801363845542403e-05, + "loss": 1.9679, "step": 5019 }, { - "epoch": 1.51, - "grad_norm": 15.984896659851074, - "learning_rate": 9.938859376566102e-06, - "loss": 1.2989, + "epoch": 0.63, + "grad_norm": 11.829465866088867, + "learning_rate": 1.5800527130485713e-05, + "loss": 0.7114, "step": 5020 }, { - "epoch": 1.51, - "grad_norm": 23.20572853088379, - "learning_rate": 9.936854765961713e-06, - "loss": 1.8128, + "epoch": 0.63, + "grad_norm": 26.17995262145996, + "learning_rate": 1.5799690415429027e-05, + "loss": 2.2246, "step": 5021 }, { - "epoch": 1.51, - "grad_norm": 12.120548248291016, - "learning_rate": 9.934850155357323e-06, - "loss": 1.295, + "epoch": 0.63, + "grad_norm": 13.5386323928833, + "learning_rate": 1.579885370037234e-05, + "loss": 2.1091, "step": 5022 }, { - "epoch": 1.51, - "grad_norm": 14.345947265625, - "learning_rate": 9.932845544752932e-06, - "loss": 1.0609, + "epoch": 0.63, + "grad_norm": 23.967710494995117, + "learning_rate": 1.579801698531565e-05, + "loss": 1.9515, "step": 5023 }, { - "epoch": 1.51, - "grad_norm": 10.384604454040527, - "learning_rate": 9.930840934148542e-06, - "loss": 0.8389, + "epoch": 0.63, + "grad_norm": 19.2731876373291, + "learning_rate": 1.5797180270258964e-05, + "loss": 2.5988, "step": 5024 }, { - "epoch": 1.51, - "grad_norm": 50.97624588012695, - "learning_rate": 9.928836323544152e-06, - "loss": 2.2094, + "epoch": 0.63, + "grad_norm": 9.11064338684082, + "learning_rate": 1.5796343555202278e-05, + "loss": 0.5122, "step": 5025 }, { - "epoch": 1.51, - "grad_norm": 15.930320739746094, - "learning_rate": 9.926831712939762e-06, - "loss": 1.0719, + "epoch": 0.63, + "grad_norm": 11.065043449401855, + "learning_rate": 1.5795506840145588e-05, + "loss": 0.9982, "step": 5026 }, { - "epoch": 1.51, - "grad_norm": 15.929656982421875, - "learning_rate": 9.924827102335372e-06, - "loss": 1.4516, + "epoch": 0.63, + "grad_norm": 17.819351196289062, + "learning_rate": 1.5794670125088902e-05, + "loss": 1.8422, "step": 5027 }, { - "epoch": 1.51, - "grad_norm": 20.207595825195312, - "learning_rate": 9.922822491730982e-06, - "loss": 1.122, + "epoch": 0.63, + "grad_norm": 22.37653350830078, + "learning_rate": 1.5793833410032215e-05, + "loss": 1.7192, "step": 5028 }, { - "epoch": 1.51, - "grad_norm": 10.85169506072998, - "learning_rate": 9.920817881126592e-06, - "loss": 1.3796, + "epoch": 0.63, + "grad_norm": 12.90748405456543, + "learning_rate": 1.5792996694975526e-05, + "loss": 1.1331, "step": 5029 }, { - "epoch": 1.51, - "grad_norm": 40.899234771728516, - "learning_rate": 9.918813270522202e-06, - "loss": 2.79, + "epoch": 0.63, + "grad_norm": 20.128108978271484, + "learning_rate": 1.579215997991884e-05, + "loss": 1.3026, "step": 5030 }, { - "epoch": 1.51, - "grad_norm": 77.47624206542969, - "learning_rate": 9.91680865991781e-06, - "loss": 2.6326, + "epoch": 0.63, + "grad_norm": 6.115049839019775, + "learning_rate": 1.5791323264862153e-05, + "loss": 2.1955, "step": 5031 }, { - "epoch": 1.51, - "grad_norm": 11.343374252319336, - "learning_rate": 9.914804049313421e-06, - "loss": 1.2866, + "epoch": 0.63, + "grad_norm": 16.82707977294922, + "learning_rate": 1.5790486549805467e-05, + "loss": 3.8006, "step": 5032 }, { - "epoch": 1.51, - "grad_norm": 35.656246185302734, - "learning_rate": 9.912799438709033e-06, - "loss": 2.6495, + "epoch": 0.63, + "grad_norm": 9.201262474060059, + "learning_rate": 1.5789649834748777e-05, + "loss": 1.1929, "step": 5033 }, { - "epoch": 1.51, - "grad_norm": 17.106098175048828, - "learning_rate": 9.910794828104641e-06, - "loss": 1.4173, + "epoch": 0.63, + "grad_norm": 24.340532302856445, + "learning_rate": 1.578881311969209e-05, + "loss": 2.5925, "step": 5034 }, { - "epoch": 1.51, - "grad_norm": 25.03550148010254, - "learning_rate": 9.908790217500251e-06, - "loss": 1.5695, + "epoch": 0.63, + "grad_norm": 21.014192581176758, + "learning_rate": 1.5787976404635404e-05, + "loss": 2.3361, "step": 5035 }, { - "epoch": 1.51, - "grad_norm": 14.62669563293457, - "learning_rate": 9.906785606895861e-06, - "loss": 1.6762, + "epoch": 0.63, + "grad_norm": 11.501837730407715, + "learning_rate": 1.5787139689578714e-05, + "loss": 1.1706, "step": 5036 }, { - "epoch": 1.51, - "grad_norm": 98.68997192382812, - "learning_rate": 9.904780996291471e-06, - "loss": 3.2678, + "epoch": 0.63, + "grad_norm": 15.21597957611084, + "learning_rate": 1.5786302974522028e-05, + "loss": 2.0659, "step": 5037 }, { - "epoch": 1.51, - "grad_norm": 10.68436050415039, - "learning_rate": 9.902776385687081e-06, - "loss": 1.6819, + "epoch": 0.63, + "grad_norm": 5.9182047843933105, + "learning_rate": 1.578546625946534e-05, + "loss": 0.4091, "step": 5038 }, { - "epoch": 1.52, - "grad_norm": 17.10354232788086, - "learning_rate": 9.900771775082692e-06, - "loss": 1.5759, + "epoch": 0.63, + "grad_norm": 14.510945320129395, + "learning_rate": 1.5784629544408655e-05, + "loss": 2.0028, "step": 5039 }, { - "epoch": 1.52, - "grad_norm": 24.183143615722656, - "learning_rate": 9.898767164478302e-06, - "loss": 1.51, - "step": 5040 - }, - { - "epoch": 1.52, - "eval_loss": 0.1983981430530548, - "eval_runtime": 43.7634, - "eval_samples_per_second": 33.795, - "eval_steps_per_second": 33.795, + "epoch": 0.63, + "grad_norm": 8.41043758392334, + "learning_rate": 1.5783792829351966e-05, + "loss": 0.6456, "step": 5040 }, { - "epoch": 1.52, - "grad_norm": 11.218318939208984, - "learning_rate": 9.896762553873912e-06, - "loss": 2.6052, + "epoch": 0.63, + "grad_norm": 14.400217056274414, + "learning_rate": 1.578295611429528e-05, + "loss": 2.1407, "step": 5041 }, { - "epoch": 1.52, - "grad_norm": 28.35023307800293, - "learning_rate": 9.89475794326952e-06, - "loss": 1.5494, + "epoch": 0.63, + "grad_norm": 9.780742645263672, + "learning_rate": 1.5782119399238593e-05, + "loss": 1.5189, "step": 5042 }, { - "epoch": 1.52, - "grad_norm": 39.44877624511719, - "learning_rate": 9.89275333266513e-06, - "loss": 2.2204, + "epoch": 0.63, + "grad_norm": 4.016148567199707, + "learning_rate": 1.5781282684181903e-05, + "loss": 0.2529, "step": 5043 }, { - "epoch": 1.52, - "grad_norm": 16.99179458618164, - "learning_rate": 9.89074872206074e-06, - "loss": 1.9028, + "epoch": 0.63, + "grad_norm": 15.589259147644043, + "learning_rate": 1.5780445969125217e-05, + "loss": 2.1329, "step": 5044 }, { - "epoch": 1.52, - "grad_norm": 22.564708709716797, - "learning_rate": 9.88874411145635e-06, - "loss": 2.0602, + "epoch": 0.63, + "grad_norm": 12.96193790435791, + "learning_rate": 1.5779609254068527e-05, + "loss": 2.2834, "step": 5045 }, { - "epoch": 1.52, - "grad_norm": 10.778221130371094, - "learning_rate": 9.88673950085196e-06, - "loss": 1.071, + "epoch": 0.63, + "grad_norm": 30.995908737182617, + "learning_rate": 1.577877253901184e-05, + "loss": 1.8793, "step": 5046 }, { - "epoch": 1.52, - "grad_norm": 25.14896583557129, - "learning_rate": 9.88473489024757e-06, - "loss": 1.4653, + "epoch": 0.63, + "grad_norm": 15.587225914001465, + "learning_rate": 1.5777935823955154e-05, + "loss": 2.889, "step": 5047 }, { - "epoch": 1.52, - "grad_norm": 24.445661544799805, - "learning_rate": 9.88273027964318e-06, - "loss": 1.2668, + "epoch": 0.63, + "grad_norm": 10.563833236694336, + "learning_rate": 1.5777099108898465e-05, + "loss": 2.2132, "step": 5048 }, { - "epoch": 1.52, - "grad_norm": 19.382781982421875, - "learning_rate": 9.88072566903879e-06, - "loss": 1.6534, + "epoch": 0.63, + "grad_norm": 9.568826675415039, + "learning_rate": 1.5776262393841778e-05, + "loss": 1.0254, "step": 5049 }, { - "epoch": 1.52, - "grad_norm": 48.97084426879883, - "learning_rate": 9.878721058434399e-06, - "loss": 1.6285, + "epoch": 0.63, + "grad_norm": 15.084373474121094, + "learning_rate": 1.577542567878509e-05, + "loss": 1.1353, "step": 5050 }, { - "epoch": 1.52, - "grad_norm": 23.401174545288086, - "learning_rate": 9.87671644783001e-06, - "loss": 2.24, + "epoch": 0.63, + "grad_norm": 12.499186515808105, + "learning_rate": 1.5774588963728402e-05, + "loss": 1.9012, "step": 5051 }, { - "epoch": 1.52, - "grad_norm": 7.404333591461182, - "learning_rate": 9.874711837225621e-06, - "loss": 0.8004, + "epoch": 0.63, + "grad_norm": 12.14394474029541, + "learning_rate": 1.5773752248671716e-05, + "loss": 2.1066, "step": 5052 }, { - "epoch": 1.52, - "grad_norm": 14.808720588684082, - "learning_rate": 9.87270722662123e-06, - "loss": 1.193, + "epoch": 0.63, + "grad_norm": 7.123768329620361, + "learning_rate": 1.577291553361503e-05, + "loss": 1.5745, "step": 5053 }, { - "epoch": 1.52, - "grad_norm": 11.823633193969727, - "learning_rate": 9.87070261601684e-06, - "loss": 1.1512, + "epoch": 0.63, + "grad_norm": 11.73341178894043, + "learning_rate": 1.577207881855834e-05, + "loss": 1.5408, "step": 5054 }, { - "epoch": 1.52, - "grad_norm": 20.415996551513672, - "learning_rate": 9.86869800541245e-06, - "loss": 1.7406, + "epoch": 0.63, + "grad_norm": 15.225319862365723, + "learning_rate": 1.5771242103501653e-05, + "loss": 0.7718, "step": 5055 }, { - "epoch": 1.52, - "grad_norm": 584.0230712890625, - "learning_rate": 9.86669339480806e-06, - "loss": 1.3769, + "epoch": 0.63, + "grad_norm": 16.591278076171875, + "learning_rate": 1.5770405388444967e-05, + "loss": 2.4582, "step": 5056 }, { - "epoch": 1.52, - "grad_norm": 14.067730903625488, - "learning_rate": 9.864688784203668e-06, - "loss": 1.1319, + "epoch": 0.63, + "grad_norm": 34.30024719238281, + "learning_rate": 1.5769568673388277e-05, + "loss": 3.0588, "step": 5057 }, { - "epoch": 1.52, - "grad_norm": 27.009614944458008, - "learning_rate": 9.86268417359928e-06, - "loss": 1.6301, + "epoch": 0.63, + "grad_norm": 14.780279159545898, + "learning_rate": 1.576873195833159e-05, + "loss": 1.0337, "step": 5058 }, { - "epoch": 1.52, - "grad_norm": 12.77624225616455, - "learning_rate": 9.86067956299489e-06, - "loss": 1.4832, + "epoch": 0.63, + "grad_norm": 16.732141494750977, + "learning_rate": 1.5767895243274905e-05, + "loss": 2.9683, "step": 5059 }, { - "epoch": 1.52, - "grad_norm": 14.48194408416748, - "learning_rate": 9.858674952390498e-06, - "loss": 1.3063, + "epoch": 0.64, + "grad_norm": 18.878597259521484, + "learning_rate": 1.5767058528218218e-05, + "loss": 3.4954, "step": 5060 }, { - "epoch": 1.52, - "grad_norm": 23.552724838256836, - "learning_rate": 9.856670341786108e-06, - "loss": 1.726, + "epoch": 0.64, + "grad_norm": 22.96034812927246, + "learning_rate": 1.576622181316153e-05, + "loss": 1.2982, "step": 5061 }, { - "epoch": 1.52, - "grad_norm": 35.44766616821289, - "learning_rate": 9.854665731181718e-06, - "loss": 1.7173, + "epoch": 0.64, + "grad_norm": 19.98801040649414, + "learning_rate": 1.5765385098104842e-05, + "loss": 2.2953, "step": 5062 }, { - "epoch": 1.52, - "grad_norm": 17.609161376953125, - "learning_rate": 9.852661120577328e-06, - "loss": 1.2193, + "epoch": 0.64, + "grad_norm": 25.850393295288086, + "learning_rate": 1.5764548383048156e-05, + "loss": 3.2962, "step": 5063 }, { - "epoch": 1.52, - "grad_norm": 13.460494041442871, - "learning_rate": 9.850656509972939e-06, - "loss": 1.2271, + "epoch": 0.64, + "grad_norm": 15.799878120422363, + "learning_rate": 1.5763711667991466e-05, + "loss": 1.8083, "step": 5064 }, { - "epoch": 1.52, - "grad_norm": 19.18136215209961, - "learning_rate": 9.848651899368549e-06, - "loss": 1.0003, + "epoch": 0.64, + "grad_norm": 9.549363136291504, + "learning_rate": 1.576287495293478e-05, + "loss": 1.6983, "step": 5065 }, { - "epoch": 1.52, - "grad_norm": 33.11497497558594, - "learning_rate": 9.846647288764159e-06, - "loss": 1.9836, + "epoch": 0.64, + "grad_norm": 16.10846519470215, + "learning_rate": 1.5762038237878093e-05, + "loss": 0.9541, "step": 5066 }, { - "epoch": 1.52, - "grad_norm": 17.282258987426758, - "learning_rate": 9.844642678159769e-06, - "loss": 1.7075, + "epoch": 0.64, + "grad_norm": 13.372329711914062, + "learning_rate": 1.5761201522821407e-05, + "loss": 0.8804, "step": 5067 }, { - "epoch": 1.52, - "grad_norm": 12.306963920593262, - "learning_rate": 9.842638067555377e-06, - "loss": 0.9739, + "epoch": 0.64, + "grad_norm": 16.088727951049805, + "learning_rate": 1.5760364807764717e-05, + "loss": 1.3293, "step": 5068 }, { - "epoch": 1.52, - "grad_norm": 18.901031494140625, - "learning_rate": 9.840633456950987e-06, - "loss": 1.4667, + "epoch": 0.64, + "grad_norm": 42.135169982910156, + "learning_rate": 1.575952809270803e-05, + "loss": 2.4092, "step": 5069 }, { - "epoch": 1.52, - "grad_norm": 22.379085540771484, - "learning_rate": 9.838628846346599e-06, - "loss": 1.1675, + "epoch": 0.64, + "grad_norm": 19.614219665527344, + "learning_rate": 1.575869137765134e-05, + "loss": 3.0553, "step": 5070 }, { - "epoch": 1.52, - "grad_norm": 14.647932052612305, - "learning_rate": 9.836624235742207e-06, - "loss": 1.809, + "epoch": 0.64, + "grad_norm": 10.226930618286133, + "learning_rate": 1.5757854662594655e-05, + "loss": 1.9401, "step": 5071 }, { - "epoch": 1.52, - "grad_norm": 8.400578498840332, - "learning_rate": 9.834619625137818e-06, - "loss": 1.8459, + "epoch": 0.64, + "grad_norm": 16.097923278808594, + "learning_rate": 1.575701794753797e-05, + "loss": 1.559, "step": 5072 }, { - "epoch": 1.53, - "grad_norm": 14.352058410644531, - "learning_rate": 9.832615014533428e-06, - "loss": 1.6325, + "epoch": 0.64, + "grad_norm": 22.118406295776367, + "learning_rate": 1.575618123248128e-05, + "loss": 1.5106, "step": 5073 }, { - "epoch": 1.53, - "grad_norm": 38.508724212646484, - "learning_rate": 9.830610403929038e-06, - "loss": 1.4109, + "epoch": 0.64, + "grad_norm": 11.076290130615234, + "learning_rate": 1.5755344517424592e-05, + "loss": 1.292, "step": 5074 }, { - "epoch": 1.53, - "grad_norm": 24.62671661376953, - "learning_rate": 9.828605793324648e-06, - "loss": 2.3143, + "epoch": 0.64, + "grad_norm": 7.768183708190918, + "learning_rate": 1.5754507802367903e-05, + "loss": 2.8704, "step": 5075 }, { - "epoch": 1.53, - "grad_norm": 32.014671325683594, - "learning_rate": 9.826601182720258e-06, - "loss": 2.2224, + "epoch": 0.64, + "grad_norm": 15.45603084564209, + "learning_rate": 1.5753671087311216e-05, + "loss": 2.2323, "step": 5076 }, { - "epoch": 1.53, - "grad_norm": 16.45624351501465, - "learning_rate": 9.824596572115868e-06, - "loss": 1.3855, + "epoch": 0.64, + "grad_norm": 16.193885803222656, + "learning_rate": 1.575283437225453e-05, + "loss": 2.3428, "step": 5077 }, { - "epoch": 1.53, - "grad_norm": 24.989152908325195, - "learning_rate": 9.822591961511478e-06, - "loss": 1.8901, + "epoch": 0.64, + "grad_norm": 15.888275146484375, + "learning_rate": 1.575199765719784e-05, + "loss": 1.1974, "step": 5078 }, { - "epoch": 1.53, - "grad_norm": 16.87116241455078, - "learning_rate": 9.820587350907086e-06, - "loss": 2.1003, + "epoch": 0.64, + "grad_norm": 17.932859420776367, + "learning_rate": 1.5751160942141154e-05, + "loss": 1.7937, "step": 5079 }, { - "epoch": 1.53, - "grad_norm": 19.88626480102539, - "learning_rate": 9.818582740302697e-06, - "loss": 0.9223, + "epoch": 0.64, + "grad_norm": 32.422584533691406, + "learning_rate": 1.5750324227084468e-05, + "loss": 2.737, "step": 5080 }, { - "epoch": 1.53, - "grad_norm": 29.52408218383789, - "learning_rate": 9.816578129698307e-06, - "loss": 2.1227, + "epoch": 0.64, + "grad_norm": 5.988121509552002, + "learning_rate": 1.574948751202778e-05, + "loss": 0.3623, "step": 5081 }, { - "epoch": 1.53, - "grad_norm": 6.786428451538086, - "learning_rate": 9.814573519093917e-06, - "loss": 1.1044, + "epoch": 0.64, + "grad_norm": 5.981936931610107, + "learning_rate": 1.574865079697109e-05, + "loss": 0.7407, "step": 5082 }, { - "epoch": 1.53, - "grad_norm": 11.187318801879883, - "learning_rate": 9.812568908489527e-06, - "loss": 1.0142, + "epoch": 0.64, + "grad_norm": 26.017316818237305, + "learning_rate": 1.5747814081914405e-05, + "loss": 2.8178, "step": 5083 }, { - "epoch": 1.53, - "grad_norm": 9.082967758178711, - "learning_rate": 9.810564297885137e-06, - "loss": 0.8634, + "epoch": 0.64, + "grad_norm": 10.025772094726562, + "learning_rate": 1.574697736685772e-05, + "loss": 3.3694, "step": 5084 }, { - "epoch": 1.53, - "grad_norm": 38.15378189086914, - "learning_rate": 9.808559687280747e-06, - "loss": 1.4613, + "epoch": 0.64, + "grad_norm": 17.240604400634766, + "learning_rate": 1.574614065180103e-05, + "loss": 1.836, "step": 5085 }, { - "epoch": 1.53, - "grad_norm": 11.708313941955566, - "learning_rate": 9.806555076676357e-06, - "loss": 2.1042, + "epoch": 0.64, + "grad_norm": 10.621012687683105, + "learning_rate": 1.5745303936744343e-05, + "loss": 1.0921, "step": 5086 }, { - "epoch": 1.53, - "grad_norm": 15.158965110778809, - "learning_rate": 9.804550466071965e-06, - "loss": 1.2576, + "epoch": 0.64, + "grad_norm": 4.24019718170166, + "learning_rate": 1.5744467221687656e-05, + "loss": 0.2206, "step": 5087 }, { - "epoch": 1.53, - "grad_norm": 15.897801399230957, - "learning_rate": 9.802545855467577e-06, - "loss": 1.5028, + "epoch": 0.64, + "grad_norm": 25.50225067138672, + "learning_rate": 1.574363050663097e-05, + "loss": 1.9584, "step": 5088 }, { - "epoch": 1.53, - "grad_norm": 58.225669860839844, - "learning_rate": 9.800541244863187e-06, - "loss": 1.0376, + "epoch": 0.64, + "grad_norm": 8.65950870513916, + "learning_rate": 1.574279379157428e-05, + "loss": 0.7873, "step": 5089 }, { - "epoch": 1.53, - "grad_norm": 26.382015228271484, - "learning_rate": 9.798536634258796e-06, - "loss": 1.2807, + "epoch": 0.64, + "grad_norm": 16.24616813659668, + "learning_rate": 1.5741957076517594e-05, + "loss": 1.4247, "step": 5090 }, { - "epoch": 1.53, - "grad_norm": 11.506767272949219, - "learning_rate": 9.796532023654406e-06, - "loss": 1.4462, + "epoch": 0.64, + "grad_norm": 15.66484260559082, + "learning_rate": 1.5741120361460907e-05, + "loss": 2.0374, "step": 5091 }, { - "epoch": 1.53, - "grad_norm": 36.81036376953125, - "learning_rate": 9.794527413050016e-06, - "loss": 2.7945, + "epoch": 0.64, + "grad_norm": 14.813385963439941, + "learning_rate": 1.5740283646404218e-05, + "loss": 2.5328, "step": 5092 }, { - "epoch": 1.53, - "grad_norm": 13.376030921936035, - "learning_rate": 9.792522802445626e-06, - "loss": 1.562, + "epoch": 0.64, + "grad_norm": 10.920860290527344, + "learning_rate": 1.573944693134753e-05, + "loss": 0.7747, "step": 5093 }, { - "epoch": 1.53, - "grad_norm": 17.786754608154297, - "learning_rate": 9.790518191841234e-06, - "loss": 2.5748, + "epoch": 0.64, + "grad_norm": 25.48909568786621, + "learning_rate": 1.5738610216290845e-05, + "loss": 1.3409, "step": 5094 }, { - "epoch": 1.53, - "grad_norm": 22.214862823486328, - "learning_rate": 9.788513581236846e-06, - "loss": 1.8133, + "epoch": 0.64, + "grad_norm": 82.24311828613281, + "learning_rate": 1.573777350123416e-05, + "loss": 2.8238, "step": 5095 }, { - "epoch": 1.53, - "grad_norm": 7.965408802032471, - "learning_rate": 9.786508970632456e-06, - "loss": 1.2572, + "epoch": 0.64, + "grad_norm": 14.695939064025879, + "learning_rate": 1.573693678617747e-05, + "loss": 1.3314, "step": 5096 }, { - "epoch": 1.53, - "grad_norm": 16.189109802246094, - "learning_rate": 9.784504360028065e-06, - "loss": 1.1491, + "epoch": 0.64, + "grad_norm": 14.964838027954102, + "learning_rate": 1.5736100071120783e-05, + "loss": 1.2088, "step": 5097 }, { - "epoch": 1.53, - "grad_norm": 21.910438537597656, - "learning_rate": 9.782499749423675e-06, - "loss": 1.8276, + "epoch": 0.64, + "grad_norm": 11.288729667663574, + "learning_rate": 1.5735263356064093e-05, + "loss": 1.1307, "step": 5098 }, { - "epoch": 1.53, - "grad_norm": 29.36257553100586, - "learning_rate": 9.780495138819285e-06, - "loss": 1.69, + "epoch": 0.64, + "grad_norm": 20.675094604492188, + "learning_rate": 1.5734426641007407e-05, + "loss": 3.8026, "step": 5099 }, { - "epoch": 1.53, - "grad_norm": 15.409547805786133, - "learning_rate": 9.778490528214895e-06, - "loss": 1.83, + "epoch": 0.64, + "grad_norm": 7.097488880157471, + "learning_rate": 1.573358992595072e-05, + "loss": 0.9689, "step": 5100 }, { - "epoch": 1.53, - "grad_norm": 12.414823532104492, - "learning_rate": 9.776485917610505e-06, - "loss": 1.3998, + "epoch": 0.64, + "grad_norm": 5.893520832061768, + "learning_rate": 1.573275321089403e-05, + "loss": 0.6925, "step": 5101 }, { - "epoch": 1.53, - "grad_norm": 25.707353591918945, - "learning_rate": 9.774481307006115e-06, - "loss": 2.12, + "epoch": 0.64, + "grad_norm": 19.848941802978516, + "learning_rate": 1.5731916495837344e-05, + "loss": 3.517, "step": 5102 }, { - "epoch": 1.53, - "grad_norm": 43.882991790771484, - "learning_rate": 9.772476696401725e-06, - "loss": 1.7967, + "epoch": 0.64, + "grad_norm": 13.46828556060791, + "learning_rate": 1.5731079780780654e-05, + "loss": 0.8219, "step": 5103 }, { - "epoch": 1.53, - "grad_norm": 12.999314308166504, - "learning_rate": 9.770472085797335e-06, - "loss": 1.1704, + "epoch": 0.64, + "grad_norm": 41.50696563720703, + "learning_rate": 1.5730243065723968e-05, + "loss": 1.7747, "step": 5104 }, { - "epoch": 1.53, - "grad_norm": 30.975318908691406, - "learning_rate": 9.768467475192944e-06, - "loss": 2.4516, + "epoch": 0.64, + "grad_norm": 9.662480354309082, + "learning_rate": 1.572940635066728e-05, + "loss": 1.5009, "step": 5105 }, { - "epoch": 1.54, - "grad_norm": 10.727395057678223, - "learning_rate": 9.766462864588554e-06, - "loss": 0.5141, + "epoch": 0.64, + "grad_norm": 24.629592895507812, + "learning_rate": 1.5728569635610592e-05, + "loss": 1.3831, "step": 5106 }, { - "epoch": 1.54, - "grad_norm": 16.37921142578125, - "learning_rate": 9.764458253984165e-06, - "loss": 1.9043, + "epoch": 0.64, + "grad_norm": 34.82011413574219, + "learning_rate": 1.5727732920553906e-05, + "loss": 1.8308, "step": 5107 }, { - "epoch": 1.54, - "grad_norm": 11.777350425720215, - "learning_rate": 9.762453643379774e-06, - "loss": 1.3563, + "epoch": 0.64, + "grad_norm": 8.111581802368164, + "learning_rate": 1.572689620549722e-05, + "loss": 0.6872, "step": 5108 }, { - "epoch": 1.54, - "grad_norm": 10.583935737609863, - "learning_rate": 9.760449032775384e-06, - "loss": 1.3274, + "epoch": 0.64, + "grad_norm": 13.667208671569824, + "learning_rate": 1.5726059490440533e-05, + "loss": 2.047, "step": 5109 }, { - "epoch": 1.54, - "grad_norm": 24.881576538085938, - "learning_rate": 9.758444422170994e-06, - "loss": 1.0478, + "epoch": 0.64, + "grad_norm": 8.9616060256958, + "learning_rate": 1.5725222775383843e-05, + "loss": 0.8753, "step": 5110 }, { - "epoch": 1.54, - "grad_norm": 24.082473754882812, - "learning_rate": 9.756439811566604e-06, - "loss": 0.8794, + "epoch": 0.64, + "grad_norm": 43.2788200378418, + "learning_rate": 1.5724386060327157e-05, + "loss": 1.0503, "step": 5111 }, { - "epoch": 1.54, - "grad_norm": 15.496257781982422, - "learning_rate": 9.754435200962214e-06, - "loss": 0.8897, + "epoch": 0.64, + "grad_norm": 15.115678787231445, + "learning_rate": 1.572354934527047e-05, + "loss": 2.3047, "step": 5112 }, { - "epoch": 1.54, - "grad_norm": 27.82330894470215, - "learning_rate": 9.752430590357824e-06, - "loss": 2.5503, + "epoch": 0.64, + "grad_norm": 11.667755126953125, + "learning_rate": 1.572271263021378e-05, + "loss": 0.7404, "step": 5113 }, { - "epoch": 1.54, - "grad_norm": 14.794507026672363, - "learning_rate": 9.750425979753434e-06, - "loss": 2.2607, + "epoch": 0.64, + "grad_norm": 16.43598175048828, + "learning_rate": 1.5721875915157094e-05, + "loss": 2.3748, "step": 5114 }, { - "epoch": 1.54, - "grad_norm": 25.15207862854004, - "learning_rate": 9.748421369149044e-06, - "loss": 1.2937, + "epoch": 0.64, + "grad_norm": 6.232161045074463, + "learning_rate": 1.5721039200100408e-05, + "loss": 0.5916, "step": 5115 }, { - "epoch": 1.54, - "grad_norm": 28.609935760498047, - "learning_rate": 9.746416758544653e-06, - "loss": 1.8648, + "epoch": 0.64, + "grad_norm": 20.68305015563965, + "learning_rate": 1.572020248504372e-05, + "loss": 3.2559, "step": 5116 }, { - "epoch": 1.54, - "grad_norm": 19.665058135986328, - "learning_rate": 9.744412147940263e-06, - "loss": 1.3485, + "epoch": 0.64, + "grad_norm": 32.31875991821289, + "learning_rate": 1.5719365769987032e-05, + "loss": 1.1103, "step": 5117 }, { - "epoch": 1.54, - "grad_norm": 16.362789154052734, - "learning_rate": 9.742407537335873e-06, - "loss": 1.6568, + "epoch": 0.64, + "grad_norm": 7.220429420471191, + "learning_rate": 1.5718529054930346e-05, + "loss": 0.9726, "step": 5118 }, { - "epoch": 1.54, - "grad_norm": 43.66654586791992, - "learning_rate": 9.740402926731483e-06, - "loss": 1.5276, + "epoch": 0.64, + "grad_norm": 15.526352882385254, + "learning_rate": 1.571769233987366e-05, + "loss": 1.332, "step": 5119 }, { - "epoch": 1.54, - "grad_norm": 11.123966217041016, - "learning_rate": 9.738398316127093e-06, - "loss": 1.4542, + "epoch": 0.64, + "grad_norm": 12.444489479064941, + "learning_rate": 1.571685562481697e-05, + "loss": 2.893, "step": 5120 }, { - "epoch": 1.54, - "grad_norm": 10.916337966918945, - "learning_rate": 9.736393705522703e-06, - "loss": 1.7105, + "epoch": 0.64, + "grad_norm": 16.54176902770996, + "learning_rate": 1.5716018909760283e-05, + "loss": 2.7116, "step": 5121 }, { - "epoch": 1.54, - "grad_norm": 38.372467041015625, - "learning_rate": 9.734389094918313e-06, - "loss": 2.5102, + "epoch": 0.64, + "grad_norm": 8.2006254196167, + "learning_rate": 1.5715182194703597e-05, + "loss": 0.7196, "step": 5122 }, { - "epoch": 1.54, - "grad_norm": 12.8078031539917, - "learning_rate": 9.732384484313923e-06, - "loss": 1.573, + "epoch": 0.64, + "grad_norm": 9.685171127319336, + "learning_rate": 1.5714345479646907e-05, + "loss": 1.8595, "step": 5123 }, { - "epoch": 1.54, - "grad_norm": 22.485212326049805, - "learning_rate": 9.730379873709532e-06, - "loss": 1.986, + "epoch": 0.64, + "grad_norm": 19.2276554107666, + "learning_rate": 1.571350876459022e-05, + "loss": 2.0104, "step": 5124 }, { - "epoch": 1.54, - "grad_norm": 16.489240646362305, - "learning_rate": 9.728375263105144e-06, - "loss": 1.5365, + "epoch": 0.64, + "grad_norm": 12.441829681396484, + "learning_rate": 1.5712672049533534e-05, + "loss": 1.353, "step": 5125 }, { - "epoch": 1.54, - "grad_norm": 48.157371520996094, - "learning_rate": 9.726370652500754e-06, - "loss": 2.6245, + "epoch": 0.64, + "grad_norm": 27.83458137512207, + "learning_rate": 1.5711835334476845e-05, + "loss": 1.2713, "step": 5126 }, { - "epoch": 1.54, - "grad_norm": 19.086997985839844, - "learning_rate": 9.724366041896362e-06, - "loss": 2.8709, + "epoch": 0.64, + "grad_norm": 23.49789047241211, + "learning_rate": 1.5710998619420158e-05, + "loss": 1.3408, "step": 5127 }, { - "epoch": 1.54, - "grad_norm": 16.59282112121582, - "learning_rate": 9.722361431291972e-06, - "loss": 1.6568, + "epoch": 0.64, + "grad_norm": 11.275956153869629, + "learning_rate": 1.571016190436347e-05, + "loss": 1.6182, "step": 5128 }, { - "epoch": 1.54, - "grad_norm": 77.64139556884766, - "learning_rate": 9.720356820687582e-06, - "loss": 2.4053, + "epoch": 0.64, + "grad_norm": 8.767870903015137, + "learning_rate": 1.5709325189306782e-05, + "loss": 1.2063, "step": 5129 }, { - "epoch": 1.54, - "grad_norm": 32.31578063964844, - "learning_rate": 9.718352210083192e-06, - "loss": 2.4112, + "epoch": 0.64, + "grad_norm": 6.207084655761719, + "learning_rate": 1.5708488474250096e-05, + "loss": 0.3131, "step": 5130 }, { - "epoch": 1.54, - "grad_norm": 16.772424697875977, - "learning_rate": 9.716347599478802e-06, - "loss": 1.8791, + "epoch": 0.64, + "grad_norm": 24.9721622467041, + "learning_rate": 1.5707651759193406e-05, + "loss": 1.3211, "step": 5131 }, { - "epoch": 1.54, - "grad_norm": 18.102991104125977, - "learning_rate": 9.714342988874412e-06, - "loss": 1.3216, + "epoch": 0.64, + "grad_norm": 23.742414474487305, + "learning_rate": 1.570681504413672e-05, + "loss": 1.5197, "step": 5132 }, { - "epoch": 1.54, - "grad_norm": 11.130638122558594, - "learning_rate": 9.712338378270023e-06, - "loss": 1.4781, + "epoch": 0.64, + "grad_norm": 15.931519508361816, + "learning_rate": 1.5705978329080033e-05, + "loss": 2.3792, "step": 5133 }, { - "epoch": 1.54, - "grad_norm": 23.45066261291504, - "learning_rate": 9.710333767665631e-06, - "loss": 2.065, + "epoch": 0.64, + "grad_norm": 8.295441627502441, + "learning_rate": 1.5705141614023344e-05, + "loss": 0.5454, "step": 5134 }, { - "epoch": 1.54, - "grad_norm": 24.560604095458984, - "learning_rate": 9.708329157061241e-06, - "loss": 1.383, + "epoch": 0.64, + "grad_norm": 63.2232551574707, + "learning_rate": 1.5704304898966657e-05, + "loss": 2.3334, "step": 5135 }, { - "epoch": 1.54, - "grad_norm": 12.12047290802002, - "learning_rate": 9.706324546456851e-06, - "loss": 1.5988, + "epoch": 0.64, + "grad_norm": 14.293220520019531, + "learning_rate": 1.570346818390997e-05, + "loss": 1.2896, "step": 5136 }, { - "epoch": 1.54, - "grad_norm": 10.139289855957031, - "learning_rate": 9.704319935852461e-06, - "loss": 0.8568, + "epoch": 0.64, + "grad_norm": 12.947236061096191, + "learning_rate": 1.5702631468853285e-05, + "loss": 1.4245, "step": 5137 }, { - "epoch": 1.54, - "grad_norm": 10.667194366455078, - "learning_rate": 9.702315325248071e-06, - "loss": 1.2382, + "epoch": 0.64, + "grad_norm": 6.499480724334717, + "learning_rate": 1.5701794753796595e-05, + "loss": 0.6759, "step": 5138 }, { - "epoch": 1.55, - "grad_norm": 14.842784881591797, - "learning_rate": 9.700310714643681e-06, - "loss": 1.474, + "epoch": 0.64, + "grad_norm": 22.969093322753906, + "learning_rate": 1.570095803873991e-05, + "loss": 1.924, "step": 5139 }, { - "epoch": 1.55, - "grad_norm": 28.56570816040039, - "learning_rate": 9.698306104039291e-06, - "loss": 1.9052, + "epoch": 0.65, + "grad_norm": 14.942154884338379, + "learning_rate": 1.5700121323683222e-05, + "loss": 1.3929, "step": 5140 }, { - "epoch": 1.55, - "grad_norm": 14.900588035583496, - "learning_rate": 9.696301493434901e-06, - "loss": 1.2868, + "epoch": 0.65, + "grad_norm": 8.734114646911621, + "learning_rate": 1.5699284608626532e-05, + "loss": 2.4714, "step": 5141 }, { - "epoch": 1.55, - "grad_norm": 11.330522537231445, - "learning_rate": 9.69429688283051e-06, - "loss": 1.1221, + "epoch": 0.65, + "grad_norm": 16.237415313720703, + "learning_rate": 1.5698447893569846e-05, + "loss": 1.1233, "step": 5142 }, { - "epoch": 1.55, - "grad_norm": 10.978798866271973, - "learning_rate": 9.69229227222612e-06, - "loss": 1.2924, + "epoch": 0.65, + "grad_norm": 8.782207489013672, + "learning_rate": 1.569761117851316e-05, + "loss": 0.5252, "step": 5143 }, { - "epoch": 1.55, - "grad_norm": 10.9464111328125, - "learning_rate": 9.690287661621732e-06, - "loss": 0.9553, + "epoch": 0.65, + "grad_norm": 31.56356430053711, + "learning_rate": 1.5696774463456473e-05, + "loss": 3.5507, "step": 5144 }, { - "epoch": 1.55, - "grad_norm": 19.434829711914062, - "learning_rate": 9.68828305101734e-06, - "loss": 1.6625, + "epoch": 0.65, + "grad_norm": 33.98700714111328, + "learning_rate": 1.5695937748399784e-05, + "loss": 2.9867, "step": 5145 }, { - "epoch": 1.55, - "grad_norm": 13.196100234985352, - "learning_rate": 9.68627844041295e-06, - "loss": 1.5143, + "epoch": 0.65, + "grad_norm": 19.101417541503906, + "learning_rate": 1.5695101033343097e-05, + "loss": 1.3417, "step": 5146 }, { - "epoch": 1.55, - "grad_norm": 13.529244422912598, - "learning_rate": 9.68427382980856e-06, - "loss": 1.6467, + "epoch": 0.65, + "grad_norm": 7.179376125335693, + "learning_rate": 1.569426431828641e-05, + "loss": 1.3175, "step": 5147 }, { - "epoch": 1.55, - "grad_norm": 37.84720993041992, - "learning_rate": 9.68226921920417e-06, - "loss": 3.0862, + "epoch": 0.65, + "grad_norm": 12.053905487060547, + "learning_rate": 1.569342760322972e-05, + "loss": 1.1468, "step": 5148 }, { - "epoch": 1.55, - "grad_norm": 11.348771095275879, - "learning_rate": 9.68026460859978e-06, - "loss": 1.3557, + "epoch": 0.65, + "grad_norm": 31.480859756469727, + "learning_rate": 1.5692590888173035e-05, + "loss": 2.4152, "step": 5149 }, { - "epoch": 1.55, - "grad_norm": 11.032979965209961, - "learning_rate": 9.67825999799539e-06, - "loss": 0.9113, + "epoch": 0.65, + "grad_norm": 22.707080841064453, + "learning_rate": 1.569175417311635e-05, + "loss": 1.7221, "step": 5150 }, { - "epoch": 1.55, - "grad_norm": 15.761959075927734, - "learning_rate": 9.676255387391e-06, - "loss": 1.7351, + "epoch": 0.65, + "grad_norm": 22.629104614257812, + "learning_rate": 1.569091745805966e-05, + "loss": 2.7781, "step": 5151 }, { - "epoch": 1.55, - "grad_norm": 53.275428771972656, - "learning_rate": 9.67425077678661e-06, - "loss": 2.9133, + "epoch": 0.65, + "grad_norm": 15.60866641998291, + "learning_rate": 1.5690080743002972e-05, + "loss": 1.1647, "step": 5152 }, { - "epoch": 1.55, - "grad_norm": 18.882171630859375, - "learning_rate": 9.672246166182219e-06, - "loss": 2.1493, + "epoch": 0.65, + "grad_norm": 10.045681953430176, + "learning_rate": 1.5689244027946286e-05, + "loss": 0.8843, "step": 5153 }, { - "epoch": 1.55, - "grad_norm": 13.768794059753418, - "learning_rate": 9.67024155557783e-06, - "loss": 1.1085, + "epoch": 0.65, + "grad_norm": 9.390864372253418, + "learning_rate": 1.5688407312889596e-05, + "loss": 1.3782, "step": 5154 }, { - "epoch": 1.55, - "grad_norm": 15.78554916381836, - "learning_rate": 9.66823694497344e-06, - "loss": 1.7214, + "epoch": 0.65, + "grad_norm": 15.955741882324219, + "learning_rate": 1.568757059783291e-05, + "loss": 1.3725, "step": 5155 }, { - "epoch": 1.55, - "grad_norm": 18.663970947265625, - "learning_rate": 9.66623233436905e-06, - "loss": 1.5341, + "epoch": 0.65, + "grad_norm": 4.5042724609375, + "learning_rate": 1.568673388277622e-05, + "loss": 1.396, "step": 5156 }, { - "epoch": 1.55, - "grad_norm": 91.2535629272461, - "learning_rate": 9.66422772376466e-06, - "loss": 2.4091, + "epoch": 0.65, + "grad_norm": 7.471918106079102, + "learning_rate": 1.5685897167719534e-05, + "loss": 1.0309, "step": 5157 }, { - "epoch": 1.55, - "grad_norm": 30.0625057220459, - "learning_rate": 9.66222311316027e-06, - "loss": 1.2433, + "epoch": 0.65, + "grad_norm": 5.654822826385498, + "learning_rate": 1.5685060452662847e-05, + "loss": 0.134, "step": 5158 }, { - "epoch": 1.55, - "grad_norm": 10.793164253234863, - "learning_rate": 9.66021850255588e-06, - "loss": 1.3247, + "epoch": 0.65, + "grad_norm": 26.877819061279297, + "learning_rate": 1.5684223737606158e-05, + "loss": 2.7742, "step": 5159 }, { - "epoch": 1.55, - "grad_norm": 14.865962028503418, - "learning_rate": 9.65821389195149e-06, - "loss": 0.7533, - "step": 5160 - }, - { - "epoch": 1.55, - "eval_loss": 0.20735611021518707, - "eval_runtime": 43.7679, - "eval_samples_per_second": 33.792, - "eval_steps_per_second": 33.792, + "epoch": 0.65, + "grad_norm": 7.091485500335693, + "learning_rate": 1.568338702254947e-05, + "loss": 0.4301, "step": 5160 }, { - "epoch": 1.55, - "grad_norm": 18.602514266967773, - "learning_rate": 9.656209281347098e-06, - "loss": 1.5636, + "epoch": 0.65, + "grad_norm": 68.3599624633789, + "learning_rate": 1.5682550307492785e-05, + "loss": 0.8968, "step": 5161 }, { - "epoch": 1.55, - "grad_norm": 14.882292747497559, - "learning_rate": 9.65420467074271e-06, - "loss": 1.1038, + "epoch": 0.65, + "grad_norm": 7.607362747192383, + "learning_rate": 1.5681713592436095e-05, + "loss": 1.3871, "step": 5162 }, { - "epoch": 1.55, - "grad_norm": 56.73546600341797, - "learning_rate": 9.65220006013832e-06, - "loss": 1.9049, + "epoch": 0.65, + "grad_norm": 16.349225997924805, + "learning_rate": 1.568087687737941e-05, + "loss": 1.6503, "step": 5163 }, { - "epoch": 1.55, - "grad_norm": 13.342903137207031, - "learning_rate": 9.650195449533928e-06, - "loss": 1.6425, + "epoch": 0.65, + "grad_norm": 13.676576614379883, + "learning_rate": 1.5680040162322723e-05, + "loss": 1.7194, "step": 5164 }, { - "epoch": 1.55, - "grad_norm": 13.89185905456543, - "learning_rate": 9.648190838929538e-06, - "loss": 1.0887, + "epoch": 0.65, + "grad_norm": 18.912063598632812, + "learning_rate": 1.5679203447266036e-05, + "loss": 2.1749, "step": 5165 }, { - "epoch": 1.55, - "grad_norm": 72.23283386230469, - "learning_rate": 9.646186228325149e-06, - "loss": 2.1655, + "epoch": 0.65, + "grad_norm": 19.127605438232422, + "learning_rate": 1.5678366732209346e-05, + "loss": 1.3759, "step": 5166 }, { - "epoch": 1.55, - "grad_norm": 17.974084854125977, - "learning_rate": 9.644181617720759e-06, - "loss": 2.2678, + "epoch": 0.65, + "grad_norm": 12.958504676818848, + "learning_rate": 1.567753001715266e-05, + "loss": 0.8994, "step": 5167 }, { - "epoch": 1.55, - "grad_norm": 69.75627136230469, - "learning_rate": 9.642177007116369e-06, - "loss": 1.7682, + "epoch": 0.65, + "grad_norm": 16.317556381225586, + "learning_rate": 1.5676693302095974e-05, + "loss": 1.1109, "step": 5168 }, { - "epoch": 1.55, - "grad_norm": 11.242698669433594, - "learning_rate": 9.640172396511979e-06, - "loss": 1.3168, + "epoch": 0.65, + "grad_norm": 11.528014183044434, + "learning_rate": 1.5675856587039284e-05, + "loss": 1.4751, "step": 5169 }, { - "epoch": 1.55, - "grad_norm": 66.39893341064453, - "learning_rate": 9.638167785907589e-06, - "loss": 2.6081, + "epoch": 0.65, + "grad_norm": 3.7973978519439697, + "learning_rate": 1.5675019871982598e-05, + "loss": 0.4032, "step": 5170 }, { - "epoch": 1.55, - "grad_norm": 9.396846771240234, - "learning_rate": 9.636163175303199e-06, - "loss": 0.8918, + "epoch": 0.65, + "grad_norm": 17.930723190307617, + "learning_rate": 1.567418315692591e-05, + "loss": 1.8452, "step": 5171 }, { - "epoch": 1.56, - "grad_norm": 30.329811096191406, - "learning_rate": 9.634158564698807e-06, - "loss": 2.5263, + "epoch": 0.65, + "grad_norm": 21.808406829833984, + "learning_rate": 1.5673346441869225e-05, + "loss": 3.4642, "step": 5172 }, { - "epoch": 1.56, - "grad_norm": 8.505033493041992, - "learning_rate": 9.632153954094417e-06, - "loss": 0.9187, + "epoch": 0.65, + "grad_norm": 12.586520195007324, + "learning_rate": 1.5672509726812535e-05, + "loss": 0.6569, "step": 5173 }, { - "epoch": 1.56, - "grad_norm": 15.874688148498535, - "learning_rate": 9.63014934349003e-06, - "loss": 1.2034, + "epoch": 0.65, + "grad_norm": 12.065384864807129, + "learning_rate": 1.567167301175585e-05, + "loss": 1.2497, "step": 5174 }, { - "epoch": 1.56, - "grad_norm": 12.436075210571289, - "learning_rate": 9.628144732885638e-06, - "loss": 1.2702, + "epoch": 0.65, + "grad_norm": 26.136062622070312, + "learning_rate": 1.5670836296699162e-05, + "loss": 0.9877, "step": 5175 }, { - "epoch": 1.56, - "grad_norm": 16.405839920043945, - "learning_rate": 9.626140122281248e-06, - "loss": 1.2595, + "epoch": 0.65, + "grad_norm": 29.510662078857422, + "learning_rate": 1.5669999581642473e-05, + "loss": 2.4064, "step": 5176 }, { - "epoch": 1.56, - "grad_norm": 6.489515781402588, - "learning_rate": 9.624135511676858e-06, - "loss": 0.699, + "epoch": 0.65, + "grad_norm": 7.213874340057373, + "learning_rate": 1.5669162866585786e-05, + "loss": 0.9525, "step": 5177 }, { - "epoch": 1.56, - "grad_norm": 15.752748489379883, - "learning_rate": 9.622130901072468e-06, - "loss": 1.8403, + "epoch": 0.65, + "grad_norm": 24.652938842773438, + "learning_rate": 1.56683261515291e-05, + "loss": 1.4499, "step": 5178 }, { - "epoch": 1.56, - "grad_norm": 21.755474090576172, - "learning_rate": 9.620126290468076e-06, - "loss": 1.9319, + "epoch": 0.65, + "grad_norm": 9.178464889526367, + "learning_rate": 1.566748943647241e-05, + "loss": 0.7875, "step": 5179 }, { - "epoch": 1.56, - "grad_norm": 14.760236740112305, - "learning_rate": 9.618121679863686e-06, - "loss": 1.3553, + "epoch": 0.65, + "grad_norm": 14.615882873535156, + "learning_rate": 1.5666652721415724e-05, + "loss": 0.9102, "step": 5180 }, { - "epoch": 1.56, - "grad_norm": 20.644493103027344, - "learning_rate": 9.616117069259298e-06, - "loss": 1.2051, + "epoch": 0.65, + "grad_norm": 16.932817459106445, + "learning_rate": 1.5665816006359034e-05, + "loss": 1.8659, "step": 5181 }, { - "epoch": 1.56, - "grad_norm": 18.111047744750977, - "learning_rate": 9.614112458654906e-06, - "loss": 1.1888, + "epoch": 0.65, + "grad_norm": 13.214982032775879, + "learning_rate": 1.5664979291302348e-05, + "loss": 1.4819, "step": 5182 }, { - "epoch": 1.56, - "grad_norm": 9.958390235900879, - "learning_rate": 9.612107848050517e-06, - "loss": 1.2644, + "epoch": 0.65, + "grad_norm": 14.489808082580566, + "learning_rate": 1.566414257624566e-05, + "loss": 2.7115, "step": 5183 }, { - "epoch": 1.56, - "grad_norm": 16.58241081237793, - "learning_rate": 9.610103237446127e-06, - "loss": 0.8966, + "epoch": 0.65, + "grad_norm": 32.23086166381836, + "learning_rate": 1.5663305861188972e-05, + "loss": 2.2479, "step": 5184 }, { - "epoch": 1.56, - "grad_norm": 27.23797607421875, - "learning_rate": 9.608098626841737e-06, - "loss": 2.5713, + "epoch": 0.65, + "grad_norm": 13.670648574829102, + "learning_rate": 1.5662469146132285e-05, + "loss": 1.5881, "step": 5185 }, { - "epoch": 1.56, - "grad_norm": 28.486438751220703, - "learning_rate": 9.606094016237347e-06, - "loss": 2.6203, + "epoch": 0.65, + "grad_norm": 23.302671432495117, + "learning_rate": 1.5661632431075596e-05, + "loss": 2.2353, "step": 5186 }, { - "epoch": 1.56, - "grad_norm": 15.266054153442383, - "learning_rate": 9.604089405632957e-06, - "loss": 1.8284, + "epoch": 0.65, + "grad_norm": 22.597736358642578, + "learning_rate": 1.566079571601891e-05, + "loss": 2.3094, "step": 5187 }, { - "epoch": 1.56, - "grad_norm": 27.08075714111328, - "learning_rate": 9.602084795028567e-06, - "loss": 1.749, + "epoch": 0.65, + "grad_norm": 12.448516845703125, + "learning_rate": 1.5659959000962223e-05, + "loss": 2.5301, "step": 5188 }, { - "epoch": 1.56, - "grad_norm": 15.239933013916016, - "learning_rate": 9.600080184424177e-06, - "loss": 1.4726, + "epoch": 0.65, + "grad_norm": 6.709007740020752, + "learning_rate": 1.5659122285905537e-05, + "loss": 0.5511, "step": 5189 }, { - "epoch": 1.56, - "grad_norm": 29.341596603393555, - "learning_rate": 9.598075573819785e-06, - "loss": 1.697, + "epoch": 0.65, + "grad_norm": 10.571820259094238, + "learning_rate": 1.5658285570848847e-05, + "loss": 1.4207, "step": 5190 }, { - "epoch": 1.56, - "grad_norm": 17.9268856048584, - "learning_rate": 9.596070963215396e-06, - "loss": 1.7262, + "epoch": 0.65, + "grad_norm": 14.686131477355957, + "learning_rate": 1.565744885579216e-05, + "loss": 0.6408, "step": 5191 }, { - "epoch": 1.56, - "grad_norm": 18.277809143066406, - "learning_rate": 9.594066352611006e-06, - "loss": 2.4779, + "epoch": 0.65, + "grad_norm": 8.774033546447754, + "learning_rate": 1.5656612140735474e-05, + "loss": 1.8296, "step": 5192 }, { - "epoch": 1.56, - "grad_norm": 24.198942184448242, - "learning_rate": 9.592061742006616e-06, - "loss": 1.2228, + "epoch": 0.65, + "grad_norm": 13.244007110595703, + "learning_rate": 1.5655775425678784e-05, + "loss": 2.2862, "step": 5193 }, { - "epoch": 1.56, - "grad_norm": 10.138649940490723, - "learning_rate": 9.590057131402226e-06, - "loss": 2.3378, + "epoch": 0.65, + "grad_norm": 4.894765853881836, + "learning_rate": 1.5654938710622098e-05, + "loss": 0.178, "step": 5194 }, { - "epoch": 1.56, - "grad_norm": 15.209259986877441, - "learning_rate": 9.588052520797836e-06, - "loss": 1.8108, + "epoch": 0.65, + "grad_norm": 4.754327774047852, + "learning_rate": 1.5654101995565412e-05, + "loss": 0.6165, "step": 5195 }, { - "epoch": 1.56, - "grad_norm": 22.023107528686523, - "learning_rate": 9.586047910193446e-06, - "loss": 1.5428, + "epoch": 0.65, + "grad_norm": 1.4888056516647339, + "learning_rate": 1.5653265280508725e-05, + "loss": 0.0392, "step": 5196 }, { - "epoch": 1.56, - "grad_norm": 18.7374210357666, - "learning_rate": 9.584043299589056e-06, - "loss": 1.6044, + "epoch": 0.65, + "grad_norm": 15.606212615966797, + "learning_rate": 1.5652428565452036e-05, + "loss": 1.7273, "step": 5197 }, { - "epoch": 1.56, - "grad_norm": 21.584501266479492, - "learning_rate": 9.582038688984664e-06, - "loss": 2.0156, + "epoch": 0.65, + "grad_norm": 10.794861793518066, + "learning_rate": 1.565159185039535e-05, + "loss": 1.3038, "step": 5198 }, { - "epoch": 1.56, - "grad_norm": 17.4039363861084, - "learning_rate": 9.580034078380276e-06, - "loss": 1.3307, + "epoch": 0.65, + "grad_norm": 15.449944496154785, + "learning_rate": 1.5650755135338663e-05, + "loss": 2.1364, "step": 5199 }, { - "epoch": 1.56, - "grad_norm": 10.785429000854492, - "learning_rate": 9.578029467775886e-06, - "loss": 1.2009, + "epoch": 0.65, + "grad_norm": 17.74704933166504, + "learning_rate": 1.5649918420281973e-05, + "loss": 2.6271, "step": 5200 }, { - "epoch": 1.56, - "grad_norm": 13.932561874389648, - "learning_rate": 9.576024857171495e-06, - "loss": 1.6818, + "epoch": 0.65, + "eval_loss": 0.11776099354028702, + "eval_runtime": 95.0543, + "eval_samples_per_second": 37.263, + "eval_steps_per_second": 37.263, + "step": 5200 + }, + { + "epoch": 0.65, + "grad_norm": 11.526359558105469, + "learning_rate": 1.5649081705225287e-05, + "loss": 1.1783, "step": 5201 }, { - "epoch": 1.56, - "grad_norm": 13.384397506713867, - "learning_rate": 9.574020246567105e-06, - "loss": 1.7232, + "epoch": 0.65, + "grad_norm": 8.998695373535156, + "learning_rate": 1.56482449901686e-05, + "loss": 1.1888, "step": 5202 }, { - "epoch": 1.56, - "grad_norm": 32.59089279174805, - "learning_rate": 9.572015635962715e-06, - "loss": 2.1975, + "epoch": 0.65, + "grad_norm": 34.16201400756836, + "learning_rate": 1.5647408275111914e-05, + "loss": 1.4792, "step": 5203 }, { - "epoch": 1.56, - "grad_norm": 19.552274703979492, - "learning_rate": 9.570011025358325e-06, - "loss": 2.2128, + "epoch": 0.65, + "grad_norm": 22.636789321899414, + "learning_rate": 1.5646571560055224e-05, + "loss": 1.4457, "step": 5204 }, { - "epoch": 1.56, - "grad_norm": 15.523249626159668, - "learning_rate": 9.568006414753935e-06, - "loss": 1.1143, + "epoch": 0.65, + "grad_norm": 32.693355560302734, + "learning_rate": 1.5645734844998538e-05, + "loss": 2.9091, "step": 5205 }, { - "epoch": 1.57, - "grad_norm": 35.84797286987305, - "learning_rate": 9.566001804149545e-06, - "loss": 2.5374, + "epoch": 0.65, + "grad_norm": 16.178911209106445, + "learning_rate": 1.5644898129941852e-05, + "loss": 1.4922, "step": 5206 }, { - "epoch": 1.57, - "grad_norm": 14.734626770019531, - "learning_rate": 9.563997193545155e-06, - "loss": 1.2287, + "epoch": 0.65, + "grad_norm": 15.090814590454102, + "learning_rate": 1.5644061414885162e-05, + "loss": 1.7392, "step": 5207 }, { - "epoch": 1.57, - "grad_norm": 13.902031898498535, - "learning_rate": 9.561992582940765e-06, - "loss": 1.4145, + "epoch": 0.65, + "grad_norm": 10.362354278564453, + "learning_rate": 1.5643224699828476e-05, + "loss": 0.7582, "step": 5208 }, { - "epoch": 1.57, - "grad_norm": 9.233687400817871, - "learning_rate": 9.559987972336374e-06, - "loss": 1.2157, + "epoch": 0.65, + "grad_norm": 12.946538925170898, + "learning_rate": 1.5642387984771786e-05, + "loss": 0.9911, "step": 5209 }, { - "epoch": 1.57, - "grad_norm": 47.99122619628906, - "learning_rate": 9.557983361731984e-06, - "loss": 1.6352, + "epoch": 0.65, + "grad_norm": 19.269906997680664, + "learning_rate": 1.56415512697151e-05, + "loss": 3.0072, "step": 5210 }, { - "epoch": 1.57, - "grad_norm": 18.855663299560547, - "learning_rate": 9.555978751127596e-06, - "loss": 2.1963, + "epoch": 0.65, + "grad_norm": 13.436691284179688, + "learning_rate": 1.5640714554658413e-05, + "loss": 1.9378, "step": 5211 }, { - "epoch": 1.57, - "grad_norm": 25.103059768676758, - "learning_rate": 9.553974140523204e-06, - "loss": 2.7753, + "epoch": 0.65, + "grad_norm": 11.440930366516113, + "learning_rate": 1.5639877839601723e-05, + "loss": 0.5874, "step": 5212 }, { - "epoch": 1.57, - "grad_norm": 19.241830825805664, - "learning_rate": 9.551969529918814e-06, - "loss": 1.5551, + "epoch": 0.65, + "grad_norm": 16.88869857788086, + "learning_rate": 1.5639041124545037e-05, + "loss": 1.3259, "step": 5213 }, { - "epoch": 1.57, - "grad_norm": 13.835005760192871, - "learning_rate": 9.549964919314424e-06, - "loss": 1.4738, + "epoch": 0.65, + "grad_norm": 26.100208282470703, + "learning_rate": 1.5638204409488347e-05, + "loss": 2.6398, "step": 5214 }, { - "epoch": 1.57, - "grad_norm": 8.090837478637695, - "learning_rate": 9.547960308710034e-06, - "loss": 1.7791, + "epoch": 0.65, + "grad_norm": 7.826216697692871, + "learning_rate": 1.563736769443166e-05, + "loss": 1.6174, "step": 5215 }, { - "epoch": 1.57, - "grad_norm": 22.887216567993164, - "learning_rate": 9.545955698105643e-06, - "loss": 1.4865, + "epoch": 0.65, + "grad_norm": 15.352578163146973, + "learning_rate": 1.5636530979374975e-05, + "loss": 1.6649, "step": 5216 }, { - "epoch": 1.57, - "grad_norm": 18.71986961364746, - "learning_rate": 9.543951087501254e-06, - "loss": 1.7043, + "epoch": 0.65, + "grad_norm": 7.8712077140808105, + "learning_rate": 1.563569426431829e-05, + "loss": 1.7075, "step": 5217 }, { - "epoch": 1.57, - "grad_norm": 9.452502250671387, - "learning_rate": 9.541946476896864e-06, - "loss": 1.9759, + "epoch": 0.65, + "grad_norm": 19.553529739379883, + "learning_rate": 1.56348575492616e-05, + "loss": 1.2477, "step": 5218 }, { - "epoch": 1.57, - "grad_norm": 18.228004455566406, - "learning_rate": 9.539941866292473e-06, - "loss": 1.4368, + "epoch": 0.65, + "grad_norm": 30.20794105529785, + "learning_rate": 1.5634020834204912e-05, + "loss": 2.1334, "step": 5219 }, { - "epoch": 1.57, - "grad_norm": 27.472970962524414, - "learning_rate": 9.537937255688083e-06, - "loss": 1.5878, + "epoch": 0.66, + "grad_norm": 17.59926414489746, + "learning_rate": 1.5633184119148226e-05, + "loss": 2.1509, "step": 5220 }, { - "epoch": 1.57, - "grad_norm": 34.94017028808594, - "learning_rate": 9.535932645083693e-06, - "loss": 1.7284, + "epoch": 0.66, + "grad_norm": 14.72649097442627, + "learning_rate": 1.5632347404091536e-05, + "loss": 1.6159, "step": 5221 }, { - "epoch": 1.57, - "grad_norm": 19.611160278320312, - "learning_rate": 9.533928034479303e-06, - "loss": 1.9294, + "epoch": 0.66, + "grad_norm": 7.848047733306885, + "learning_rate": 1.563151068903485e-05, + "loss": 1.0181, "step": 5222 }, { - "epoch": 1.57, - "grad_norm": 181.42205810546875, - "learning_rate": 9.531923423874913e-06, - "loss": 1.5287, + "epoch": 0.66, + "grad_norm": 10.326218605041504, + "learning_rate": 1.5630673973978163e-05, + "loss": 1.5618, "step": 5223 }, { - "epoch": 1.57, - "grad_norm": 27.391780853271484, - "learning_rate": 9.529918813270523e-06, - "loss": 1.0979, + "epoch": 0.66, + "grad_norm": 5.257458686828613, + "learning_rate": 1.5629837258921477e-05, + "loss": 1.1577, "step": 5224 }, { - "epoch": 1.57, - "grad_norm": 19.48676872253418, - "learning_rate": 9.527914202666133e-06, - "loss": 1.9428, + "epoch": 0.66, + "grad_norm": 14.704678535461426, + "learning_rate": 1.5629000543864787e-05, + "loss": 1.8026, "step": 5225 }, { - "epoch": 1.57, - "grad_norm": 60.94241714477539, - "learning_rate": 9.525909592061743e-06, - "loss": 2.5457, + "epoch": 0.66, + "grad_norm": 24.467130661010742, + "learning_rate": 1.56281638288081e-05, + "loss": 2.5593, "step": 5226 }, { - "epoch": 1.57, - "grad_norm": 39.271629333496094, - "learning_rate": 9.523904981457352e-06, - "loss": 1.834, + "epoch": 0.66, + "grad_norm": 21.703311920166016, + "learning_rate": 1.5627327113751415e-05, + "loss": 3.3068, "step": 5227 }, { - "epoch": 1.57, - "grad_norm": 7.752985000610352, - "learning_rate": 9.521900370852962e-06, - "loss": 1.3035, + "epoch": 0.66, + "grad_norm": 19.474443435668945, + "learning_rate": 1.5626490398694725e-05, + "loss": 1.4674, "step": 5228 }, { - "epoch": 1.57, - "grad_norm": 11.235089302062988, - "learning_rate": 9.519895760248572e-06, - "loss": 0.659, + "epoch": 0.66, + "grad_norm": 14.166336059570312, + "learning_rate": 1.562565368363804e-05, + "loss": 2.139, "step": 5229 }, { - "epoch": 1.57, - "grad_norm": 22.830162048339844, - "learning_rate": 9.517891149644182e-06, - "loss": 2.1836, + "epoch": 0.66, + "grad_norm": 8.847391128540039, + "learning_rate": 1.5624816968581352e-05, + "loss": 1.7414, "step": 5230 }, { - "epoch": 1.57, - "grad_norm": 40.63211441040039, - "learning_rate": 9.515886539039792e-06, - "loss": 2.0237, + "epoch": 0.66, + "grad_norm": 26.29215431213379, + "learning_rate": 1.5623980253524666e-05, + "loss": 2.1403, "step": 5231 }, { - "epoch": 1.57, - "grad_norm": 15.20790958404541, - "learning_rate": 9.513881928435402e-06, - "loss": 0.9827, + "epoch": 0.66, + "grad_norm": 35.24619674682617, + "learning_rate": 1.5623143538467976e-05, + "loss": 2.2782, "step": 5232 }, { - "epoch": 1.57, - "grad_norm": 24.22981071472168, - "learning_rate": 9.511877317831012e-06, - "loss": 1.6325, + "epoch": 0.66, + "grad_norm": 9.792732238769531, + "learning_rate": 1.562230682341129e-05, + "loss": 3.4117, "step": 5233 }, { - "epoch": 1.57, - "grad_norm": 69.4743423461914, - "learning_rate": 9.509872707226622e-06, - "loss": 2.8366, + "epoch": 0.66, + "grad_norm": 8.300969123840332, + "learning_rate": 1.56214701083546e-05, + "loss": 0.9535, "step": 5234 }, { - "epoch": 1.57, - "grad_norm": 16.03273582458496, - "learning_rate": 9.50786809662223e-06, - "loss": 1.8027, + "epoch": 0.66, + "grad_norm": 22.07040023803711, + "learning_rate": 1.5620633393297914e-05, + "loss": 2.7798, "step": 5235 }, { - "epoch": 1.57, - "grad_norm": 27.365671157836914, - "learning_rate": 9.505863486017843e-06, - "loss": 1.7576, + "epoch": 0.66, + "grad_norm": 12.015377044677734, + "learning_rate": 1.5619796678241227e-05, + "loss": 2.0949, "step": 5236 }, { - "epoch": 1.57, - "grad_norm": 24.969091415405273, - "learning_rate": 9.503858875413453e-06, - "loss": 2.0772, + "epoch": 0.66, + "grad_norm": 38.503719329833984, + "learning_rate": 1.5618959963184538e-05, + "loss": 2.7961, "step": 5237 }, { - "epoch": 1.57, - "grad_norm": 17.248416900634766, - "learning_rate": 9.501854264809061e-06, - "loss": 1.3726, + "epoch": 0.66, + "grad_norm": 8.067269325256348, + "learning_rate": 1.561812324812785e-05, + "loss": 1.1209, "step": 5238 }, { - "epoch": 1.58, - "grad_norm": 15.464237213134766, - "learning_rate": 9.499849654204671e-06, - "loss": 1.4722, + "epoch": 0.66, + "grad_norm": 36.50723648071289, + "learning_rate": 1.561728653307116e-05, + "loss": 3.2759, "step": 5239 }, { - "epoch": 1.58, - "grad_norm": 24.053791046142578, - "learning_rate": 9.497845043600281e-06, - "loss": 1.4818, + "epoch": 0.66, + "grad_norm": 8.340876579284668, + "learning_rate": 1.5616449818014475e-05, + "loss": 0.8212, "step": 5240 }, { - "epoch": 1.58, - "grad_norm": 21.699386596679688, - "learning_rate": 9.495840432995891e-06, - "loss": 2.1324, + "epoch": 0.66, + "grad_norm": 18.946107864379883, + "learning_rate": 1.561561310295779e-05, + "loss": 1.8706, "step": 5241 }, { - "epoch": 1.58, - "grad_norm": 32.680335998535156, - "learning_rate": 9.493835822391501e-06, - "loss": 2.5121, + "epoch": 0.66, + "grad_norm": 13.62232780456543, + "learning_rate": 1.56147763879011e-05, + "loss": 2.333, "step": 5242 }, { - "epoch": 1.58, - "grad_norm": 9.323308944702148, - "learning_rate": 9.491831211787111e-06, - "loss": 1.7795, + "epoch": 0.66, + "grad_norm": 5.632523536682129, + "learning_rate": 1.5613939672844413e-05, + "loss": 1.7993, "step": 5243 }, { - "epoch": 1.58, - "grad_norm": 9.845460891723633, - "learning_rate": 9.489826601182722e-06, - "loss": 1.5512, + "epoch": 0.66, + "grad_norm": 49.13154602050781, + "learning_rate": 1.5613102957787726e-05, + "loss": 2.3888, "step": 5244 }, { - "epoch": 1.58, - "grad_norm": 12.808528900146484, - "learning_rate": 9.487821990578332e-06, - "loss": 1.6268, + "epoch": 0.66, + "grad_norm": 40.47331619262695, + "learning_rate": 1.561226624273104e-05, + "loss": 1.5035, "step": 5245 }, { - "epoch": 1.58, - "grad_norm": 30.653648376464844, - "learning_rate": 9.48581737997394e-06, - "loss": 2.2517, + "epoch": 0.66, + "grad_norm": 33.18256378173828, + "learning_rate": 1.561142952767435e-05, + "loss": 1.3711, "step": 5246 }, { - "epoch": 1.58, - "grad_norm": 37.43178939819336, - "learning_rate": 9.48381276936955e-06, - "loss": 1.6323, + "epoch": 0.66, + "grad_norm": 13.688959121704102, + "learning_rate": 1.5610592812617664e-05, + "loss": 1.282, "step": 5247 }, { - "epoch": 1.58, - "grad_norm": 23.361366271972656, - "learning_rate": 9.481808158765162e-06, - "loss": 1.6171, + "epoch": 0.66, + "grad_norm": 15.67768383026123, + "learning_rate": 1.5609756097560978e-05, + "loss": 2.3848, "step": 5248 }, { - "epoch": 1.58, - "grad_norm": 43.43690872192383, - "learning_rate": 9.47980354816077e-06, - "loss": 1.8716, + "epoch": 0.66, + "grad_norm": 8.087139129638672, + "learning_rate": 1.5608919382504288e-05, + "loss": 1.9862, "step": 5249 }, { - "epoch": 1.58, - "grad_norm": 16.062522888183594, - "learning_rate": 9.47779893755638e-06, - "loss": 2.9676, + "epoch": 0.66, + "grad_norm": 19.833606719970703, + "learning_rate": 1.56080826674476e-05, + "loss": 2.536, "step": 5250 }, { - "epoch": 1.58, - "grad_norm": 34.8533935546875, - "learning_rate": 9.47579432695199e-06, - "loss": 1.6051, + "epoch": 0.66, + "grad_norm": 11.725749969482422, + "learning_rate": 1.5607245952390915e-05, + "loss": 2.0792, "step": 5251 }, { - "epoch": 1.58, - "grad_norm": 20.79990577697754, - "learning_rate": 9.4737897163476e-06, - "loss": 2.5814, + "epoch": 0.66, + "grad_norm": 23.482179641723633, + "learning_rate": 1.560640923733423e-05, + "loss": 2.3725, "step": 5252 }, { - "epoch": 1.58, - "grad_norm": 94.41461181640625, - "learning_rate": 9.47178510574321e-06, - "loss": 3.8194, + "epoch": 0.66, + "grad_norm": 13.740874290466309, + "learning_rate": 1.560557252227754e-05, + "loss": 2.5001, "step": 5253 }, { - "epoch": 1.58, - "grad_norm": 9.540099143981934, - "learning_rate": 9.46978049513882e-06, - "loss": 2.1172, + "epoch": 0.66, + "grad_norm": 19.1165714263916, + "learning_rate": 1.5604735807220853e-05, + "loss": 2.192, "step": 5254 }, { - "epoch": 1.58, - "grad_norm": 18.652847290039062, - "learning_rate": 9.46777588453443e-06, - "loss": 1.3757, + "epoch": 0.66, + "grad_norm": 13.846579551696777, + "learning_rate": 1.5603899092164166e-05, + "loss": 2.5799, "step": 5255 }, { - "epoch": 1.58, - "grad_norm": 6.206528663635254, - "learning_rate": 9.465771273930041e-06, - "loss": 0.92, + "epoch": 0.66, + "grad_norm": 12.332139015197754, + "learning_rate": 1.5603062377107477e-05, + "loss": 0.978, "step": 5256 }, { - "epoch": 1.58, - "grad_norm": 18.400524139404297, - "learning_rate": 9.46376666332565e-06, - "loss": 1.8978, + "epoch": 0.66, + "grad_norm": 20.706724166870117, + "learning_rate": 1.560222566205079e-05, + "loss": 3.2421, "step": 5257 }, { - "epoch": 1.58, - "grad_norm": 22.74957275390625, - "learning_rate": 9.46176205272126e-06, - "loss": 1.8973, + "epoch": 0.66, + "grad_norm": 15.696081161499023, + "learning_rate": 1.5601388946994104e-05, + "loss": 1.1646, "step": 5258 }, { - "epoch": 1.58, - "grad_norm": 15.965211868286133, - "learning_rate": 9.45975744211687e-06, - "loss": 1.8043, + "epoch": 0.66, + "grad_norm": 17.897335052490234, + "learning_rate": 1.5600552231937418e-05, + "loss": 2.4383, "step": 5259 }, { - "epoch": 1.58, - "grad_norm": 19.99260139465332, - "learning_rate": 9.45775283151248e-06, - "loss": 1.6596, + "epoch": 0.66, + "grad_norm": 10.454360961914062, + "learning_rate": 1.5599715516880728e-05, + "loss": 1.5693, "step": 5260 }, { - "epoch": 1.58, - "grad_norm": 9.259965896606445, - "learning_rate": 9.45574822090809e-06, - "loss": 1.4668, + "epoch": 0.66, + "grad_norm": 13.172468185424805, + "learning_rate": 1.559887880182404e-05, + "loss": 1.4832, "step": 5261 }, { - "epoch": 1.58, - "grad_norm": 49.869110107421875, - "learning_rate": 9.4537436103037e-06, - "loss": 3.4404, + "epoch": 0.66, + "grad_norm": 19.44193458557129, + "learning_rate": 1.559804208676735e-05, + "loss": 3.0477, "step": 5262 }, { - "epoch": 1.58, - "grad_norm": 26.978092193603516, - "learning_rate": 9.45173899969931e-06, - "loss": 2.3785, + "epoch": 0.66, + "grad_norm": 52.53721618652344, + "learning_rate": 1.5597205371710665e-05, + "loss": 1.1252, "step": 5263 }, { - "epoch": 1.58, - "grad_norm": 37.760353088378906, - "learning_rate": 9.449734389094918e-06, - "loss": 1.1802, + "epoch": 0.66, + "grad_norm": 10.68852710723877, + "learning_rate": 1.559636865665398e-05, + "loss": 1.5092, "step": 5264 }, { - "epoch": 1.58, - "grad_norm": 24.364702224731445, - "learning_rate": 9.447729778490528e-06, - "loss": 2.288, + "epoch": 0.66, + "grad_norm": 1.36933171749115, + "learning_rate": 1.559553194159729e-05, + "loss": 0.0572, "step": 5265 }, { - "epoch": 1.58, - "grad_norm": 17.613224029541016, - "learning_rate": 9.445725167886138e-06, - "loss": 1.8934, + "epoch": 0.66, + "grad_norm": 14.17722225189209, + "learning_rate": 1.5594695226540603e-05, + "loss": 0.3962, "step": 5266 }, { - "epoch": 1.58, - "grad_norm": 25.8133544921875, - "learning_rate": 9.443720557281748e-06, - "loss": 1.6238, + "epoch": 0.66, + "grad_norm": 10.241806030273438, + "learning_rate": 1.5593858511483913e-05, + "loss": 2.3737, "step": 5267 }, { - "epoch": 1.58, - "grad_norm": 32.17334747314453, - "learning_rate": 9.441715946677358e-06, - "loss": 1.8157, + "epoch": 0.66, + "grad_norm": 28.391183853149414, + "learning_rate": 1.5593021796427227e-05, + "loss": 1.0734, "step": 5268 }, { - "epoch": 1.58, - "grad_norm": 16.442485809326172, - "learning_rate": 9.439711336072969e-06, - "loss": 1.1689, + "epoch": 0.66, + "grad_norm": 14.059039115905762, + "learning_rate": 1.559218508137054e-05, + "loss": 1.8246, "step": 5269 }, { - "epoch": 1.58, - "grad_norm": 11.316102027893066, - "learning_rate": 9.437706725468579e-06, - "loss": 2.063, + "epoch": 0.66, + "grad_norm": 6.69702672958374, + "learning_rate": 1.559134836631385e-05, + "loss": 0.1278, "step": 5270 }, { - "epoch": 1.58, - "grad_norm": 11.197502136230469, - "learning_rate": 9.435702114864189e-06, - "loss": 1.4149, + "epoch": 0.66, + "grad_norm": 8.767797470092773, + "learning_rate": 1.5590511651257164e-05, + "loss": 0.8445, "step": 5271 }, { - "epoch": 1.59, - "grad_norm": 15.829837799072266, - "learning_rate": 9.433697504259797e-06, - "loss": 1.6842, + "epoch": 0.66, + "grad_norm": 6.164175987243652, + "learning_rate": 1.5589674936200478e-05, + "loss": 0.355, "step": 5272 }, { - "epoch": 1.59, - "grad_norm": 44.153804779052734, - "learning_rate": 9.431692893655409e-06, - "loss": 1.8206, + "epoch": 0.66, + "grad_norm": 28.865331649780273, + "learning_rate": 1.558883822114379e-05, + "loss": 3.0153, "step": 5273 }, { - "epoch": 1.59, - "grad_norm": 8.531728744506836, - "learning_rate": 9.429688283051019e-06, - "loss": 0.9972, + "epoch": 0.66, + "grad_norm": 12.738309860229492, + "learning_rate": 1.5588001506087102e-05, + "loss": 1.9587, "step": 5274 }, { - "epoch": 1.59, - "grad_norm": 16.26192283630371, - "learning_rate": 9.427683672446627e-06, - "loss": 1.8538, + "epoch": 0.66, + "grad_norm": 30.209096908569336, + "learning_rate": 1.5587164791030416e-05, + "loss": 1.3462, "step": 5275 }, { - "epoch": 1.59, - "grad_norm": 10.455742835998535, - "learning_rate": 9.425679061842237e-06, - "loss": 1.0675, + "epoch": 0.66, + "grad_norm": 26.268959045410156, + "learning_rate": 1.558632807597373e-05, + "loss": 2.0748, "step": 5276 }, { - "epoch": 1.59, - "grad_norm": 19.000938415527344, - "learning_rate": 9.423674451237848e-06, - "loss": 1.9213, + "epoch": 0.66, + "grad_norm": 7.454601287841797, + "learning_rate": 1.558549136091704e-05, + "loss": 0.8775, "step": 5277 }, { - "epoch": 1.59, - "grad_norm": 17.04540252685547, - "learning_rate": 9.421669840633458e-06, - "loss": 1.4436, + "epoch": 0.66, + "grad_norm": 6.933131217956543, + "learning_rate": 1.5584654645860353e-05, + "loss": 0.4548, "step": 5278 }, { - "epoch": 1.59, - "grad_norm": 15.419578552246094, - "learning_rate": 9.419665230029068e-06, - "loss": 1.9087, + "epoch": 0.66, + "grad_norm": 6.293020725250244, + "learning_rate": 1.5583817930803667e-05, + "loss": 2.0141, "step": 5279 }, { - "epoch": 1.59, - "grad_norm": 18.85577392578125, - "learning_rate": 9.417660619424678e-06, - "loss": 2.2146, - "step": 5280 - }, - { - "epoch": 1.59, - "eval_loss": 0.1917702704668045, - "eval_runtime": 43.7028, - "eval_samples_per_second": 33.842, - "eval_steps_per_second": 33.842, + "epoch": 0.66, + "grad_norm": 45.84587478637695, + "learning_rate": 1.558298121574698e-05, + "loss": 1.7475, "step": 5280 }, { - "epoch": 1.59, - "grad_norm": 24.73578453063965, - "learning_rate": 9.415656008820288e-06, - "loss": 2.0709, + "epoch": 0.66, + "grad_norm": 12.852069854736328, + "learning_rate": 1.558214450069029e-05, + "loss": 1.1722, "step": 5281 }, { - "epoch": 1.59, - "grad_norm": 10.795376777648926, - "learning_rate": 9.413651398215898e-06, - "loss": 0.9204, + "epoch": 0.66, + "grad_norm": 9.643006324768066, + "learning_rate": 1.5581307785633604e-05, + "loss": 2.4848, "step": 5282 }, { - "epoch": 1.59, - "grad_norm": 14.635207176208496, - "learning_rate": 9.411646787611506e-06, - "loss": 1.7222, + "epoch": 0.66, + "grad_norm": 11.011727333068848, + "learning_rate": 1.5580471070576918e-05, + "loss": 2.8387, "step": 5283 }, { - "epoch": 1.59, - "grad_norm": 12.989263534545898, - "learning_rate": 9.409642177007116e-06, - "loss": 1.3394, + "epoch": 0.66, + "grad_norm": 13.584528923034668, + "learning_rate": 1.5579634355520228e-05, + "loss": 1.7407, "step": 5284 }, { - "epoch": 1.59, - "grad_norm": 12.602009773254395, - "learning_rate": 9.407637566402728e-06, - "loss": 1.7059, + "epoch": 0.66, + "grad_norm": 12.662556648254395, + "learning_rate": 1.5578797640463542e-05, + "loss": 2.0174, "step": 5285 }, { - "epoch": 1.59, - "grad_norm": 20.175142288208008, - "learning_rate": 9.405632955798337e-06, - "loss": 1.7654, + "epoch": 0.66, + "grad_norm": 16.633058547973633, + "learning_rate": 1.5577960925406856e-05, + "loss": 2.0047, "step": 5286 }, { - "epoch": 1.59, - "grad_norm": 18.197296142578125, - "learning_rate": 9.403628345193947e-06, - "loss": 1.7068, + "epoch": 0.66, + "grad_norm": 12.066131591796875, + "learning_rate": 1.5577124210350166e-05, + "loss": 1.0126, "step": 5287 }, { - "epoch": 1.59, - "grad_norm": 34.1553955078125, - "learning_rate": 9.401623734589557e-06, - "loss": 1.3868, + "epoch": 0.66, + "grad_norm": 35.76958084106445, + "learning_rate": 1.557628749529348e-05, + "loss": 2.378, "step": 5288 }, { - "epoch": 1.59, - "grad_norm": 27.550334930419922, - "learning_rate": 9.399619123985167e-06, - "loss": 1.2817, + "epoch": 0.66, + "grad_norm": 8.880400657653809, + "learning_rate": 1.5575450780236793e-05, + "loss": 1.9348, "step": 5289 }, { - "epoch": 1.59, - "grad_norm": 19.8583984375, - "learning_rate": 9.397614513380777e-06, - "loss": 1.9351, + "epoch": 0.66, + "grad_norm": 3.5895142555236816, + "learning_rate": 1.5574614065180103e-05, + "loss": 0.1277, "step": 5290 }, { - "epoch": 1.59, - "grad_norm": 6.953277587890625, - "learning_rate": 9.395609902776387e-06, - "loss": 1.1993, + "epoch": 0.66, + "grad_norm": 4.755842208862305, + "learning_rate": 1.5573777350123417e-05, + "loss": 0.3882, "step": 5291 }, { - "epoch": 1.59, - "grad_norm": 15.303847312927246, - "learning_rate": 9.393605292171997e-06, - "loss": 1.6499, + "epoch": 0.66, + "grad_norm": 12.279382705688477, + "learning_rate": 1.5572940635066727e-05, + "loss": 2.5931, "step": 5292 }, { - "epoch": 1.59, - "grad_norm": 28.220230102539062, - "learning_rate": 9.391600681567607e-06, - "loss": 1.7453, + "epoch": 0.66, + "grad_norm": 4.911112308502197, + "learning_rate": 1.557210392001004e-05, + "loss": 0.6729, "step": 5293 }, { - "epoch": 1.59, - "grad_norm": 40.280029296875, - "learning_rate": 9.389596070963216e-06, - "loss": 1.9758, + "epoch": 0.66, + "grad_norm": 11.579627990722656, + "learning_rate": 1.5571267204953355e-05, + "loss": 3.0826, "step": 5294 }, { - "epoch": 1.59, - "grad_norm": 15.754435539245605, - "learning_rate": 9.387591460358826e-06, - "loss": 1.1859, + "epoch": 0.66, + "grad_norm": 8.23723030090332, + "learning_rate": 1.5570430489896665e-05, + "loss": 1.0335, "step": 5295 }, { - "epoch": 1.59, - "grad_norm": 64.05276489257812, - "learning_rate": 9.385586849754436e-06, - "loss": 2.8985, + "epoch": 0.66, + "grad_norm": 23.425081253051758, + "learning_rate": 1.556959377483998e-05, + "loss": 1.8962, "step": 5296 }, { - "epoch": 1.59, - "grad_norm": 12.505768775939941, - "learning_rate": 9.383582239150046e-06, - "loss": 1.7193, + "epoch": 0.66, + "grad_norm": 22.175535202026367, + "learning_rate": 1.5568757059783292e-05, + "loss": 2.0819, "step": 5297 }, { - "epoch": 1.59, - "grad_norm": 22.20297622680664, - "learning_rate": 9.381577628545656e-06, - "loss": 1.8241, + "epoch": 0.66, + "grad_norm": 13.097886085510254, + "learning_rate": 1.5567920344726602e-05, + "loss": 1.8502, "step": 5298 }, { - "epoch": 1.59, - "grad_norm": 17.095487594604492, - "learning_rate": 9.379573017941266e-06, - "loss": 1.7652, + "epoch": 0.67, + "grad_norm": 10.502854347229004, + "learning_rate": 1.5567083629669916e-05, + "loss": 1.0218, "step": 5299 }, { - "epoch": 1.59, - "grad_norm": 30.220666885375977, - "learning_rate": 9.377568407336876e-06, - "loss": 2.0137, + "epoch": 0.67, + "grad_norm": 9.6867036819458, + "learning_rate": 1.556624691461323e-05, + "loss": 0.9032, "step": 5300 }, { - "epoch": 1.59, - "grad_norm": 16.760820388793945, - "learning_rate": 9.375563796732484e-06, - "loss": 1.4244, + "epoch": 0.67, + "grad_norm": 21.5103702545166, + "learning_rate": 1.5565410199556543e-05, + "loss": 2.3803, "step": 5301 }, { - "epoch": 1.59, - "grad_norm": 74.30367279052734, - "learning_rate": 9.373559186128095e-06, - "loss": 1.818, + "epoch": 0.67, + "grad_norm": 13.037554740905762, + "learning_rate": 1.5564573484499854e-05, + "loss": 1.0351, "step": 5302 }, { - "epoch": 1.59, - "grad_norm": 8.43071460723877, - "learning_rate": 9.371554575523705e-06, - "loss": 0.9715, + "epoch": 0.67, + "grad_norm": 19.55864143371582, + "learning_rate": 1.5563736769443167e-05, + "loss": 0.9527, "step": 5303 }, { - "epoch": 1.59, - "grad_norm": 19.98003387451172, - "learning_rate": 9.369549964919315e-06, - "loss": 1.5193, + "epoch": 0.67, + "grad_norm": 4.488838195800781, + "learning_rate": 1.556290005438648e-05, + "loss": 0.074, "step": 5304 }, { - "epoch": 1.6, - "grad_norm": 15.74586009979248, - "learning_rate": 9.367545354314925e-06, - "loss": 1.7498, + "epoch": 0.67, + "grad_norm": 7.514483451843262, + "learning_rate": 1.556206333932979e-05, + "loss": 0.5455, "step": 5305 }, { - "epoch": 1.6, - "grad_norm": 16.798616409301758, - "learning_rate": 9.365540743710535e-06, - "loss": 0.9751, + "epoch": 0.67, + "grad_norm": 16.274606704711914, + "learning_rate": 1.5561226624273105e-05, + "loss": 1.7984, "step": 5306 }, { - "epoch": 1.6, - "grad_norm": 15.125486373901367, - "learning_rate": 9.363536133106145e-06, - "loss": 2.1381, + "epoch": 0.67, + "grad_norm": 9.492721557617188, + "learning_rate": 1.556038990921642e-05, + "loss": 1.2578, "step": 5307 }, { - "epoch": 1.6, - "grad_norm": 26.87381362915039, - "learning_rate": 9.361531522501755e-06, - "loss": 1.6783, + "epoch": 0.67, + "grad_norm": 11.834115982055664, + "learning_rate": 1.5559553194159732e-05, + "loss": 1.3758, "step": 5308 }, { - "epoch": 1.6, - "grad_norm": 23.354263305664062, - "learning_rate": 9.359526911897363e-06, - "loss": 2.2363, + "epoch": 0.67, + "grad_norm": 23.507463455200195, + "learning_rate": 1.5558716479103042e-05, + "loss": 2.7738, "step": 5309 }, { - "epoch": 1.6, - "grad_norm": 11.7379789352417, - "learning_rate": 9.357522301292975e-06, - "loss": 1.6394, + "epoch": 0.67, + "grad_norm": 18.14545440673828, + "learning_rate": 1.5557879764046356e-05, + "loss": 1.151, "step": 5310 }, { - "epoch": 1.6, - "grad_norm": 16.918331146240234, - "learning_rate": 9.355517690688585e-06, - "loss": 2.0194, + "epoch": 0.67, + "grad_norm": 9.639022827148438, + "learning_rate": 1.555704304898967e-05, + "loss": 2.6962, "step": 5311 }, { - "epoch": 1.6, - "grad_norm": 10.654472351074219, - "learning_rate": 9.353513080084194e-06, - "loss": 2.1258, + "epoch": 0.67, + "grad_norm": 37.715087890625, + "learning_rate": 1.555620633393298e-05, + "loss": 1.3331, "step": 5312 }, { - "epoch": 1.6, - "grad_norm": 23.184656143188477, - "learning_rate": 9.351508469479804e-06, - "loss": 2.9148, + "epoch": 0.67, + "grad_norm": 16.375370025634766, + "learning_rate": 1.5555369618876294e-05, + "loss": 3.3425, "step": 5313 }, { - "epoch": 1.6, - "grad_norm": 10.799365997314453, - "learning_rate": 9.349503858875414e-06, - "loss": 0.941, + "epoch": 0.67, + "grad_norm": 16.068984985351562, + "learning_rate": 1.5554532903819607e-05, + "loss": 2.0363, "step": 5314 }, { - "epoch": 1.6, - "grad_norm": 12.17577838897705, - "learning_rate": 9.347499248271024e-06, - "loss": 0.8023, + "epoch": 0.67, + "grad_norm": 5.682156085968018, + "learning_rate": 1.5553696188762917e-05, + "loss": 2.2618, "step": 5315 }, { - "epoch": 1.6, - "grad_norm": 25.213882446289062, - "learning_rate": 9.345494637666634e-06, - "loss": 1.6508, + "epoch": 0.67, + "grad_norm": 15.031339645385742, + "learning_rate": 1.555285947370623e-05, + "loss": 2.9795, "step": 5316 }, { - "epoch": 1.6, - "grad_norm": 11.12057113647461, - "learning_rate": 9.343490027062244e-06, - "loss": 1.589, + "epoch": 0.67, + "grad_norm": 13.337397575378418, + "learning_rate": 1.5552022758649545e-05, + "loss": 0.792, "step": 5317 }, { - "epoch": 1.6, - "grad_norm": 27.033601760864258, - "learning_rate": 9.341485416457854e-06, - "loss": 1.8537, + "epoch": 0.67, + "grad_norm": 27.936046600341797, + "learning_rate": 1.5551186043592855e-05, + "loss": 2.1488, "step": 5318 }, { - "epoch": 1.6, - "grad_norm": 16.07554054260254, - "learning_rate": 9.339480805853464e-06, - "loss": 1.4078, + "epoch": 0.67, + "grad_norm": 19.554880142211914, + "learning_rate": 1.555034932853617e-05, + "loss": 3.7691, "step": 5319 }, { - "epoch": 1.6, - "grad_norm": 16.774127960205078, - "learning_rate": 9.337476195249073e-06, - "loss": 1.2248, + "epoch": 0.67, + "grad_norm": 7.329078197479248, + "learning_rate": 1.554951261347948e-05, + "loss": 2.1147, "step": 5320 }, { - "epoch": 1.6, - "grad_norm": 12.099883079528809, - "learning_rate": 9.335471584644683e-06, - "loss": 0.5273, + "epoch": 0.67, + "grad_norm": 24.679201126098633, + "learning_rate": 1.5548675898422793e-05, + "loss": 3.1108, "step": 5321 }, { - "epoch": 1.6, - "grad_norm": 14.563944816589355, - "learning_rate": 9.333466974040295e-06, - "loss": 1.2972, + "epoch": 0.67, + "grad_norm": 15.159878730773926, + "learning_rate": 1.5547839183366106e-05, + "loss": 1.4298, "step": 5322 }, { - "epoch": 1.6, - "grad_norm": 6.988048553466797, - "learning_rate": 9.331462363435903e-06, - "loss": 1.2299, + "epoch": 0.67, + "grad_norm": 85.50730895996094, + "learning_rate": 1.5547002468309417e-05, + "loss": 2.3678, "step": 5323 }, { - "epoch": 1.6, - "grad_norm": 15.345603942871094, - "learning_rate": 9.329457752831513e-06, - "loss": 1.8004, + "epoch": 0.67, + "grad_norm": 9.848220825195312, + "learning_rate": 1.554616575325273e-05, + "loss": 0.7269, "step": 5324 }, { - "epoch": 1.6, - "grad_norm": 8.920988082885742, - "learning_rate": 9.327453142227123e-06, - "loss": 1.799, + "epoch": 0.67, + "grad_norm": 49.288997650146484, + "learning_rate": 1.5545329038196044e-05, + "loss": 2.2035, "step": 5325 }, { - "epoch": 1.6, - "grad_norm": 13.415757179260254, - "learning_rate": 9.325448531622733e-06, - "loss": 1.3893, + "epoch": 0.67, + "grad_norm": 14.30613899230957, + "learning_rate": 1.5544492323139354e-05, + "loss": 1.5891, "step": 5326 }, { - "epoch": 1.6, - "grad_norm": 14.817873001098633, - "learning_rate": 9.323443921018343e-06, - "loss": 2.0269, + "epoch": 0.67, + "grad_norm": 17.46340560913086, + "learning_rate": 1.5543655608082668e-05, + "loss": 1.3967, "step": 5327 }, { - "epoch": 1.6, - "grad_norm": 17.05929183959961, - "learning_rate": 9.321439310413953e-06, - "loss": 1.9584, + "epoch": 0.67, + "grad_norm": 12.865313529968262, + "learning_rate": 1.554281889302598e-05, + "loss": 0.9915, "step": 5328 }, { - "epoch": 1.6, - "grad_norm": 10.261931419372559, - "learning_rate": 9.319434699809563e-06, - "loss": 1.5321, + "epoch": 0.67, + "grad_norm": 10.885964393615723, + "learning_rate": 1.5541982177969295e-05, + "loss": 1.491, "step": 5329 }, { - "epoch": 1.6, - "grad_norm": 18.25226402282715, - "learning_rate": 9.317430089205174e-06, - "loss": 1.5914, + "epoch": 0.67, + "grad_norm": 52.34687423706055, + "learning_rate": 1.5541145462912605e-05, + "loss": 2.7669, "step": 5330 }, { - "epoch": 1.6, - "grad_norm": 13.496363639831543, - "learning_rate": 9.315425478600782e-06, - "loss": 1.6435, + "epoch": 0.67, + "grad_norm": 17.977941513061523, + "learning_rate": 1.554030874785592e-05, + "loss": 1.8955, "step": 5331 }, { - "epoch": 1.6, - "grad_norm": 26.062244415283203, - "learning_rate": 9.313420867996392e-06, - "loss": 1.892, + "epoch": 0.67, + "grad_norm": 22.670303344726562, + "learning_rate": 1.5539472032799233e-05, + "loss": 2.4956, "step": 5332 }, { - "epoch": 1.6, - "grad_norm": 28.14862632751465, - "learning_rate": 9.311416257392002e-06, - "loss": 0.9718, + "epoch": 0.67, + "grad_norm": 10.357001304626465, + "learning_rate": 1.5538635317742543e-05, + "loss": 0.5737, "step": 5333 }, { - "epoch": 1.6, - "grad_norm": 9.369063377380371, - "learning_rate": 9.309411646787612e-06, - "loss": 1.6377, + "epoch": 0.67, + "grad_norm": 9.255528450012207, + "learning_rate": 1.5537798602685856e-05, + "loss": 1.5152, "step": 5334 }, { - "epoch": 1.6, - "grad_norm": 15.546137809753418, - "learning_rate": 9.307407036183222e-06, - "loss": 1.9211, + "epoch": 0.67, + "grad_norm": 5.881645679473877, + "learning_rate": 1.553696188762917e-05, + "loss": 1.94, "step": 5335 }, { - "epoch": 1.6, - "grad_norm": 32.82224655151367, - "learning_rate": 9.305402425578832e-06, - "loss": 1.2737, + "epoch": 0.67, + "grad_norm": 7.286981105804443, + "learning_rate": 1.5536125172572484e-05, + "loss": 0.8161, "step": 5336 }, { - "epoch": 1.6, - "grad_norm": 58.94491958618164, - "learning_rate": 9.303397814974442e-06, - "loss": 2.3802, + "epoch": 0.67, + "grad_norm": 22.43034553527832, + "learning_rate": 1.5535288457515794e-05, + "loss": 2.1878, "step": 5337 }, { - "epoch": 1.6, - "grad_norm": 17.832128524780273, - "learning_rate": 9.30139320437005e-06, - "loss": 2.0701, + "epoch": 0.67, + "grad_norm": 11.97895622253418, + "learning_rate": 1.5534451742459108e-05, + "loss": 0.579, "step": 5338 }, { - "epoch": 1.61, - "grad_norm": 25.756505966186523, - "learning_rate": 9.299388593765661e-06, - "loss": 1.6935, + "epoch": 0.67, + "grad_norm": 9.473199844360352, + "learning_rate": 1.553361502740242e-05, + "loss": 1.7777, "step": 5339 }, { - "epoch": 1.61, - "grad_norm": 12.220959663391113, - "learning_rate": 9.297383983161273e-06, - "loss": 2.3572, + "epoch": 0.67, + "grad_norm": 15.786806106567383, + "learning_rate": 1.553277831234573e-05, + "loss": 1.3267, "step": 5340 }, { - "epoch": 1.61, - "grad_norm": 28.716758728027344, - "learning_rate": 9.295379372556883e-06, - "loss": 1.8092, + "epoch": 0.67, + "grad_norm": 15.628312110900879, + "learning_rate": 1.5531941597289045e-05, + "loss": 1.7803, "step": 5341 }, { - "epoch": 1.61, - "grad_norm": 17.98668670654297, - "learning_rate": 9.293374761952491e-06, - "loss": 1.61, + "epoch": 0.67, + "grad_norm": 7.371047019958496, + "learning_rate": 1.553110488223236e-05, + "loss": 0.9534, "step": 5342 }, { - "epoch": 1.61, - "grad_norm": 15.28249454498291, - "learning_rate": 9.291370151348101e-06, - "loss": 2.0471, + "epoch": 0.67, + "grad_norm": 10.50656795501709, + "learning_rate": 1.553026816717567e-05, + "loss": 1.1232, "step": 5343 }, { - "epoch": 1.61, - "grad_norm": 10.611080169677734, - "learning_rate": 9.289365540743711e-06, - "loss": 0.9532, + "epoch": 0.67, + "grad_norm": 14.262672424316406, + "learning_rate": 1.5529431452118983e-05, + "loss": 0.9357, "step": 5344 }, { - "epoch": 1.61, - "grad_norm": 18.82200813293457, - "learning_rate": 9.287360930139321e-06, - "loss": 1.315, + "epoch": 0.67, + "grad_norm": 16.33648681640625, + "learning_rate": 1.5528594737062293e-05, + "loss": 2.7881, "step": 5345 }, { - "epoch": 1.61, - "grad_norm": 19.420570373535156, - "learning_rate": 9.28535631953493e-06, - "loss": 1.0041, + "epoch": 0.67, + "grad_norm": 18.935623168945312, + "learning_rate": 1.5527758022005607e-05, + "loss": 0.9888, "step": 5346 }, { - "epoch": 1.61, - "grad_norm": 16.25090980529785, - "learning_rate": 9.283351708930542e-06, - "loss": 2.2406, + "epoch": 0.67, + "grad_norm": 10.97852611541748, + "learning_rate": 1.552692130694892e-05, + "loss": 1.9534, "step": 5347 }, { - "epoch": 1.61, - "grad_norm": 36.150394439697266, - "learning_rate": 9.281347098326152e-06, - "loss": 2.0748, + "epoch": 0.67, + "grad_norm": 6.660601615905762, + "learning_rate": 1.552608459189223e-05, + "loss": 2.1308, "step": 5348 }, { - "epoch": 1.61, - "grad_norm": 69.37711334228516, - "learning_rate": 9.27934248772176e-06, - "loss": 2.1891, + "epoch": 0.67, + "grad_norm": 13.707566261291504, + "learning_rate": 1.5525247876835544e-05, + "loss": 0.9985, "step": 5349 }, { - "epoch": 1.61, - "grad_norm": 9.345280647277832, - "learning_rate": 9.27733787711737e-06, - "loss": 1.2469, + "epoch": 0.67, + "grad_norm": 8.461606979370117, + "learning_rate": 1.5524411161778858e-05, + "loss": 1.68, "step": 5350 }, { - "epoch": 1.61, - "grad_norm": 51.229942321777344, - "learning_rate": 9.27533326651298e-06, - "loss": 2.5097, + "epoch": 0.67, + "grad_norm": 8.554165840148926, + "learning_rate": 1.5523574446722168e-05, + "loss": 1.0881, "step": 5351 }, { - "epoch": 1.61, - "grad_norm": 33.678749084472656, - "learning_rate": 9.27332865590859e-06, - "loss": 2.5323, + "epoch": 0.67, + "grad_norm": 25.228971481323242, + "learning_rate": 1.5522737731665482e-05, + "loss": 2.156, "step": 5352 }, { - "epoch": 1.61, - "grad_norm": 20.67754364013672, - "learning_rate": 9.2713240453042e-06, - "loss": 3.1563, + "epoch": 0.67, + "grad_norm": 5.438741207122803, + "learning_rate": 1.5521901016608795e-05, + "loss": 2.7531, "step": 5353 }, { - "epoch": 1.61, - "grad_norm": 19.53748321533203, - "learning_rate": 9.26931943469981e-06, - "loss": 1.1978, + "epoch": 0.67, + "grad_norm": 20.67116928100586, + "learning_rate": 1.5521064301552106e-05, + "loss": 1.821, "step": 5354 }, { - "epoch": 1.61, - "grad_norm": 16.690845489501953, - "learning_rate": 9.26731482409542e-06, - "loss": 1.1068, + "epoch": 0.67, + "grad_norm": 8.502362251281738, + "learning_rate": 1.552022758649542e-05, + "loss": 0.7838, "step": 5355 }, { - "epoch": 1.61, - "grad_norm": 19.17826271057129, - "learning_rate": 9.26531021349103e-06, - "loss": 1.8296, + "epoch": 0.67, + "grad_norm": 24.416032791137695, + "learning_rate": 1.5519390871438733e-05, + "loss": 3.4488, "step": 5356 }, { - "epoch": 1.61, - "grad_norm": 15.507776260375977, - "learning_rate": 9.263305602886639e-06, - "loss": 1.6507, + "epoch": 0.67, + "grad_norm": 17.0931396484375, + "learning_rate": 1.5518554156382047e-05, + "loss": 1.8967, "step": 5357 }, { - "epoch": 1.61, - "grad_norm": 129.35165405273438, - "learning_rate": 9.261300992282249e-06, - "loss": 1.632, + "epoch": 0.67, + "grad_norm": 20.132095336914062, + "learning_rate": 1.5517717441325357e-05, + "loss": 0.9988, "step": 5358 }, { - "epoch": 1.61, - "grad_norm": 9.657642364501953, - "learning_rate": 9.259296381677861e-06, - "loss": 1.315, + "epoch": 0.67, + "grad_norm": 19.004472732543945, + "learning_rate": 1.551688072626867e-05, + "loss": 1.2706, "step": 5359 }, { - "epoch": 1.61, - "grad_norm": 18.087684631347656, - "learning_rate": 9.25729177107347e-06, - "loss": 1.2448, + "epoch": 0.67, + "grad_norm": 14.730697631835938, + "learning_rate": 1.5516044011211984e-05, + "loss": 2.3345, "step": 5360 }, { - "epoch": 1.61, - "grad_norm": 16.07610511779785, - "learning_rate": 9.25528716046908e-06, - "loss": 1.2188, + "epoch": 0.67, + "grad_norm": 14.693830490112305, + "learning_rate": 1.5515207296155295e-05, + "loss": 1.0428, "step": 5361 }, { - "epoch": 1.61, - "grad_norm": 14.000444412231445, - "learning_rate": 9.25328254986469e-06, - "loss": 1.9099, + "epoch": 0.67, + "grad_norm": 20.921735763549805, + "learning_rate": 1.5514370581098608e-05, + "loss": 3.0207, "step": 5362 }, { - "epoch": 1.61, - "grad_norm": 22.69326400756836, - "learning_rate": 9.2512779392603e-06, - "loss": 2.2418, + "epoch": 0.67, + "grad_norm": 10.459497451782227, + "learning_rate": 1.5513533866041922e-05, + "loss": 1.7045, "step": 5363 }, { - "epoch": 1.61, - "grad_norm": 41.782066345214844, - "learning_rate": 9.24927332865591e-06, - "loss": 1.8902, + "epoch": 0.67, + "grad_norm": 26.556926727294922, + "learning_rate": 1.5512697150985235e-05, + "loss": 0.7821, "step": 5364 }, { - "epoch": 1.61, - "grad_norm": 9.715392112731934, - "learning_rate": 9.24726871805152e-06, - "loss": 1.1494, + "epoch": 0.67, + "grad_norm": 54.429847717285156, + "learning_rate": 1.5511860435928546e-05, + "loss": 2.2101, "step": 5365 }, { - "epoch": 1.61, - "grad_norm": 13.382871627807617, - "learning_rate": 9.24526410744713e-06, - "loss": 1.6138, + "epoch": 0.67, + "grad_norm": 7.08843994140625, + "learning_rate": 1.551102372087186e-05, + "loss": 1.3268, "step": 5366 }, { - "epoch": 1.61, - "grad_norm": 33.94945526123047, - "learning_rate": 9.24325949684274e-06, - "loss": 1.562, + "epoch": 0.67, + "grad_norm": 11.765007972717285, + "learning_rate": 1.5510187005815173e-05, + "loss": 1.4397, "step": 5367 }, { - "epoch": 1.61, - "grad_norm": 24.688888549804688, - "learning_rate": 9.241254886238348e-06, - "loss": 1.3487, + "epoch": 0.67, + "grad_norm": 15.495498657226562, + "learning_rate": 1.5509350290758483e-05, + "loss": 1.4547, "step": 5368 }, { - "epoch": 1.61, - "grad_norm": 19.392419815063477, - "learning_rate": 9.239250275633958e-06, - "loss": 1.7019, + "epoch": 0.67, + "grad_norm": 6.075366497039795, + "learning_rate": 1.5508513575701797e-05, + "loss": 2.4579, "step": 5369 }, { - "epoch": 1.61, - "grad_norm": 9.527002334594727, - "learning_rate": 9.237245665029568e-06, - "loss": 1.6555, + "epoch": 0.67, + "grad_norm": 10.94253921508789, + "learning_rate": 1.550767686064511e-05, + "loss": 0.4965, "step": 5370 }, { - "epoch": 1.61, - "grad_norm": 11.718170166015625, - "learning_rate": 9.235241054425179e-06, - "loss": 1.143, + "epoch": 0.67, + "grad_norm": 19.25712013244629, + "learning_rate": 1.550684014558842e-05, + "loss": 1.6914, "step": 5371 }, { - "epoch": 1.62, - "grad_norm": 14.71898078918457, - "learning_rate": 9.233236443820789e-06, - "loss": 1.3483, + "epoch": 0.67, + "grad_norm": 26.579479217529297, + "learning_rate": 1.5506003430531734e-05, + "loss": 2.3986, "step": 5372 }, { - "epoch": 1.62, - "grad_norm": 12.641324996948242, - "learning_rate": 9.231231833216399e-06, - "loss": 1.3701, + "epoch": 0.67, + "grad_norm": 16.759775161743164, + "learning_rate": 1.5505166715475045e-05, + "loss": 0.5726, "step": 5373 }, { - "epoch": 1.62, - "grad_norm": 26.643348693847656, - "learning_rate": 9.229227222612009e-06, - "loss": 1.8719, + "epoch": 0.67, + "grad_norm": 8.281192779541016, + "learning_rate": 1.550433000041836e-05, + "loss": 0.3718, "step": 5374 }, { - "epoch": 1.62, - "grad_norm": 11.661134719848633, - "learning_rate": 9.227222612007619e-06, - "loss": 1.5889, + "epoch": 0.67, + "grad_norm": 9.185592651367188, + "learning_rate": 1.5503493285361672e-05, + "loss": 0.5656, "step": 5375 }, { - "epoch": 1.62, - "grad_norm": 14.931244850158691, - "learning_rate": 9.225218001403227e-06, - "loss": 1.7465, + "epoch": 0.67, + "grad_norm": 7.616875648498535, + "learning_rate": 1.5502656570304982e-05, + "loss": 2.2267, "step": 5376 }, { - "epoch": 1.62, - "grad_norm": 11.898720741271973, - "learning_rate": 9.223213390798839e-06, - "loss": 1.6241, + "epoch": 0.67, + "grad_norm": 8.69092082977295, + "learning_rate": 1.5501819855248296e-05, + "loss": 1.4501, "step": 5377 }, { - "epoch": 1.62, - "grad_norm": 16.589155197143555, - "learning_rate": 9.221208780194449e-06, - "loss": 1.4509, + "epoch": 0.67, + "grad_norm": 42.72675704956055, + "learning_rate": 1.550098314019161e-05, + "loss": 4.0754, "step": 5378 }, { - "epoch": 1.62, - "grad_norm": 23.514259338378906, - "learning_rate": 9.219204169590058e-06, - "loss": 2.5493, + "epoch": 0.68, + "grad_norm": 21.543594360351562, + "learning_rate": 1.550014642513492e-05, + "loss": 2.0531, "step": 5379 }, { - "epoch": 1.62, - "grad_norm": 17.52058982849121, - "learning_rate": 9.217199558985668e-06, - "loss": 1.2928, + "epoch": 0.68, + "grad_norm": 10.249034881591797, + "learning_rate": 1.5499309710078234e-05, + "loss": 2.3686, "step": 5380 }, { - "epoch": 1.62, - "grad_norm": 28.47098159790039, - "learning_rate": 9.215194948381278e-06, - "loss": 1.2744, + "epoch": 0.68, + "grad_norm": 5.479506969451904, + "learning_rate": 1.5498472995021547e-05, + "loss": 0.1317, "step": 5381 }, { - "epoch": 1.62, - "grad_norm": 15.93428897857666, - "learning_rate": 9.213190337776888e-06, - "loss": 1.5216, + "epoch": 0.68, + "grad_norm": 5.757992267608643, + "learning_rate": 1.5497636279964857e-05, + "loss": 0.3811, "step": 5382 }, { - "epoch": 1.62, - "grad_norm": 73.04627227783203, - "learning_rate": 9.211185727172498e-06, - "loss": 0.9938, + "epoch": 0.68, + "grad_norm": 13.579237937927246, + "learning_rate": 1.549679956490817e-05, + "loss": 1.3426, "step": 5383 }, { - "epoch": 1.62, - "grad_norm": 26.94357681274414, - "learning_rate": 9.209181116568108e-06, - "loss": 1.0955, + "epoch": 0.68, + "grad_norm": 10.298381805419922, + "learning_rate": 1.5495962849851485e-05, + "loss": 0.8953, "step": 5384 }, { - "epoch": 1.62, - "grad_norm": 15.998095512390137, - "learning_rate": 9.207176505963718e-06, - "loss": 1.2675, + "epoch": 0.68, + "grad_norm": 8.469018936157227, + "learning_rate": 1.54951261347948e-05, + "loss": 1.9812, "step": 5385 }, { - "epoch": 1.62, - "grad_norm": 6.676560401916504, - "learning_rate": 9.205171895359326e-06, - "loss": 0.8448, + "epoch": 0.68, + "grad_norm": 8.103954315185547, + "learning_rate": 1.549428941973811e-05, + "loss": 1.631, "step": 5386 }, { - "epoch": 1.62, - "grad_norm": 17.63335609436035, - "learning_rate": 9.203167284754936e-06, - "loss": 1.771, + "epoch": 0.68, + "grad_norm": 17.763341903686523, + "learning_rate": 1.5493452704681422e-05, + "loss": 3.0545, "step": 5387 }, { - "epoch": 1.62, - "grad_norm": 15.837037086486816, - "learning_rate": 9.201162674150547e-06, - "loss": 1.1493, + "epoch": 0.68, + "grad_norm": 9.169461250305176, + "learning_rate": 1.5492615989624736e-05, + "loss": 0.6142, "step": 5388 }, { - "epoch": 1.62, - "grad_norm": 15.462651252746582, - "learning_rate": 9.199158063546157e-06, - "loss": 1.4158, + "epoch": 0.68, + "grad_norm": 12.485591888427734, + "learning_rate": 1.5491779274568046e-05, + "loss": 3.1047, "step": 5389 }, { - "epoch": 1.62, - "grad_norm": 14.927474975585938, - "learning_rate": 9.197153452941767e-06, - "loss": 2.7956, + "epoch": 0.68, + "grad_norm": 49.102455139160156, + "learning_rate": 1.549094255951136e-05, + "loss": 1.9242, "step": 5390 }, { - "epoch": 1.62, - "grad_norm": 25.812740325927734, - "learning_rate": 9.195148842337377e-06, - "loss": 2.2199, + "epoch": 0.68, + "grad_norm": 9.454066276550293, + "learning_rate": 1.5490105844454673e-05, + "loss": 1.6279, "step": 5391 }, { - "epoch": 1.62, - "grad_norm": 17.734699249267578, - "learning_rate": 9.193144231732987e-06, - "loss": 2.1131, + "epoch": 0.68, + "grad_norm": 13.472455024719238, + "learning_rate": 1.5489269129397987e-05, + "loss": 1.9243, "step": 5392 }, { - "epoch": 1.62, - "grad_norm": 15.593416213989258, - "learning_rate": 9.191139621128597e-06, - "loss": 1.3466, + "epoch": 0.68, + "grad_norm": 20.5775203704834, + "learning_rate": 1.5488432414341297e-05, + "loss": 1.5563, "step": 5393 }, { - "epoch": 1.62, - "grad_norm": 14.880626678466797, - "learning_rate": 9.189135010524205e-06, - "loss": 1.4476, + "epoch": 0.68, + "grad_norm": 12.55504035949707, + "learning_rate": 1.548759569928461e-05, + "loss": 0.8326, "step": 5394 }, { - "epoch": 1.62, - "grad_norm": 17.891508102416992, - "learning_rate": 9.187130399919815e-06, - "loss": 1.5905, + "epoch": 0.68, + "grad_norm": 31.129262924194336, + "learning_rate": 1.5486758984227925e-05, + "loss": 1.5193, "step": 5395 }, { - "epoch": 1.62, - "grad_norm": 20.343963623046875, - "learning_rate": 9.185125789315427e-06, - "loss": 2.0189, + "epoch": 0.68, + "grad_norm": 14.942036628723145, + "learning_rate": 1.5485922269171235e-05, + "loss": 2.8091, "step": 5396 }, { - "epoch": 1.62, - "grad_norm": 19.60910415649414, - "learning_rate": 9.183121178711036e-06, - "loss": 2.392, + "epoch": 0.68, + "grad_norm": 7.471428394317627, + "learning_rate": 1.548508555411455e-05, + "loss": 0.8795, "step": 5397 }, { - "epoch": 1.62, - "grad_norm": 19.910167694091797, - "learning_rate": 9.181116568106646e-06, - "loss": 1.713, + "epoch": 0.68, + "grad_norm": 22.331401824951172, + "learning_rate": 1.548424883905786e-05, + "loss": 1.7552, "step": 5398 }, { - "epoch": 1.62, - "grad_norm": 13.35189151763916, - "learning_rate": 9.179111957502256e-06, - "loss": 1.2514, + "epoch": 0.68, + "grad_norm": 4.722949028015137, + "learning_rate": 1.5483412124001173e-05, + "loss": 0.1993, "step": 5399 }, { - "epoch": 1.62, - "grad_norm": 27.091657638549805, - "learning_rate": 9.177107346897866e-06, - "loss": 1.4003, - "step": 5400 - }, - { - "epoch": 1.62, - "eval_loss": 0.18608930706977844, - "eval_runtime": 43.4962, - "eval_samples_per_second": 34.003, - "eval_steps_per_second": 34.003, + "epoch": 0.68, + "grad_norm": 14.18830680847168, + "learning_rate": 1.5482575408944486e-05, + "loss": 0.393, "step": 5400 }, { - "epoch": 1.62, - "grad_norm": 38.99711608886719, - "learning_rate": 9.175102736293476e-06, - "loss": 2.452, + "epoch": 0.68, + "grad_norm": 70.21976470947266, + "learning_rate": 1.5481738693887796e-05, + "loss": 3.0059, "step": 5401 }, { - "epoch": 1.62, - "grad_norm": 17.14046859741211, - "learning_rate": 9.173098125689086e-06, - "loss": 1.0333, + "epoch": 0.68, + "grad_norm": 46.213504791259766, + "learning_rate": 1.548090197883111e-05, + "loss": 1.9276, "step": 5402 }, { - "epoch": 1.62, - "grad_norm": 14.545239448547363, - "learning_rate": 9.171093515084696e-06, - "loss": 1.1729, + "epoch": 0.68, + "grad_norm": 12.647377014160156, + "learning_rate": 1.548006526377442e-05, + "loss": 2.4336, "step": 5403 }, { - "epoch": 1.62, - "grad_norm": 21.479055404663086, - "learning_rate": 9.169088904480306e-06, - "loss": 2.0552, + "epoch": 0.68, + "grad_norm": 12.181181907653809, + "learning_rate": 1.5479228548717734e-05, + "loss": 0.9303, "step": 5404 }, { - "epoch": 1.63, - "grad_norm": 52.615394592285156, - "learning_rate": 9.167084293875915e-06, - "loss": 1.6672, + "epoch": 0.68, + "grad_norm": 19.40496063232422, + "learning_rate": 1.5478391833661048e-05, + "loss": 1.0818, "step": 5405 }, { - "epoch": 1.63, - "grad_norm": 12.193754196166992, - "learning_rate": 9.165079683271525e-06, - "loss": 1.3845, + "epoch": 0.68, + "grad_norm": 15.307136535644531, + "learning_rate": 1.547755511860436e-05, + "loss": 1.7573, "step": 5406 }, { - "epoch": 1.63, - "grad_norm": 10.990750312805176, - "learning_rate": 9.163075072667135e-06, - "loss": 1.3259, + "epoch": 0.68, + "grad_norm": 17.155710220336914, + "learning_rate": 1.547671840354767e-05, + "loss": 2.1602, "step": 5407 }, { - "epoch": 1.63, - "grad_norm": 14.663102149963379, - "learning_rate": 9.161070462062745e-06, - "loss": 1.432, + "epoch": 0.68, + "grad_norm": 25.457571029663086, + "learning_rate": 1.5475881688490985e-05, + "loss": 1.6301, "step": 5408 }, { - "epoch": 1.63, - "grad_norm": 8.971065521240234, - "learning_rate": 9.159065851458355e-06, - "loss": 0.9139, + "epoch": 0.68, + "grad_norm": 24.962495803833008, + "learning_rate": 1.54750449734343e-05, + "loss": 2.5019, "step": 5409 }, { - "epoch": 1.63, - "grad_norm": 12.251411437988281, - "learning_rate": 9.157061240853965e-06, - "loss": 1.2786, + "epoch": 0.68, + "grad_norm": 11.651535034179688, + "learning_rate": 1.547420825837761e-05, + "loss": 0.8768, "step": 5410 }, { - "epoch": 1.63, - "grad_norm": 10.386260032653809, - "learning_rate": 9.155056630249575e-06, - "loss": 1.8492, + "epoch": 0.68, + "grad_norm": 17.485122680664062, + "learning_rate": 1.5473371543320923e-05, + "loss": 3.7041, "step": 5411 }, { - "epoch": 1.63, - "grad_norm": 25.967205047607422, - "learning_rate": 9.153052019645185e-06, - "loss": 1.8648, + "epoch": 0.68, + "grad_norm": 14.24410343170166, + "learning_rate": 1.5472534828264236e-05, + "loss": 0.8224, "step": 5412 }, { - "epoch": 1.63, - "grad_norm": 15.101587295532227, - "learning_rate": 9.151047409040794e-06, - "loss": 1.3361, + "epoch": 0.68, + "grad_norm": 21.170881271362305, + "learning_rate": 1.547169811320755e-05, + "loss": 3.0605, "step": 5413 }, { - "epoch": 1.63, - "grad_norm": 21.25626564025879, - "learning_rate": 9.149042798436405e-06, - "loss": 1.624, + "epoch": 0.68, + "grad_norm": 128.70204162597656, + "learning_rate": 1.547086139815086e-05, + "loss": 2.0236, "step": 5414 }, { - "epoch": 1.63, - "grad_norm": 12.912954330444336, - "learning_rate": 9.147038187832015e-06, - "loss": 1.2722, + "epoch": 0.68, + "grad_norm": 6.5688323974609375, + "learning_rate": 1.5470024683094174e-05, + "loss": 1.5321, "step": 5415 }, { - "epoch": 1.63, - "grad_norm": 19.924049377441406, - "learning_rate": 9.145033577227624e-06, - "loss": 1.8115, + "epoch": 0.68, + "grad_norm": 27.743898391723633, + "learning_rate": 1.5469187968037488e-05, + "loss": 3.4685, "step": 5416 }, { - "epoch": 1.63, - "grad_norm": 10.341268539428711, - "learning_rate": 9.143028966623234e-06, - "loss": 1.1745, + "epoch": 0.68, + "grad_norm": 11.474124908447266, + "learning_rate": 1.5468351252980798e-05, + "loss": 1.1201, "step": 5417 }, { - "epoch": 1.63, - "grad_norm": 19.03986167907715, - "learning_rate": 9.141024356018844e-06, - "loss": 1.8332, + "epoch": 0.68, + "grad_norm": 10.405303001403809, + "learning_rate": 1.546751453792411e-05, + "loss": 1.8472, "step": 5418 }, { - "epoch": 1.63, - "grad_norm": 10.237374305725098, - "learning_rate": 9.139019745414454e-06, - "loss": 1.1372, + "epoch": 0.68, + "grad_norm": 5.412685394287109, + "learning_rate": 1.5466677822867425e-05, + "loss": 0.5036, "step": 5419 }, { - "epoch": 1.63, - "grad_norm": 12.311965942382812, - "learning_rate": 9.137015134810064e-06, - "loss": 1.3524, + "epoch": 0.68, + "grad_norm": 8.794434547424316, + "learning_rate": 1.546584110781074e-05, + "loss": 0.9806, "step": 5420 }, { - "epoch": 1.63, - "grad_norm": 17.118541717529297, - "learning_rate": 9.135010524205674e-06, - "loss": 1.4631, + "epoch": 0.68, + "grad_norm": 8.287175178527832, + "learning_rate": 1.546500439275405e-05, + "loss": 0.843, "step": 5421 }, { - "epoch": 1.63, - "grad_norm": 13.466850280761719, - "learning_rate": 9.133005913601284e-06, - "loss": 2.0476, + "epoch": 0.68, + "grad_norm": 17.346330642700195, + "learning_rate": 1.5464167677697363e-05, + "loss": 1.9287, "step": 5422 }, { - "epoch": 1.63, - "grad_norm": 22.777727127075195, - "learning_rate": 9.131001302996893e-06, - "loss": 1.6859, + "epoch": 0.68, + "grad_norm": 14.11860466003418, + "learning_rate": 1.5463330962640676e-05, + "loss": 2.1149, "step": 5423 }, { - "epoch": 1.63, - "grad_norm": 24.398303985595703, - "learning_rate": 9.128996692392503e-06, - "loss": 2.0839, + "epoch": 0.68, + "grad_norm": 11.32366943359375, + "learning_rate": 1.5462494247583987e-05, + "loss": 2.592, "step": 5424 }, { - "epoch": 1.63, - "grad_norm": 22.156652450561523, - "learning_rate": 9.126992081788113e-06, - "loss": 1.3133, + "epoch": 0.68, + "grad_norm": 19.463321685791016, + "learning_rate": 1.54616575325273e-05, + "loss": 2.1927, "step": 5425 }, { - "epoch": 1.63, - "grad_norm": 10.810490608215332, - "learning_rate": 9.124987471183725e-06, - "loss": 0.9388, + "epoch": 0.68, + "grad_norm": 11.99179458618164, + "learning_rate": 1.546082081747061e-05, + "loss": 1.5235, "step": 5426 }, { - "epoch": 1.63, - "grad_norm": 17.22280502319336, - "learning_rate": 9.122982860579333e-06, - "loss": 2.081, + "epoch": 0.68, + "grad_norm": 10.898519515991211, + "learning_rate": 1.5459984102413924e-05, + "loss": 1.3556, "step": 5427 }, { - "epoch": 1.63, - "grad_norm": 11.593786239624023, - "learning_rate": 9.120978249974943e-06, - "loss": 1.6318, + "epoch": 0.68, + "grad_norm": 52.73088455200195, + "learning_rate": 1.5459147387357238e-05, + "loss": 2.1083, "step": 5428 }, { - "epoch": 1.63, - "grad_norm": 111.95779418945312, - "learning_rate": 9.118973639370553e-06, - "loss": 2.2157, + "epoch": 0.68, + "grad_norm": 36.21796798706055, + "learning_rate": 1.5458310672300548e-05, + "loss": 2.343, "step": 5429 }, { - "epoch": 1.63, - "grad_norm": 10.570384979248047, - "learning_rate": 9.116969028766163e-06, - "loss": 1.8122, + "epoch": 0.68, + "grad_norm": 22.104034423828125, + "learning_rate": 1.5457473957243862e-05, + "loss": 1.1986, "step": 5430 }, { - "epoch": 1.63, - "grad_norm": 47.375701904296875, - "learning_rate": 9.114964418161772e-06, - "loss": 2.1275, + "epoch": 0.68, + "grad_norm": 14.936227798461914, + "learning_rate": 1.5456637242187172e-05, + "loss": 2.8078, "step": 5431 }, { - "epoch": 1.63, - "grad_norm": 15.159667015075684, - "learning_rate": 9.112959807557382e-06, - "loss": 2.2646, + "epoch": 0.68, + "grad_norm": 21.90635108947754, + "learning_rate": 1.5455800527130486e-05, + "loss": 2.5358, "step": 5432 }, { - "epoch": 1.63, - "grad_norm": 10.478726387023926, - "learning_rate": 9.110955196952994e-06, - "loss": 1.4377, + "epoch": 0.68, + "grad_norm": 25.158336639404297, + "learning_rate": 1.54549638120738e-05, + "loss": 1.835, "step": 5433 }, { - "epoch": 1.63, - "grad_norm": 27.897777557373047, - "learning_rate": 9.108950586348602e-06, - "loss": 2.1688, + "epoch": 0.68, + "grad_norm": 9.340642929077148, + "learning_rate": 1.5454127097017113e-05, + "loss": 1.394, "step": 5434 }, { - "epoch": 1.63, - "grad_norm": 26.705507278442383, - "learning_rate": 9.106945975744212e-06, - "loss": 1.7822, + "epoch": 0.68, + "grad_norm": 13.902207374572754, + "learning_rate": 1.5453290381960423e-05, + "loss": 0.6865, "step": 5435 }, { - "epoch": 1.63, - "grad_norm": 23.994443893432617, - "learning_rate": 9.104941365139822e-06, - "loss": 2.0833, + "epoch": 0.68, + "grad_norm": 10.27506160736084, + "learning_rate": 1.5452453666903737e-05, + "loss": 0.6639, "step": 5436 }, { - "epoch": 1.63, - "grad_norm": 66.67716217041016, - "learning_rate": 9.102936754535432e-06, - "loss": 1.3532, + "epoch": 0.68, + "grad_norm": 13.183588027954102, + "learning_rate": 1.545161695184705e-05, + "loss": 1.5091, "step": 5437 }, { - "epoch": 1.63, - "grad_norm": 30.51035499572754, - "learning_rate": 9.100932143931042e-06, - "loss": 2.0045, + "epoch": 0.68, + "grad_norm": 31.602766036987305, + "learning_rate": 1.545078023679036e-05, + "loss": 2.0934, "step": 5438 }, { - "epoch": 1.64, - "grad_norm": 10.716803550720215, - "learning_rate": 9.098927533326652e-06, - "loss": 1.6747, + "epoch": 0.68, + "grad_norm": 3.6912848949432373, + "learning_rate": 1.5449943521733674e-05, + "loss": 0.0864, "step": 5439 }, { - "epoch": 1.64, - "grad_norm": 13.023651123046875, - "learning_rate": 9.096922922722262e-06, - "loss": 0.7083, + "epoch": 0.68, + "grad_norm": 22.57712173461914, + "learning_rate": 1.5449106806676988e-05, + "loss": 1.5905, "step": 5440 }, { - "epoch": 1.64, - "grad_norm": 25.49052619934082, - "learning_rate": 9.094918312117873e-06, - "loss": 1.0234, + "epoch": 0.68, + "grad_norm": 21.62580108642578, + "learning_rate": 1.5448270091620302e-05, + "loss": 1.5551, "step": 5441 }, { - "epoch": 1.64, - "grad_norm": 33.57606887817383, - "learning_rate": 9.092913701513481e-06, - "loss": 2.2643, + "epoch": 0.68, + "grad_norm": 18.682220458984375, + "learning_rate": 1.5447433376563612e-05, + "loss": 2.4257, "step": 5442 }, { - "epoch": 1.64, - "grad_norm": 11.915518760681152, - "learning_rate": 9.090909090909091e-06, - "loss": 1.0967, + "epoch": 0.68, + "grad_norm": 15.763568878173828, + "learning_rate": 1.5446596661506926e-05, + "loss": 0.9516, "step": 5443 }, { - "epoch": 1.64, - "grad_norm": 15.158368110656738, - "learning_rate": 9.088904480304701e-06, - "loss": 1.534, + "epoch": 0.68, + "grad_norm": 18.485803604125977, + "learning_rate": 1.544575994645024e-05, + "loss": 3.442, "step": 5444 }, { - "epoch": 1.64, - "grad_norm": 11.216906547546387, - "learning_rate": 9.086899869700311e-06, - "loss": 1.4261, + "epoch": 0.68, + "grad_norm": 11.463976860046387, + "learning_rate": 1.544492323139355e-05, + "loss": 1.0116, "step": 5445 }, { - "epoch": 1.64, - "grad_norm": 26.098888397216797, - "learning_rate": 9.084895259095921e-06, - "loss": 2.0426, + "epoch": 0.68, + "grad_norm": 12.108723640441895, + "learning_rate": 1.5444086516336863e-05, + "loss": 1.7329, "step": 5446 }, { - "epoch": 1.64, - "grad_norm": 13.751211166381836, - "learning_rate": 9.082890648491531e-06, - "loss": 1.5604, + "epoch": 0.68, + "grad_norm": 8.408315658569336, + "learning_rate": 1.5443249801280177e-05, + "loss": 1.3185, "step": 5447 }, { - "epoch": 1.64, - "grad_norm": 12.62765884399414, - "learning_rate": 9.080886037887141e-06, - "loss": 1.7308, + "epoch": 0.68, + "grad_norm": 8.830940246582031, + "learning_rate": 1.544241308622349e-05, + "loss": 1.1114, "step": 5448 }, { - "epoch": 1.64, - "grad_norm": 20.71200942993164, - "learning_rate": 9.078881427282752e-06, - "loss": 0.9927, + "epoch": 0.68, + "grad_norm": 9.236875534057617, + "learning_rate": 1.54415763711668e-05, + "loss": 0.9462, "step": 5449 }, { - "epoch": 1.64, - "grad_norm": 23.00042152404785, - "learning_rate": 9.07687681667836e-06, - "loss": 2.4304, + "epoch": 0.68, + "grad_norm": 19.101408004760742, + "learning_rate": 1.5440739656110114e-05, + "loss": 2.1817, "step": 5450 }, { - "epoch": 1.64, - "grad_norm": 28.70808982849121, - "learning_rate": 9.074872206073972e-06, - "loss": 1.5417, + "epoch": 0.68, + "grad_norm": 9.254010200500488, + "learning_rate": 1.5439902941053425e-05, + "loss": 0.8988, "step": 5451 }, { - "epoch": 1.64, - "grad_norm": 9.605061531066895, - "learning_rate": 9.072867595469582e-06, - "loss": 1.3111, + "epoch": 0.68, + "grad_norm": 7.230218887329102, + "learning_rate": 1.5439066225996738e-05, + "loss": 0.9233, "step": 5452 }, { - "epoch": 1.64, - "grad_norm": 7.0347490310668945, - "learning_rate": 9.07086298486519e-06, - "loss": 0.7902, + "epoch": 0.68, + "grad_norm": 15.310093879699707, + "learning_rate": 1.5438229510940052e-05, + "loss": 1.1652, "step": 5453 }, { - "epoch": 1.64, - "grad_norm": 7.641110420227051, - "learning_rate": 9.0688583742608e-06, - "loss": 0.5109, + "epoch": 0.68, + "grad_norm": 7.4407782554626465, + "learning_rate": 1.5437392795883362e-05, + "loss": 0.5707, "step": 5454 }, { - "epoch": 1.64, - "grad_norm": 39.103248596191406, - "learning_rate": 9.06685376365641e-06, - "loss": 1.863, + "epoch": 0.68, + "grad_norm": 16.615798950195312, + "learning_rate": 1.5436556080826676e-05, + "loss": 0.9137, "step": 5455 }, { - "epoch": 1.64, - "grad_norm": 10.603463172912598, - "learning_rate": 9.06484915305202e-06, - "loss": 1.5076, + "epoch": 0.68, + "grad_norm": 45.70905303955078, + "learning_rate": 1.5435719365769986e-05, + "loss": 1.9834, "step": 5456 }, { - "epoch": 1.64, - "grad_norm": 53.90522766113281, - "learning_rate": 9.06284454244763e-06, - "loss": 3.2397, + "epoch": 0.68, + "grad_norm": 11.287595748901367, + "learning_rate": 1.54348826507133e-05, + "loss": 1.7566, "step": 5457 }, { - "epoch": 1.64, - "grad_norm": 55.33717727661133, - "learning_rate": 9.06083993184324e-06, - "loss": 2.0653, + "epoch": 0.68, + "grad_norm": 29.066984176635742, + "learning_rate": 1.5434045935656613e-05, + "loss": 3.2238, "step": 5458 }, { - "epoch": 1.64, - "grad_norm": 18.05723762512207, - "learning_rate": 9.05883532123885e-06, - "loss": 1.8223, + "epoch": 0.69, + "grad_norm": 3.7322309017181396, + "learning_rate": 1.5433209220599924e-05, + "loss": 0.3103, "step": 5459 }, { - "epoch": 1.64, - "grad_norm": 45.47984313964844, - "learning_rate": 9.05683071063446e-06, - "loss": 1.3857, + "epoch": 0.69, + "grad_norm": 22.915382385253906, + "learning_rate": 1.5432372505543237e-05, + "loss": 1.743, "step": 5460 }, { - "epoch": 1.64, - "grad_norm": 64.19039916992188, - "learning_rate": 9.05482610003007e-06, - "loss": 1.9055, + "epoch": 0.69, + "grad_norm": 26.823711395263672, + "learning_rate": 1.543153579048655e-05, + "loss": 3.0431, "step": 5461 }, { - "epoch": 1.64, - "grad_norm": 16.403478622436523, - "learning_rate": 9.05282148942568e-06, - "loss": 1.2061, + "epoch": 0.69, + "grad_norm": 12.175504684448242, + "learning_rate": 1.5430699075429865e-05, + "loss": 0.8336, "step": 5462 }, { - "epoch": 1.64, - "grad_norm": 15.951025009155273, - "learning_rate": 9.050816878821291e-06, - "loss": 2.1111, + "epoch": 0.69, + "grad_norm": 11.215303421020508, + "learning_rate": 1.5429862360373175e-05, + "loss": 1.9734, "step": 5463 }, { - "epoch": 1.64, - "grad_norm": 32.44859313964844, - "learning_rate": 9.0488122682169e-06, - "loss": 1.3111, + "epoch": 0.69, + "grad_norm": 26.189634323120117, + "learning_rate": 1.542902564531649e-05, + "loss": 3.3735, "step": 5464 }, { - "epoch": 1.64, - "grad_norm": 22.92570686340332, - "learning_rate": 9.04680765761251e-06, - "loss": 1.4222, + "epoch": 0.69, + "grad_norm": 38.86238098144531, + "learning_rate": 1.5428188930259802e-05, + "loss": 1.7573, "step": 5465 }, { - "epoch": 1.64, - "grad_norm": 11.310336112976074, - "learning_rate": 9.04480304700812e-06, - "loss": 1.0347, + "epoch": 0.69, + "grad_norm": 8.166007041931152, + "learning_rate": 1.5427352215203112e-05, + "loss": 1.7582, "step": 5466 }, { - "epoch": 1.64, - "grad_norm": 48.00858688354492, - "learning_rate": 9.04279843640373e-06, - "loss": 1.5265, + "epoch": 0.69, + "grad_norm": 3.437197685241699, + "learning_rate": 1.5426515500146426e-05, + "loss": 0.0896, "step": 5467 }, { - "epoch": 1.64, - "grad_norm": 22.504119873046875, - "learning_rate": 9.040793825799338e-06, - "loss": 1.2273, + "epoch": 0.69, + "grad_norm": 6.311569690704346, + "learning_rate": 1.542567878508974e-05, + "loss": 1.5704, "step": 5468 }, { - "epoch": 1.64, - "grad_norm": 23.860111236572266, - "learning_rate": 9.03878921519495e-06, - "loss": 1.4192, + "epoch": 0.69, + "grad_norm": 9.943201065063477, + "learning_rate": 1.5424842070033053e-05, + "loss": 1.1738, "step": 5469 }, { - "epoch": 1.64, - "grad_norm": 11.303675651550293, - "learning_rate": 9.03678460459056e-06, - "loss": 1.0257, + "epoch": 0.69, + "grad_norm": 5.164342880249023, + "learning_rate": 1.5424005354976364e-05, + "loss": 0.8256, "step": 5470 }, { - "epoch": 1.64, - "grad_norm": 12.571292877197266, - "learning_rate": 9.034779993986168e-06, - "loss": 1.4753, + "epoch": 0.69, + "grad_norm": 26.206350326538086, + "learning_rate": 1.5423168639919677e-05, + "loss": 2.0694, "step": 5471 }, { - "epoch": 1.65, - "grad_norm": 45.509620666503906, - "learning_rate": 9.032775383381778e-06, - "loss": 2.5814, + "epoch": 0.69, + "grad_norm": 29.577566146850586, + "learning_rate": 1.542233192486299e-05, + "loss": 1.9165, "step": 5472 }, { - "epoch": 1.65, - "grad_norm": 11.296418190002441, - "learning_rate": 9.030770772777388e-06, - "loss": 0.8345, + "epoch": 0.69, + "grad_norm": 10.16978931427002, + "learning_rate": 1.54214952098063e-05, + "loss": 1.3248, "step": 5473 }, { - "epoch": 1.65, - "grad_norm": 19.737348556518555, - "learning_rate": 9.028766162172999e-06, - "loss": 2.0533, + "epoch": 0.69, + "grad_norm": 16.154563903808594, + "learning_rate": 1.5420658494749615e-05, + "loss": 0.6503, "step": 5474 }, { - "epoch": 1.65, - "grad_norm": 32.28429412841797, - "learning_rate": 9.026761551568609e-06, - "loss": 1.0527, + "epoch": 0.69, + "grad_norm": 18.64064598083496, + "learning_rate": 1.541982177969293e-05, + "loss": 2.6751, "step": 5475 }, { - "epoch": 1.65, - "grad_norm": 5.6402106285095215, - "learning_rate": 9.024756940964219e-06, - "loss": 0.7558, + "epoch": 0.69, + "grad_norm": 17.2528133392334, + "learning_rate": 1.541898506463624e-05, + "loss": 1.8936, "step": 5476 }, { - "epoch": 1.65, - "grad_norm": 11.547632217407227, - "learning_rate": 9.022752330359829e-06, - "loss": 1.089, + "epoch": 0.69, + "grad_norm": 17.381118774414062, + "learning_rate": 1.5418148349579552e-05, + "loss": 1.9031, "step": 5477 }, { - "epoch": 1.65, - "grad_norm": 7.933753967285156, - "learning_rate": 9.020747719755439e-06, - "loss": 1.1976, + "epoch": 0.69, + "grad_norm": 31.099803924560547, + "learning_rate": 1.5417311634522866e-05, + "loss": 2.3239, "step": 5478 }, { - "epoch": 1.65, - "grad_norm": 25.738367080688477, - "learning_rate": 9.018743109151047e-06, - "loss": 2.3856, + "epoch": 0.69, + "grad_norm": 11.734057426452637, + "learning_rate": 1.5416474919466176e-05, + "loss": 0.5819, "step": 5479 }, { - "epoch": 1.65, - "grad_norm": 14.972805976867676, - "learning_rate": 9.016738498546657e-06, - "loss": 2.7914, + "epoch": 0.69, + "grad_norm": 16.285499572753906, + "learning_rate": 1.541563820440949e-05, + "loss": 2.2578, "step": 5480 }, { - "epoch": 1.65, - "grad_norm": 23.992773056030273, - "learning_rate": 9.014733887942267e-06, - "loss": 2.568, + "epoch": 0.69, + "grad_norm": 6.641711711883545, + "learning_rate": 1.5414801489352804e-05, + "loss": 1.3813, "step": 5481 }, { - "epoch": 1.65, - "grad_norm": 12.318902015686035, - "learning_rate": 9.012729277337878e-06, - "loss": 1.2272, + "epoch": 0.69, + "grad_norm": 4.232131481170654, + "learning_rate": 1.5413964774296114e-05, + "loss": 0.6922, "step": 5482 }, { - "epoch": 1.65, - "grad_norm": 13.769071578979492, - "learning_rate": 9.010724666733488e-06, - "loss": 1.4336, + "epoch": 0.69, + "grad_norm": 14.072823524475098, + "learning_rate": 1.5413128059239428e-05, + "loss": 1.7136, "step": 5483 }, { - "epoch": 1.65, - "grad_norm": 29.345212936401367, - "learning_rate": 9.008720056129098e-06, - "loss": 1.8268, + "epoch": 0.69, + "grad_norm": 18.54960823059082, + "learning_rate": 1.5412291344182738e-05, + "loss": 2.7273, "step": 5484 }, { - "epoch": 1.65, - "grad_norm": 10.752758026123047, - "learning_rate": 9.006715445524708e-06, - "loss": 0.75, + "epoch": 0.69, + "grad_norm": 12.27863597869873, + "learning_rate": 1.541145462912605e-05, + "loss": 1.326, "step": 5485 }, { - "epoch": 1.65, - "grad_norm": 11.773067474365234, - "learning_rate": 9.004710834920318e-06, - "loss": 1.5268, + "epoch": 0.69, + "grad_norm": 19.024850845336914, + "learning_rate": 1.5410617914069365e-05, + "loss": 1.7985, "step": 5486 }, { - "epoch": 1.65, - "grad_norm": 20.230098724365234, - "learning_rate": 9.002706224315926e-06, - "loss": 1.2181, + "epoch": 0.69, + "grad_norm": 8.920555114746094, + "learning_rate": 1.5409781199012675e-05, + "loss": 0.5712, "step": 5487 }, { - "epoch": 1.65, - "grad_norm": 19.085262298583984, - "learning_rate": 9.000701613711538e-06, - "loss": 2.0583, + "epoch": 0.69, + "grad_norm": 12.335217475891113, + "learning_rate": 1.540894448395599e-05, + "loss": 1.1266, "step": 5488 }, { - "epoch": 1.65, - "grad_norm": 15.87958812713623, - "learning_rate": 8.998697003107148e-06, - "loss": 1.2279, + "epoch": 0.69, + "grad_norm": 6.6275739669799805, + "learning_rate": 1.5408107768899303e-05, + "loss": 0.9102, "step": 5489 }, { - "epoch": 1.65, - "grad_norm": 10.969903945922852, - "learning_rate": 8.996692392502757e-06, - "loss": 1.4018, + "epoch": 0.69, + "grad_norm": 9.099052429199219, + "learning_rate": 1.5407271053842616e-05, + "loss": 1.3513, "step": 5490 }, { - "epoch": 1.65, - "grad_norm": 16.4309024810791, - "learning_rate": 8.994687781898367e-06, - "loss": 1.7592, + "epoch": 0.69, + "grad_norm": 11.68352222442627, + "learning_rate": 1.5406434338785927e-05, + "loss": 2.1072, "step": 5491 }, { - "epoch": 1.65, - "grad_norm": 17.328758239746094, - "learning_rate": 8.992683171293977e-06, - "loss": 1.3829, + "epoch": 0.69, + "grad_norm": 6.421557903289795, + "learning_rate": 1.540559762372924e-05, + "loss": 0.6181, "step": 5492 }, { - "epoch": 1.65, - "grad_norm": 23.75958824157715, - "learning_rate": 8.990678560689587e-06, - "loss": 1.7726, + "epoch": 0.69, + "grad_norm": 13.317495346069336, + "learning_rate": 1.5404760908672554e-05, + "loss": 1.9661, "step": 5493 }, { - "epoch": 1.65, - "grad_norm": 7.305182933807373, - "learning_rate": 8.988673950085197e-06, - "loss": 0.9196, + "epoch": 0.69, + "grad_norm": 5.760676383972168, + "learning_rate": 1.5403924193615864e-05, + "loss": 0.1942, "step": 5494 }, { - "epoch": 1.65, - "grad_norm": 23.182376861572266, - "learning_rate": 8.986669339480807e-06, - "loss": 2.1978, + "epoch": 0.69, + "grad_norm": 21.07034683227539, + "learning_rate": 1.5403087478559178e-05, + "loss": 2.1126, "step": 5495 }, { - "epoch": 1.65, - "grad_norm": 16.009490966796875, - "learning_rate": 8.984664728876417e-06, - "loss": 1.3805, + "epoch": 0.69, + "grad_norm": 15.3811674118042, + "learning_rate": 1.540225076350249e-05, + "loss": 1.292, "step": 5496 }, { - "epoch": 1.65, - "grad_norm": 25.598642349243164, - "learning_rate": 8.982660118272027e-06, - "loss": 1.7893, + "epoch": 0.69, + "grad_norm": 13.275308609008789, + "learning_rate": 1.5401414048445805e-05, + "loss": 1.9513, "step": 5497 }, { - "epoch": 1.65, - "grad_norm": 14.513004302978516, - "learning_rate": 8.980655507667636e-06, - "loss": 1.0957, + "epoch": 0.69, + "grad_norm": 14.056856155395508, + "learning_rate": 1.5400577333389115e-05, + "loss": 1.4046, "step": 5498 }, { - "epoch": 1.65, - "grad_norm": 33.75980758666992, - "learning_rate": 8.978650897063246e-06, - "loss": 1.9849, + "epoch": 0.69, + "grad_norm": 4.988726615905762, + "learning_rate": 1.539974061833243e-05, + "loss": 0.4338, "step": 5499 }, { - "epoch": 1.65, - "grad_norm": 17.729082107543945, - "learning_rate": 8.976646286458857e-06, - "loss": 1.2108, + "epoch": 0.69, + "grad_norm": 36.79734420776367, + "learning_rate": 1.5398903903275743e-05, + "loss": 2.014, "step": 5500 }, { - "epoch": 1.65, - "grad_norm": 15.00300121307373, - "learning_rate": 8.974641675854466e-06, - "loss": 1.9234, + "epoch": 0.69, + "grad_norm": 6.316336631774902, + "learning_rate": 1.5398067188219053e-05, + "loss": 0.7961, "step": 5501 }, { - "epoch": 1.65, - "grad_norm": 37.787193298339844, - "learning_rate": 8.972637065250076e-06, - "loss": 1.5762, + "epoch": 0.69, + "grad_norm": 11.95064926147461, + "learning_rate": 1.5397230473162367e-05, + "loss": 2.1392, "step": 5502 }, { - "epoch": 1.65, - "grad_norm": 18.156917572021484, - "learning_rate": 8.970632454645686e-06, - "loss": 1.2841, + "epoch": 0.69, + "grad_norm": 8.447251319885254, + "learning_rate": 1.539639375810568e-05, + "loss": 1.3479, "step": 5503 }, { - "epoch": 1.65, - "grad_norm": 17.250186920166016, - "learning_rate": 8.968627844041296e-06, - "loss": 1.4742, + "epoch": 0.69, + "grad_norm": 16.203733444213867, + "learning_rate": 1.539555704304899e-05, + "loss": 2.0525, "step": 5504 }, { - "epoch": 1.66, - "grad_norm": 27.10199737548828, - "learning_rate": 8.966623233436904e-06, - "loss": 1.9449, + "epoch": 0.69, + "grad_norm": 28.032142639160156, + "learning_rate": 1.5394720327992304e-05, + "loss": 3.0286, "step": 5505 }, { - "epoch": 1.66, - "grad_norm": 17.242645263671875, - "learning_rate": 8.964618622832516e-06, - "loss": 1.3392, + "epoch": 0.69, + "grad_norm": 10.23719310760498, + "learning_rate": 1.5393883612935618e-05, + "loss": 0.5594, "step": 5506 }, { - "epoch": 1.66, - "grad_norm": 12.527267456054688, - "learning_rate": 8.962614012228126e-06, - "loss": 1.559, + "epoch": 0.69, + "grad_norm": 20.76209831237793, + "learning_rate": 1.5393046897878928e-05, + "loss": 2.1147, "step": 5507 }, { - "epoch": 1.66, - "grad_norm": 12.160088539123535, - "learning_rate": 8.960609401623735e-06, - "loss": 1.2504, + "epoch": 0.69, + "grad_norm": 11.435663223266602, + "learning_rate": 1.539221018282224e-05, + "loss": 1.9703, "step": 5508 }, { - "epoch": 1.66, - "grad_norm": 9.42102336883545, - "learning_rate": 8.958604791019345e-06, - "loss": 0.9488, + "epoch": 0.69, + "grad_norm": 16.902219772338867, + "learning_rate": 1.5391373467765552e-05, + "loss": 1.7572, "step": 5509 }, { - "epoch": 1.66, - "grad_norm": 16.88702964782715, - "learning_rate": 8.956600180414955e-06, - "loss": 1.5164, + "epoch": 0.69, + "grad_norm": 36.9378662109375, + "learning_rate": 1.5390536752708866e-05, + "loss": 2.7558, "step": 5510 }, { - "epoch": 1.66, - "grad_norm": 8.981550216674805, - "learning_rate": 8.954595569810565e-06, - "loss": 1.2683, + "epoch": 0.69, + "grad_norm": 34.30980682373047, + "learning_rate": 1.538970003765218e-05, + "loss": 1.6774, "step": 5511 }, { - "epoch": 1.66, - "grad_norm": 14.901409149169922, - "learning_rate": 8.952590959206175e-06, - "loss": 1.3492, + "epoch": 0.69, + "grad_norm": 11.313263893127441, + "learning_rate": 1.538886332259549e-05, + "loss": 1.1251, "step": 5512 }, { - "epoch": 1.66, - "grad_norm": 38.2581787109375, - "learning_rate": 8.950586348601785e-06, - "loss": 1.4274, + "epoch": 0.69, + "grad_norm": 80.6723861694336, + "learning_rate": 1.5388026607538803e-05, + "loss": 2.4211, "step": 5513 }, { - "epoch": 1.66, - "grad_norm": 11.266697883605957, - "learning_rate": 8.948581737997395e-06, - "loss": 1.2673, + "epoch": 0.69, + "grad_norm": 20.437837600708008, + "learning_rate": 1.5387189892482117e-05, + "loss": 2.8406, "step": 5514 }, { - "epoch": 1.66, - "grad_norm": 12.022287368774414, - "learning_rate": 8.946577127393005e-06, - "loss": 1.7178, + "epoch": 0.69, + "grad_norm": 7.128717422485352, + "learning_rate": 1.5386353177425427e-05, + "loss": 1.1533, "step": 5515 }, { - "epoch": 1.66, - "grad_norm": 18.94673728942871, - "learning_rate": 8.944572516788614e-06, - "loss": 1.3636, + "epoch": 0.69, + "grad_norm": 12.50041675567627, + "learning_rate": 1.538551646236874e-05, + "loss": 2.2954, "step": 5516 }, { - "epoch": 1.66, - "grad_norm": 30.44729232788086, - "learning_rate": 8.942567906184224e-06, - "loss": 1.623, + "epoch": 0.69, + "grad_norm": 23.601917266845703, + "learning_rate": 1.5384679747312054e-05, + "loss": 3.0042, "step": 5517 }, { - "epoch": 1.66, - "grad_norm": 13.971253395080566, - "learning_rate": 8.940563295579834e-06, - "loss": 1.3185, + "epoch": 0.69, + "grad_norm": 15.071398735046387, + "learning_rate": 1.5383843032255368e-05, + "loss": 0.8468, "step": 5518 }, { - "epoch": 1.66, - "grad_norm": 10.269094467163086, - "learning_rate": 8.938558684975444e-06, - "loss": 1.2367, + "epoch": 0.69, + "grad_norm": 15.801284790039062, + "learning_rate": 1.5383006317198678e-05, + "loss": 2.6363, "step": 5519 }, { - "epoch": 1.66, - "grad_norm": 19.084022521972656, - "learning_rate": 8.936554074371054e-06, - "loss": 0.9847, - "step": 5520 - }, - { - "epoch": 1.66, - "eval_loss": 0.1914350539445877, - "eval_runtime": 43.4865, - "eval_samples_per_second": 34.011, - "eval_steps_per_second": 34.011, + "epoch": 0.69, + "grad_norm": 9.802026748657227, + "learning_rate": 1.5382169602141992e-05, + "loss": 0.9632, "step": 5520 }, { - "epoch": 1.66, - "grad_norm": 11.31253719329834, - "learning_rate": 8.934549463766664e-06, - "loss": 1.363, + "epoch": 0.69, + "grad_norm": 7.131788730621338, + "learning_rate": 1.5381332887085306e-05, + "loss": 0.6508, "step": 5521 }, { - "epoch": 1.66, - "grad_norm": 22.407804489135742, - "learning_rate": 8.932544853162274e-06, - "loss": 1.953, + "epoch": 0.69, + "grad_norm": 15.925463676452637, + "learning_rate": 1.5380496172028616e-05, + "loss": 1.7814, "step": 5522 }, { - "epoch": 1.66, - "grad_norm": 36.859561920166016, - "learning_rate": 8.930540242557884e-06, - "loss": 2.4004, + "epoch": 0.69, + "grad_norm": 5.8094000816345215, + "learning_rate": 1.537965945697193e-05, + "loss": 0.3943, "step": 5523 }, { - "epoch": 1.66, - "grad_norm": 28.108509063720703, - "learning_rate": 8.928535631953493e-06, - "loss": 1.4061, + "epoch": 0.69, + "grad_norm": 20.999128341674805, + "learning_rate": 1.5378822741915243e-05, + "loss": 1.2808, "step": 5524 }, { - "epoch": 1.66, - "grad_norm": 20.431781768798828, - "learning_rate": 8.926531021349104e-06, - "loss": 1.4469, + "epoch": 0.69, + "grad_norm": 10.400137901306152, + "learning_rate": 1.5377986026858557e-05, + "loss": 1.3163, "step": 5525 }, { - "epoch": 1.66, - "grad_norm": 20.850435256958008, - "learning_rate": 8.924526410744714e-06, - "loss": 1.789, + "epoch": 0.69, + "grad_norm": 30.212005615234375, + "learning_rate": 1.5377149311801867e-05, + "loss": 1.9231, "step": 5526 }, { - "epoch": 1.66, - "grad_norm": 11.911688804626465, - "learning_rate": 8.922521800140323e-06, - "loss": 2.0367, + "epoch": 0.69, + "grad_norm": 9.523595809936523, + "learning_rate": 1.537631259674518e-05, + "loss": 2.1191, "step": 5527 }, { - "epoch": 1.66, - "grad_norm": 10.329282760620117, - "learning_rate": 8.920517189535933e-06, - "loss": 1.4678, + "epoch": 0.69, + "grad_norm": 18.021493911743164, + "learning_rate": 1.5375475881688494e-05, + "loss": 3.7398, "step": 5528 }, { - "epoch": 1.66, - "grad_norm": 12.712068557739258, - "learning_rate": 8.918512578931543e-06, - "loss": 1.6763, + "epoch": 0.69, + "grad_norm": 21.648252487182617, + "learning_rate": 1.5374639166631805e-05, + "loss": 1.3649, "step": 5529 }, { - "epoch": 1.66, - "grad_norm": 58.54856872558594, - "learning_rate": 8.916507968327153e-06, - "loss": 2.0221, + "epoch": 0.69, + "grad_norm": 13.179224014282227, + "learning_rate": 1.5373802451575118e-05, + "loss": 0.5187, "step": 5530 }, { - "epoch": 1.66, - "grad_norm": 13.266801834106445, - "learning_rate": 8.914503357722763e-06, - "loss": 1.7743, + "epoch": 0.69, + "grad_norm": 5.153297424316406, + "learning_rate": 1.5372965736518432e-05, + "loss": 1.0926, "step": 5531 }, { - "epoch": 1.66, - "grad_norm": 27.546993255615234, - "learning_rate": 8.912498747118373e-06, - "loss": 2.6575, + "epoch": 0.69, + "grad_norm": 15.431589126586914, + "learning_rate": 1.5372129021461742e-05, + "loss": 3.4108, "step": 5532 }, { - "epoch": 1.66, - "grad_norm": 11.523126602172852, - "learning_rate": 8.910494136513983e-06, - "loss": 1.1144, + "epoch": 0.69, + "grad_norm": 25.87183952331543, + "learning_rate": 1.5371292306405056e-05, + "loss": 1.4335, "step": 5533 }, { - "epoch": 1.66, - "grad_norm": 11.785201072692871, - "learning_rate": 8.908489525909593e-06, - "loss": 1.1003, + "epoch": 0.69, + "grad_norm": 6.297723770141602, + "learning_rate": 1.537045559134837e-05, + "loss": 0.313, "step": 5534 }, { - "epoch": 1.66, - "grad_norm": 14.82313346862793, - "learning_rate": 8.906484915305202e-06, - "loss": 1.8222, + "epoch": 0.69, + "grad_norm": 24.791057586669922, + "learning_rate": 1.536961887629168e-05, + "loss": 2.463, "step": 5535 }, { - "epoch": 1.66, - "grad_norm": 14.647685050964355, - "learning_rate": 8.904480304700812e-06, - "loss": 0.8845, + "epoch": 0.69, + "grad_norm": 13.483685493469238, + "learning_rate": 1.5368782161234993e-05, + "loss": 1.0357, "step": 5536 }, { - "epoch": 1.66, - "grad_norm": 8.299193382263184, - "learning_rate": 8.902475694096424e-06, - "loss": 1.3426, + "epoch": 0.69, + "grad_norm": 15.028130531311035, + "learning_rate": 1.5367945446178304e-05, + "loss": 1.5467, "step": 5537 }, { - "epoch": 1.67, - "grad_norm": 31.66522979736328, - "learning_rate": 8.900471083492032e-06, - "loss": 1.5596, + "epoch": 0.7, + "grad_norm": 7.930243492126465, + "learning_rate": 1.5367108731121617e-05, + "loss": 0.9488, "step": 5538 }, { - "epoch": 1.67, - "grad_norm": 10.633186340332031, - "learning_rate": 8.898466472887642e-06, - "loss": 0.8703, + "epoch": 0.7, + "grad_norm": 29.752031326293945, + "learning_rate": 1.536627201606493e-05, + "loss": 1.6596, "step": 5539 }, { - "epoch": 1.67, - "grad_norm": 12.473995208740234, - "learning_rate": 8.896461862283252e-06, - "loss": 1.1009, + "epoch": 0.7, + "grad_norm": 14.493793487548828, + "learning_rate": 1.536543530100824e-05, + "loss": 2.9665, "step": 5540 }, { - "epoch": 1.67, - "grad_norm": 16.58210563659668, - "learning_rate": 8.894457251678862e-06, - "loss": 0.7829, + "epoch": 0.7, + "grad_norm": 12.4276762008667, + "learning_rate": 1.5364598585951555e-05, + "loss": 0.8027, "step": 5541 }, { - "epoch": 1.67, - "grad_norm": 9.069561004638672, - "learning_rate": 8.89245264107447e-06, - "loss": 0.5217, + "epoch": 0.7, + "grad_norm": 39.77607345581055, + "learning_rate": 1.536376187089487e-05, + "loss": 2.4362, "step": 5542 }, { - "epoch": 1.67, - "grad_norm": 14.045775413513184, - "learning_rate": 8.890448030470083e-06, - "loss": 1.2927, + "epoch": 0.7, + "grad_norm": 13.369621276855469, + "learning_rate": 1.536292515583818e-05, + "loss": 2.7699, "step": 5543 }, { - "epoch": 1.67, - "grad_norm": 10.430898666381836, - "learning_rate": 8.888443419865693e-06, - "loss": 1.0182, + "epoch": 0.7, + "grad_norm": 7.263457298278809, + "learning_rate": 1.5362088440781492e-05, + "loss": 0.5554, "step": 5544 }, { - "epoch": 1.67, - "grad_norm": 28.892423629760742, - "learning_rate": 8.886438809261303e-06, - "loss": 2.1135, + "epoch": 0.7, + "grad_norm": 6.9838337898254395, + "learning_rate": 1.5361251725724806e-05, + "loss": 0.9054, "step": 5545 }, { - "epoch": 1.67, - "grad_norm": 33.67451095581055, - "learning_rate": 8.884434198656911e-06, - "loss": 2.2505, + "epoch": 0.7, + "grad_norm": 20.616817474365234, + "learning_rate": 1.5360415010668116e-05, + "loss": 2.8528, "step": 5546 }, { - "epoch": 1.67, - "grad_norm": 22.55853271484375, - "learning_rate": 8.882429588052521e-06, - "loss": 1.275, + "epoch": 0.7, + "grad_norm": 4.747254371643066, + "learning_rate": 1.535957829561143e-05, + "loss": 1.3521, "step": 5547 }, { - "epoch": 1.67, - "grad_norm": 11.746838569641113, - "learning_rate": 8.880424977448131e-06, - "loss": 1.0598, + "epoch": 0.7, + "grad_norm": 8.651074409484863, + "learning_rate": 1.5358741580554744e-05, + "loss": 1.9477, "step": 5548 }, { - "epoch": 1.67, - "grad_norm": 18.632312774658203, - "learning_rate": 8.878420366843741e-06, - "loss": 2.3075, + "epoch": 0.7, + "grad_norm": 10.826473236083984, + "learning_rate": 1.5357904865498057e-05, + "loss": 0.6565, "step": 5549 }, { - "epoch": 1.67, - "grad_norm": 23.912031173706055, - "learning_rate": 8.876415756239351e-06, - "loss": 1.5192, + "epoch": 0.7, + "grad_norm": 16.596824645996094, + "learning_rate": 1.5357068150441367e-05, + "loss": 1.9037, "step": 5550 }, { - "epoch": 1.67, - "grad_norm": 19.950305938720703, - "learning_rate": 8.874411145634962e-06, - "loss": 1.5378, + "epoch": 0.7, + "grad_norm": 20.1361141204834, + "learning_rate": 1.535623143538468e-05, + "loss": 1.6692, "step": 5551 }, { - "epoch": 1.67, - "grad_norm": 15.393779754638672, - "learning_rate": 8.872406535030572e-06, - "loss": 1.4361, + "epoch": 0.7, + "grad_norm": 9.58398151397705, + "learning_rate": 1.5355394720327995e-05, + "loss": 1.0216, "step": 5552 }, { - "epoch": 1.67, - "grad_norm": 11.668610572814941, - "learning_rate": 8.87040192442618e-06, - "loss": 1.1552, + "epoch": 0.7, + "grad_norm": 21.33419418334961, + "learning_rate": 1.5354558005271305e-05, + "loss": 1.6206, "step": 5553 }, { - "epoch": 1.67, - "grad_norm": 11.19984245300293, - "learning_rate": 8.86839731382179e-06, - "loss": 1.3071, + "epoch": 0.7, + "grad_norm": 15.278912544250488, + "learning_rate": 1.535372129021462e-05, + "loss": 2.2082, "step": 5554 }, { - "epoch": 1.67, - "grad_norm": 12.745835304260254, - "learning_rate": 8.8663927032174e-06, - "loss": 0.7712, + "epoch": 0.7, + "grad_norm": 8.61217212677002, + "learning_rate": 1.5352884575157932e-05, + "loss": 1.9911, "step": 5555 }, { - "epoch": 1.67, - "grad_norm": 32.49069595336914, - "learning_rate": 8.86438809261301e-06, - "loss": 1.9268, + "epoch": 0.7, + "grad_norm": 17.128591537475586, + "learning_rate": 1.5352047860101246e-05, + "loss": 1.4462, "step": 5556 }, { - "epoch": 1.67, - "grad_norm": 8.485462188720703, - "learning_rate": 8.86238348200862e-06, - "loss": 1.0959, + "epoch": 0.7, + "grad_norm": 6.570550441741943, + "learning_rate": 1.5351211145044556e-05, + "loss": 0.5636, "step": 5557 }, { - "epoch": 1.67, - "grad_norm": 14.367705345153809, - "learning_rate": 8.86037887140423e-06, - "loss": 1.0967, + "epoch": 0.7, + "grad_norm": 14.50590991973877, + "learning_rate": 1.535037442998787e-05, + "loss": 3.6256, "step": 5558 }, { - "epoch": 1.67, - "grad_norm": 22.95987892150879, - "learning_rate": 8.85837426079984e-06, - "loss": 1.6232, + "epoch": 0.7, + "grad_norm": 13.506613731384277, + "learning_rate": 1.5349537714931184e-05, + "loss": 1.0145, "step": 5559 }, { - "epoch": 1.67, - "grad_norm": 15.968416213989258, - "learning_rate": 8.85636965019545e-06, - "loss": 1.2239, + "epoch": 0.7, + "grad_norm": 7.986975193023682, + "learning_rate": 1.5348700999874494e-05, + "loss": 1.4675, "step": 5560 }, { - "epoch": 1.67, - "grad_norm": 10.333566665649414, - "learning_rate": 8.854365039591059e-06, - "loss": 1.248, + "epoch": 0.7, + "grad_norm": 10.020309448242188, + "learning_rate": 1.5347864284817807e-05, + "loss": 1.5019, "step": 5561 }, { - "epoch": 1.67, - "grad_norm": 45.59640884399414, - "learning_rate": 8.85236042898667e-06, - "loss": 1.6009, + "epoch": 0.7, + "grad_norm": 21.0300350189209, + "learning_rate": 1.5347027569761118e-05, + "loss": 0.9188, "step": 5562 }, { - "epoch": 1.67, - "grad_norm": 28.932910919189453, - "learning_rate": 8.85035581838228e-06, - "loss": 1.88, + "epoch": 0.7, + "grad_norm": 8.697385787963867, + "learning_rate": 1.534619085470443e-05, + "loss": 0.604, "step": 5563 }, { - "epoch": 1.67, - "grad_norm": 12.517439842224121, - "learning_rate": 8.84835120777789e-06, - "loss": 1.8985, + "epoch": 0.7, + "grad_norm": 9.769986152648926, + "learning_rate": 1.5345354139647745e-05, + "loss": 1.6472, "step": 5564 }, { - "epoch": 1.67, - "grad_norm": 17.193233489990234, - "learning_rate": 8.8463465971735e-06, - "loss": 1.1834, + "epoch": 0.7, + "grad_norm": 13.042610168457031, + "learning_rate": 1.5344517424591055e-05, + "loss": 1.7165, "step": 5565 }, { - "epoch": 1.67, - "grad_norm": 26.31612205505371, - "learning_rate": 8.84434198656911e-06, - "loss": 2.4826, + "epoch": 0.7, + "grad_norm": 38.07878112792969, + "learning_rate": 1.534368070953437e-05, + "loss": 3.2106, "step": 5566 }, { - "epoch": 1.67, - "grad_norm": 16.692577362060547, - "learning_rate": 8.84233737596472e-06, - "loss": 1.3551, + "epoch": 0.7, + "grad_norm": 45.39943313598633, + "learning_rate": 1.534284399447768e-05, + "loss": 4.0076, "step": 5567 }, { - "epoch": 1.67, - "grad_norm": 19.141315460205078, - "learning_rate": 8.84033276536033e-06, - "loss": 1.5564, + "epoch": 0.7, + "grad_norm": 6.458475112915039, + "learning_rate": 1.5342007279420993e-05, + "loss": 1.7158, "step": 5568 }, { - "epoch": 1.67, - "grad_norm": 12.95341682434082, - "learning_rate": 8.83832815475594e-06, - "loss": 1.3491, + "epoch": 0.7, + "grad_norm": 19.190696716308594, + "learning_rate": 1.5341170564364306e-05, + "loss": 0.9015, "step": 5569 }, { - "epoch": 1.67, - "grad_norm": 11.91671085357666, - "learning_rate": 8.83632354415155e-06, - "loss": 1.4844, + "epoch": 0.7, + "grad_norm": 101.54644012451172, + "learning_rate": 1.534033384930762e-05, + "loss": 1.944, "step": 5570 }, { - "epoch": 1.67, - "grad_norm": 17.263015747070312, - "learning_rate": 8.83431893354716e-06, - "loss": 1.5285, + "epoch": 0.7, + "grad_norm": 23.400814056396484, + "learning_rate": 1.533949713425093e-05, + "loss": 1.376, "step": 5571 }, { - "epoch": 1.68, - "grad_norm": 10.586352348327637, - "learning_rate": 8.832314322942768e-06, - "loss": 1.4477, + "epoch": 0.7, + "grad_norm": 16.705427169799805, + "learning_rate": 1.5338660419194244e-05, + "loss": 2.3008, "step": 5572 }, { - "epoch": 1.68, - "grad_norm": 17.991832733154297, - "learning_rate": 8.830309712338378e-06, - "loss": 1.3879, + "epoch": 0.7, + "grad_norm": 16.300222396850586, + "learning_rate": 1.5337823704137558e-05, + "loss": 2.3399, "step": 5573 }, { - "epoch": 1.68, - "grad_norm": 19.859966278076172, - "learning_rate": 8.82830510173399e-06, - "loss": 1.8104, + "epoch": 0.7, + "grad_norm": 38.15534973144531, + "learning_rate": 1.5336986989080868e-05, + "loss": 1.3535, "step": 5574 }, { - "epoch": 1.68, - "grad_norm": 436.9852294921875, - "learning_rate": 8.826300491129598e-06, - "loss": 2.6175, + "epoch": 0.7, + "grad_norm": 26.14240264892578, + "learning_rate": 1.533615027402418e-05, + "loss": 2.1262, "step": 5575 }, { - "epoch": 1.68, - "grad_norm": 12.303122520446777, - "learning_rate": 8.824295880525209e-06, - "loss": 0.9817, + "epoch": 0.7, + "grad_norm": 10.343050956726074, + "learning_rate": 1.5335313558967495e-05, + "loss": 0.8729, "step": 5576 }, { - "epoch": 1.68, - "grad_norm": 13.081160545349121, - "learning_rate": 8.822291269920819e-06, - "loss": 2.0876, + "epoch": 0.7, + "grad_norm": 13.817024230957031, + "learning_rate": 1.533447684391081e-05, + "loss": 1.0332, "step": 5577 }, { - "epoch": 1.68, - "grad_norm": 9.616783142089844, - "learning_rate": 8.820286659316429e-06, - "loss": 1.7439, + "epoch": 0.7, + "grad_norm": 22.90940284729004, + "learning_rate": 1.533364012885412e-05, + "loss": 0.6995, "step": 5578 }, { - "epoch": 1.68, - "grad_norm": 15.427207946777344, - "learning_rate": 8.818282048712039e-06, - "loss": 1.205, + "epoch": 0.7, + "grad_norm": 9.916383743286133, + "learning_rate": 1.5332803413797433e-05, + "loss": 1.0828, "step": 5579 }, { - "epoch": 1.68, - "grad_norm": 17.407861709594727, - "learning_rate": 8.816277438107649e-06, - "loss": 1.9737, + "epoch": 0.7, + "grad_norm": 16.110549926757812, + "learning_rate": 1.5331966698740746e-05, + "loss": 1.7157, "step": 5580 }, { - "epoch": 1.68, - "grad_norm": 30.608501434326172, - "learning_rate": 8.814272827503259e-06, - "loss": 1.4772, + "epoch": 0.7, + "grad_norm": 10.246378898620605, + "learning_rate": 1.5331129983684057e-05, + "loss": 2.2646, "step": 5581 }, { - "epoch": 1.68, - "grad_norm": 17.050554275512695, - "learning_rate": 8.812268216898869e-06, - "loss": 1.2003, + "epoch": 0.7, + "grad_norm": 88.1216049194336, + "learning_rate": 1.533029326862737e-05, + "loss": 1.5729, "step": 5582 }, { - "epoch": 1.68, - "grad_norm": 21.977340698242188, - "learning_rate": 8.810263606294477e-06, - "loss": 2.4612, + "epoch": 0.7, + "grad_norm": 9.350086212158203, + "learning_rate": 1.5329456553570684e-05, + "loss": 1.5943, "step": 5583 }, { - "epoch": 1.68, - "grad_norm": 48.15430450439453, - "learning_rate": 8.808258995690088e-06, - "loss": 1.7486, + "epoch": 0.7, + "grad_norm": 8.791582107543945, + "learning_rate": 1.5328619838513998e-05, + "loss": 1.1793, "step": 5584 }, { - "epoch": 1.68, - "grad_norm": 15.752915382385254, - "learning_rate": 8.806254385085698e-06, - "loss": 1.6484, + "epoch": 0.7, + "grad_norm": 20.162771224975586, + "learning_rate": 1.5327783123457308e-05, + "loss": 1.5224, "step": 5585 }, { - "epoch": 1.68, - "grad_norm": 37.53594207763672, - "learning_rate": 8.804249774481308e-06, - "loss": 2.1715, + "epoch": 0.7, + "grad_norm": 25.328039169311523, + "learning_rate": 1.532694640840062e-05, + "loss": 2.0069, "step": 5586 }, { - "epoch": 1.68, - "grad_norm": 10.964682579040527, - "learning_rate": 8.802245163876918e-06, - "loss": 1.4082, + "epoch": 0.7, + "grad_norm": 128.4274444580078, + "learning_rate": 1.5326109693343935e-05, + "loss": 1.6119, "step": 5587 }, { - "epoch": 1.68, - "grad_norm": 23.61287498474121, - "learning_rate": 8.800240553272528e-06, - "loss": 0.7415, + "epoch": 0.7, + "grad_norm": 19.298948287963867, + "learning_rate": 1.5325272978287245e-05, + "loss": 0.7291, "step": 5588 }, { - "epoch": 1.68, - "grad_norm": 15.340435981750488, - "learning_rate": 8.798235942668138e-06, - "loss": 1.5422, + "epoch": 0.7, + "grad_norm": 8.975295066833496, + "learning_rate": 1.532443626323056e-05, + "loss": 0.8676, "step": 5589 }, { - "epoch": 1.68, - "grad_norm": 12.990081787109375, - "learning_rate": 8.796231332063746e-06, - "loss": 1.4462, + "epoch": 0.7, + "grad_norm": 42.80747985839844, + "learning_rate": 1.532359954817387e-05, + "loss": 1.6301, "step": 5590 }, { - "epoch": 1.68, - "grad_norm": 10.152392387390137, - "learning_rate": 8.794226721459356e-06, - "loss": 0.9929, + "epoch": 0.7, + "grad_norm": 11.307854652404785, + "learning_rate": 1.5322762833117183e-05, + "loss": 0.9856, "step": 5591 }, { - "epoch": 1.68, - "grad_norm": 13.210673332214355, - "learning_rate": 8.792222110854968e-06, - "loss": 1.7244, + "epoch": 0.7, + "grad_norm": 13.756830215454102, + "learning_rate": 1.5321926118060497e-05, + "loss": 1.5287, "step": 5592 }, { - "epoch": 1.68, - "grad_norm": 13.106926918029785, - "learning_rate": 8.790217500250577e-06, - "loss": 1.1276, + "epoch": 0.7, + "grad_norm": 7.556723117828369, + "learning_rate": 1.5321089403003807e-05, + "loss": 0.9806, "step": 5593 }, { - "epoch": 1.68, - "grad_norm": 11.7565336227417, - "learning_rate": 8.788212889646187e-06, - "loss": 1.1467, + "epoch": 0.7, + "grad_norm": 11.78661060333252, + "learning_rate": 1.532025268794712e-05, + "loss": 1.3586, "step": 5594 }, { - "epoch": 1.68, - "grad_norm": 32.24980163574219, - "learning_rate": 8.786208279041797e-06, - "loss": 1.8956, + "epoch": 0.7, + "grad_norm": 29.15213966369629, + "learning_rate": 1.531941597289043e-05, + "loss": 2.4593, "step": 5595 }, { - "epoch": 1.68, - "grad_norm": 15.186212539672852, - "learning_rate": 8.784203668437407e-06, - "loss": 1.8275, + "epoch": 0.7, + "grad_norm": 13.034687995910645, + "learning_rate": 1.5318579257833744e-05, + "loss": 1.1098, "step": 5596 }, { - "epoch": 1.68, - "grad_norm": 26.510732650756836, - "learning_rate": 8.782199057833017e-06, - "loss": 1.2493, + "epoch": 0.7, + "grad_norm": 21.663742065429688, + "learning_rate": 1.5317742542777058e-05, + "loss": 1.2553, "step": 5597 }, { - "epoch": 1.68, - "grad_norm": 9.903314590454102, - "learning_rate": 8.780194447228625e-06, - "loss": 0.8917, + "epoch": 0.7, + "grad_norm": 19.978364944458008, + "learning_rate": 1.5316905827720372e-05, + "loss": 1.5179, "step": 5598 }, { - "epoch": 1.68, - "grad_norm": 9.444249153137207, - "learning_rate": 8.778189836624237e-06, - "loss": 1.073, + "epoch": 0.7, + "grad_norm": 15.038065910339355, + "learning_rate": 1.5316069112663682e-05, + "loss": 1.4449, "step": 5599 }, { - "epoch": 1.68, - "grad_norm": 17.22249412536621, - "learning_rate": 8.776185226019847e-06, - "loss": 1.2148, + "epoch": 0.7, + "grad_norm": 14.578551292419434, + "learning_rate": 1.5315232397606996e-05, + "loss": 1.1441, "step": 5600 }, { - "epoch": 1.68, - "grad_norm": 9.89676284790039, - "learning_rate": 8.774180615415456e-06, - "loss": 1.2275, + "epoch": 0.7, + "eval_loss": 0.11903589963912964, + "eval_runtime": 94.7477, + "eval_samples_per_second": 37.383, + "eval_steps_per_second": 37.383, + "step": 5600 + }, + { + "epoch": 0.7, + "grad_norm": 23.72501564025879, + "learning_rate": 1.531439568255031e-05, + "loss": 3.6484, "step": 5601 }, { - "epoch": 1.68, - "grad_norm": 12.96465015411377, - "learning_rate": 8.772176004811066e-06, - "loss": 1.0976, + "epoch": 0.7, + "grad_norm": 15.129220008850098, + "learning_rate": 1.531355896749362e-05, + "loss": 1.1805, "step": 5602 }, { - "epoch": 1.68, - "grad_norm": 15.916191101074219, - "learning_rate": 8.770171394206676e-06, - "loss": 1.7115, + "epoch": 0.7, + "grad_norm": 10.363043785095215, + "learning_rate": 1.5312722252436933e-05, + "loss": 1.2322, "step": 5603 }, { - "epoch": 1.68, - "grad_norm": 18.037708282470703, - "learning_rate": 8.768166783602286e-06, - "loss": 1.7032, + "epoch": 0.7, + "grad_norm": 11.52414321899414, + "learning_rate": 1.5311885537380247e-05, + "loss": 1.0481, "step": 5604 }, { - "epoch": 1.69, - "grad_norm": 15.334985733032227, - "learning_rate": 8.766162172997896e-06, - "loss": 1.3752, + "epoch": 0.7, + "grad_norm": 8.691742897033691, + "learning_rate": 1.531104882232356e-05, + "loss": 1.5519, "step": 5605 }, { - "epoch": 1.69, - "grad_norm": 9.522956848144531, - "learning_rate": 8.764157562393506e-06, - "loss": 0.623, + "epoch": 0.7, + "grad_norm": 15.305723190307617, + "learning_rate": 1.531021210726687e-05, + "loss": 1.3309, "step": 5606 }, { - "epoch": 1.69, - "grad_norm": 21.579017639160156, - "learning_rate": 8.762152951789116e-06, - "loss": 1.5285, + "epoch": 0.7, + "grad_norm": 22.525123596191406, + "learning_rate": 1.5309375392210184e-05, + "loss": 1.3004, "step": 5607 }, { - "epoch": 1.69, - "grad_norm": 10.721216201782227, - "learning_rate": 8.760148341184726e-06, - "loss": 0.893, + "epoch": 0.7, + "grad_norm": 18.87114906311035, + "learning_rate": 1.5308538677153498e-05, + "loss": 1.4569, "step": 5608 }, { - "epoch": 1.69, - "grad_norm": 29.23563575744629, - "learning_rate": 8.758143730580335e-06, - "loss": 2.0828, + "epoch": 0.7, + "grad_norm": 17.632736206054688, + "learning_rate": 1.530770196209681e-05, + "loss": 1.9694, "step": 5609 }, { - "epoch": 1.69, - "grad_norm": 5.085595607757568, - "learning_rate": 8.756139119975945e-06, - "loss": 0.6403, + "epoch": 0.7, + "grad_norm": 16.05190086364746, + "learning_rate": 1.5306865247040122e-05, + "loss": 0.5519, "step": 5610 }, { - "epoch": 1.69, - "grad_norm": 16.123058319091797, - "learning_rate": 8.754134509371556e-06, - "loss": 1.7075, + "epoch": 0.7, + "grad_norm": 15.205567359924316, + "learning_rate": 1.5306028531983436e-05, + "loss": 0.8385, "step": 5611 }, { - "epoch": 1.69, - "grad_norm": 54.834938049316406, - "learning_rate": 8.752129898767165e-06, - "loss": 1.6164, + "epoch": 0.7, + "grad_norm": 10.732412338256836, + "learning_rate": 1.530519181692675e-05, + "loss": 1.9857, "step": 5612 }, { - "epoch": 1.69, - "grad_norm": 15.420845031738281, - "learning_rate": 8.750125288162775e-06, - "loss": 1.3926, + "epoch": 0.7, + "grad_norm": 3.120633840560913, + "learning_rate": 1.530435510187006e-05, + "loss": 0.0964, "step": 5613 }, { - "epoch": 1.69, - "grad_norm": 14.784860610961914, - "learning_rate": 8.748120677558385e-06, - "loss": 1.175, + "epoch": 0.7, + "grad_norm": 17.112985610961914, + "learning_rate": 1.5303518386813373e-05, + "loss": 2.0094, "step": 5614 }, { - "epoch": 1.69, - "grad_norm": 32.64680862426758, - "learning_rate": 8.746116066953995e-06, - "loss": 1.5805, + "epoch": 0.7, + "grad_norm": 7.553732395172119, + "learning_rate": 1.5302681671756683e-05, + "loss": 0.5445, "step": 5615 }, { - "epoch": 1.69, - "grad_norm": 28.060972213745117, - "learning_rate": 8.744111456349605e-06, - "loss": 1.803, + "epoch": 0.7, + "grad_norm": 7.010446071624756, + "learning_rate": 1.5301844956699997e-05, + "loss": 1.3869, "step": 5616 }, { - "epoch": 1.69, - "grad_norm": 12.223137855529785, - "learning_rate": 8.742106845745215e-06, - "loss": 1.149, + "epoch": 0.7, + "grad_norm": 16.693117141723633, + "learning_rate": 1.530100824164331e-05, + "loss": 1.4811, "step": 5617 }, { - "epoch": 1.69, - "grad_norm": 14.413814544677734, - "learning_rate": 8.740102235140825e-06, - "loss": 1.5615, + "epoch": 0.71, + "grad_norm": 11.959110260009766, + "learning_rate": 1.530017152658662e-05, + "loss": 1.2947, "step": 5618 }, { - "epoch": 1.69, - "grad_norm": 24.743038177490234, - "learning_rate": 8.738097624536435e-06, - "loss": 1.5725, + "epoch": 0.71, + "grad_norm": 9.126626968383789, + "learning_rate": 1.5299334811529935e-05, + "loss": 1.7426, "step": 5619 }, { - "epoch": 1.69, - "grad_norm": 37.43949508666992, - "learning_rate": 8.736093013932044e-06, - "loss": 2.383, + "epoch": 0.71, + "grad_norm": 134.4025115966797, + "learning_rate": 1.5298498096473245e-05, + "loss": 1.8938, "step": 5620 }, { - "epoch": 1.69, - "grad_norm": 19.173534393310547, - "learning_rate": 8.734088403327654e-06, - "loss": 2.1207, + "epoch": 0.71, + "grad_norm": 15.066232681274414, + "learning_rate": 1.529766138141656e-05, + "loss": 2.3855, "step": 5621 }, { - "epoch": 1.69, - "grad_norm": 17.49724006652832, - "learning_rate": 8.732083792723264e-06, - "loss": 1.7264, + "epoch": 0.71, + "grad_norm": 24.542264938354492, + "learning_rate": 1.5296824666359872e-05, + "loss": 0.9356, "step": 5622 }, { - "epoch": 1.69, - "grad_norm": 9.76134967803955, - "learning_rate": 8.730079182118874e-06, - "loss": 1.5944, + "epoch": 0.71, + "grad_norm": 7.537159442901611, + "learning_rate": 1.5295987951303183e-05, + "loss": 1.6194, "step": 5623 }, { - "epoch": 1.69, - "grad_norm": 11.717310905456543, - "learning_rate": 8.728074571514484e-06, - "loss": 1.756, + "epoch": 0.71, + "grad_norm": 8.826299667358398, + "learning_rate": 1.5295151236246496e-05, + "loss": 2.2357, "step": 5624 }, { - "epoch": 1.69, - "grad_norm": 21.17192840576172, - "learning_rate": 8.726069960910094e-06, - "loss": 2.2556, + "epoch": 0.71, + "grad_norm": 16.212858200073242, + "learning_rate": 1.529431452118981e-05, + "loss": 1.3535, "step": 5625 }, { - "epoch": 1.69, - "grad_norm": 16.35768699645996, - "learning_rate": 8.724065350305704e-06, - "loss": 1.2782, + "epoch": 0.71, + "grad_norm": 23.37677574157715, + "learning_rate": 1.5293477806133123e-05, + "loss": 1.8374, "step": 5626 }, { - "epoch": 1.69, - "grad_norm": 12.22645378112793, - "learning_rate": 8.722060739701313e-06, - "loss": 1.3418, + "epoch": 0.71, + "grad_norm": 7.744858264923096, + "learning_rate": 1.5292641091076434e-05, + "loss": 2.3618, "step": 5627 }, { - "epoch": 1.69, - "grad_norm": 19.546850204467773, - "learning_rate": 8.720056129096923e-06, - "loss": 1.2375, + "epoch": 0.71, + "grad_norm": 55.11528778076172, + "learning_rate": 1.5291804376019747e-05, + "loss": 2.4706, "step": 5628 }, { - "epoch": 1.69, - "grad_norm": 14.505806922912598, - "learning_rate": 8.718051518492535e-06, - "loss": 1.5247, + "epoch": 0.71, + "grad_norm": 13.760666847229004, + "learning_rate": 1.529096766096306e-05, + "loss": 0.7654, "step": 5629 }, { - "epoch": 1.69, - "grad_norm": 11.707478523254395, - "learning_rate": 8.716046907888145e-06, - "loss": 1.2686, + "epoch": 0.71, + "grad_norm": 13.986364364624023, + "learning_rate": 1.529013094590637e-05, + "loss": 1.6457, "step": 5630 }, { - "epoch": 1.69, - "grad_norm": 12.525168418884277, - "learning_rate": 8.714042297283753e-06, - "loss": 2.1298, + "epoch": 0.71, + "grad_norm": 5.984301567077637, + "learning_rate": 1.5289294230849685e-05, + "loss": 0.6163, "step": 5631 }, { - "epoch": 1.69, - "grad_norm": 13.70942497253418, - "learning_rate": 8.712037686679363e-06, - "loss": 0.8167, + "epoch": 0.71, + "grad_norm": 11.783964157104492, + "learning_rate": 1.5288457515793e-05, + "loss": 0.8994, "step": 5632 }, { - "epoch": 1.69, - "grad_norm": 18.232576370239258, - "learning_rate": 8.710033076074973e-06, - "loss": 1.0261, + "epoch": 0.71, + "grad_norm": 73.12659454345703, + "learning_rate": 1.5287620800736312e-05, + "loss": 2.9345, "step": 5633 }, { - "epoch": 1.69, - "grad_norm": 19.63440704345703, - "learning_rate": 8.708028465470583e-06, - "loss": 2.1018, + "epoch": 0.71, + "grad_norm": 10.865703582763672, + "learning_rate": 1.5286784085679622e-05, + "loss": 2.3683, "step": 5634 }, { - "epoch": 1.69, - "grad_norm": 17.513425827026367, - "learning_rate": 8.706023854866193e-06, - "loss": 1.2959, + "epoch": 0.71, + "grad_norm": 12.446228981018066, + "learning_rate": 1.5285947370622936e-05, + "loss": 2.5152, "step": 5635 }, { - "epoch": 1.69, - "grad_norm": 15.64637565612793, - "learning_rate": 8.704019244261803e-06, - "loss": 1.8533, + "epoch": 0.71, + "grad_norm": 13.962808609008789, + "learning_rate": 1.528511065556625e-05, + "loss": 1.0881, "step": 5636 }, { - "epoch": 1.69, - "grad_norm": 7.151846885681152, - "learning_rate": 8.702014633657414e-06, - "loss": 0.8085, + "epoch": 0.71, + "grad_norm": 15.042387962341309, + "learning_rate": 1.528427394050956e-05, + "loss": 2.8747, "step": 5637 }, { - "epoch": 1.7, - "grad_norm": 9.188348770141602, - "learning_rate": 8.700010023053022e-06, - "loss": 1.1322, + "epoch": 0.71, + "grad_norm": 104.28298950195312, + "learning_rate": 1.5283437225452874e-05, + "loss": 3.2383, "step": 5638 }, { - "epoch": 1.7, - "grad_norm": 9.800728797912598, - "learning_rate": 8.698005412448632e-06, - "loss": 0.9624, + "epoch": 0.71, + "grad_norm": 37.05125045776367, + "learning_rate": 1.5282600510396187e-05, + "loss": 1.7988, "step": 5639 }, { - "epoch": 1.7, - "grad_norm": 14.96066951751709, - "learning_rate": 8.696000801844242e-06, - "loss": 1.3465, - "step": 5640 - }, - { - "epoch": 1.7, - "eval_loss": 0.1950412392616272, - "eval_runtime": 43.6602, - "eval_samples_per_second": 33.875, - "eval_steps_per_second": 33.875, + "epoch": 0.71, + "grad_norm": 22.914669036865234, + "learning_rate": 1.5281763795339498e-05, + "loss": 1.1929, "step": 5640 }, { - "epoch": 1.7, - "grad_norm": 21.22214698791504, - "learning_rate": 8.693996191239852e-06, - "loss": 1.9302, + "epoch": 0.71, + "grad_norm": 12.188714981079102, + "learning_rate": 1.528092708028281e-05, + "loss": 2.225, "step": 5641 }, { - "epoch": 1.7, - "grad_norm": 11.030116081237793, - "learning_rate": 8.691991580635462e-06, - "loss": 0.9403, + "epoch": 0.71, + "grad_norm": 12.646594047546387, + "learning_rate": 1.5280090365226125e-05, + "loss": 1.5027, "step": 5642 }, { - "epoch": 1.7, - "grad_norm": 10.79010009765625, - "learning_rate": 8.689986970031072e-06, - "loss": 1.2434, + "epoch": 0.71, + "grad_norm": 6.229799270629883, + "learning_rate": 1.5279253650169435e-05, + "loss": 0.515, "step": 5643 }, { - "epoch": 1.7, - "grad_norm": 17.979541778564453, - "learning_rate": 8.687982359426682e-06, - "loss": 1.4883, + "epoch": 0.71, + "grad_norm": 12.691096305847168, + "learning_rate": 1.527841693511275e-05, + "loss": 1.5411, "step": 5644 }, { - "epoch": 1.7, - "grad_norm": 11.965887069702148, - "learning_rate": 8.685977748822292e-06, - "loss": 1.381, + "epoch": 0.71, + "grad_norm": 15.420011520385742, + "learning_rate": 1.5277580220056062e-05, + "loss": 2.482, "step": 5645 }, { - "epoch": 1.7, - "grad_norm": 16.491119384765625, - "learning_rate": 8.683973138217901e-06, - "loss": 1.1538, + "epoch": 0.71, + "grad_norm": 13.545212745666504, + "learning_rate": 1.5276743504999373e-05, + "loss": 1.7458, "step": 5646 }, { - "epoch": 1.7, - "grad_norm": 12.865142822265625, - "learning_rate": 8.681968527613511e-06, - "loss": 1.2, + "epoch": 0.71, + "grad_norm": 1.1677732467651367, + "learning_rate": 1.5275906789942686e-05, + "loss": 0.0206, "step": 5647 }, { - "epoch": 1.7, - "grad_norm": 22.270700454711914, - "learning_rate": 8.679963917009123e-06, - "loss": 1.3727, + "epoch": 0.71, + "grad_norm": 5.918126583099365, + "learning_rate": 1.5275070074885997e-05, + "loss": 2.0381, "step": 5648 }, { - "epoch": 1.7, - "grad_norm": 31.27298927307129, - "learning_rate": 8.677959306404731e-06, - "loss": 2.6597, + "epoch": 0.71, + "grad_norm": 42.72283172607422, + "learning_rate": 1.527423335982931e-05, + "loss": 2.3005, "step": 5649 }, { - "epoch": 1.7, - "grad_norm": 10.323209762573242, - "learning_rate": 8.675954695800341e-06, - "loss": 0.9057, + "epoch": 0.71, + "grad_norm": 22.049144744873047, + "learning_rate": 1.5273396644772624e-05, + "loss": 1.9597, "step": 5650 }, { - "epoch": 1.7, - "grad_norm": 14.9830961227417, - "learning_rate": 8.673950085195951e-06, - "loss": 2.7499, + "epoch": 0.71, + "grad_norm": 6.75792121887207, + "learning_rate": 1.5272559929715934e-05, + "loss": 1.9141, "step": 5651 }, { - "epoch": 1.7, - "grad_norm": 17.780193328857422, - "learning_rate": 8.671945474591561e-06, - "loss": 1.4829, + "epoch": 0.71, + "grad_norm": 8.55809211730957, + "learning_rate": 1.5271723214659248e-05, + "loss": 2.7011, "step": 5652 }, { - "epoch": 1.7, - "grad_norm": 14.081412315368652, - "learning_rate": 8.669940863987171e-06, - "loss": 1.159, + "epoch": 0.71, + "grad_norm": 13.690023422241211, + "learning_rate": 1.527088649960256e-05, + "loss": 0.4655, "step": 5653 }, { - "epoch": 1.7, - "grad_norm": 16.137582778930664, - "learning_rate": 8.667936253382782e-06, - "loss": 1.0004, + "epoch": 0.71, + "grad_norm": 17.889263153076172, + "learning_rate": 1.5270049784545875e-05, + "loss": 2.3118, "step": 5654 }, { - "epoch": 1.7, - "grad_norm": 21.85881805419922, - "learning_rate": 8.665931642778392e-06, - "loss": 1.4793, + "epoch": 0.71, + "grad_norm": 18.358047485351562, + "learning_rate": 1.5269213069489185e-05, + "loss": 2.4082, "step": 5655 }, { - "epoch": 1.7, - "grad_norm": 21.70241355895996, - "learning_rate": 8.663927032174002e-06, - "loss": 1.6126, + "epoch": 0.71, + "grad_norm": 8.954587936401367, + "learning_rate": 1.52683763544325e-05, + "loss": 1.8922, "step": 5656 }, { - "epoch": 1.7, - "grad_norm": 16.00739288330078, - "learning_rate": 8.66192242156961e-06, - "loss": 1.7453, + "epoch": 0.71, + "grad_norm": 27.990930557250977, + "learning_rate": 1.5267539639375813e-05, + "loss": 2.135, "step": 5657 }, { - "epoch": 1.7, - "grad_norm": 41.264095306396484, - "learning_rate": 8.65991781096522e-06, - "loss": 1.8027, + "epoch": 0.71, + "grad_norm": 30.18874168395996, + "learning_rate": 1.5266702924319123e-05, + "loss": 1.9133, "step": 5658 }, { - "epoch": 1.7, - "grad_norm": 59.908199310302734, - "learning_rate": 8.65791320036083e-06, - "loss": 1.4352, + "epoch": 0.71, + "grad_norm": 34.31163024902344, + "learning_rate": 1.5265866209262437e-05, + "loss": 2.7876, "step": 5659 }, { - "epoch": 1.7, - "grad_norm": 26.252933502197266, - "learning_rate": 8.65590858975644e-06, - "loss": 1.1851, + "epoch": 0.71, + "grad_norm": 6.2728447914123535, + "learning_rate": 1.526502949420575e-05, + "loss": 0.7026, "step": 5660 }, { - "epoch": 1.7, - "grad_norm": 14.983129501342773, - "learning_rate": 8.65390397915205e-06, - "loss": 1.1969, + "epoch": 0.71, + "grad_norm": 14.44756031036377, + "learning_rate": 1.5264192779149064e-05, + "loss": 1.9564, "step": 5661 }, { - "epoch": 1.7, - "grad_norm": 7.037642478942871, - "learning_rate": 8.65189936854766e-06, - "loss": 0.6684, + "epoch": 0.71, + "grad_norm": 10.850642204284668, + "learning_rate": 1.5263356064092374e-05, + "loss": 0.6897, "step": 5662 }, { - "epoch": 1.7, - "grad_norm": 12.585198402404785, - "learning_rate": 8.64989475794327e-06, - "loss": 1.4334, + "epoch": 0.71, + "grad_norm": 8.339897155761719, + "learning_rate": 1.5262519349035688e-05, + "loss": 0.6974, "step": 5663 }, { - "epoch": 1.7, - "grad_norm": 21.677915573120117, - "learning_rate": 8.64789014733888e-06, - "loss": 2.5165, + "epoch": 0.71, + "grad_norm": 53.20794677734375, + "learning_rate": 1.5261682633979e-05, + "loss": 4.7937, "step": 5664 }, { - "epoch": 1.7, - "grad_norm": 23.204702377319336, - "learning_rate": 8.645885536734489e-06, - "loss": 1.5472, + "epoch": 0.71, + "grad_norm": 20.38677978515625, + "learning_rate": 1.5260845918922312e-05, + "loss": 1.6178, "step": 5665 }, { - "epoch": 1.7, - "grad_norm": 10.986197471618652, - "learning_rate": 8.643880926130101e-06, - "loss": 1.6953, + "epoch": 0.71, + "grad_norm": 10.413582801818848, + "learning_rate": 1.5260009203865625e-05, + "loss": 0.7548, "step": 5666 }, { - "epoch": 1.7, - "grad_norm": 11.328323364257812, - "learning_rate": 8.641876315525711e-06, - "loss": 0.5602, + "epoch": 0.71, + "grad_norm": 21.674497604370117, + "learning_rate": 1.525917248880894e-05, + "loss": 2.4653, "step": 5667 }, { - "epoch": 1.7, - "grad_norm": 9.922806739807129, - "learning_rate": 8.63987170492132e-06, - "loss": 1.3713, + "epoch": 0.71, + "grad_norm": 45.409671783447266, + "learning_rate": 1.5258335773752251e-05, + "loss": 1.736, "step": 5668 }, { - "epoch": 1.7, - "grad_norm": 8.560861587524414, - "learning_rate": 8.63786709431693e-06, - "loss": 1.0511, + "epoch": 0.71, + "grad_norm": 13.167337417602539, + "learning_rate": 1.5257499058695561e-05, + "loss": 1.1981, "step": 5669 }, { - "epoch": 1.7, - "grad_norm": 38.06193923950195, - "learning_rate": 8.63586248371254e-06, - "loss": 1.5991, + "epoch": 0.71, + "grad_norm": 7.016934394836426, + "learning_rate": 1.5256662343638875e-05, + "loss": 0.4368, "step": 5670 }, { - "epoch": 1.71, - "grad_norm": 17.680063247680664, - "learning_rate": 8.63385787310815e-06, - "loss": 2.7477, + "epoch": 0.71, + "grad_norm": 22.98343849182129, + "learning_rate": 1.5255825628582189e-05, + "loss": 1.8847, "step": 5671 }, { - "epoch": 1.71, - "grad_norm": 29.49445915222168, - "learning_rate": 8.63185326250376e-06, - "loss": 1.7613, + "epoch": 0.71, + "grad_norm": 8.879514694213867, + "learning_rate": 1.5254988913525499e-05, + "loss": 1.9898, "step": 5672 }, { - "epoch": 1.71, - "grad_norm": 32.89970397949219, - "learning_rate": 8.62984865189937e-06, - "loss": 1.6423, + "epoch": 0.71, + "grad_norm": 6.920591354370117, + "learning_rate": 1.5254152198468812e-05, + "loss": 0.8359, "step": 5673 }, { - "epoch": 1.71, - "grad_norm": 40.36264419555664, - "learning_rate": 8.62784404129498e-06, - "loss": 1.2661, + "epoch": 0.71, + "grad_norm": 14.733323097229004, + "learning_rate": 1.5253315483412126e-05, + "loss": 2.9059, "step": 5674 }, { - "epoch": 1.71, - "grad_norm": 15.551639556884766, - "learning_rate": 8.625839430690588e-06, - "loss": 1.6711, + "epoch": 0.71, + "grad_norm": 9.401959419250488, + "learning_rate": 1.5252478768355438e-05, + "loss": 1.1228, "step": 5675 }, { - "epoch": 1.71, - "grad_norm": 30.854095458984375, - "learning_rate": 8.623834820086198e-06, - "loss": 1.0708, + "epoch": 0.71, + "grad_norm": 14.805521011352539, + "learning_rate": 1.525164205329875e-05, + "loss": 0.7959, "step": 5676 }, { - "epoch": 1.71, - "grad_norm": 30.149341583251953, - "learning_rate": 8.621830209481808e-06, - "loss": 1.7262, + "epoch": 0.71, + "grad_norm": 17.671751022338867, + "learning_rate": 1.5250805338242064e-05, + "loss": 1.9533, "step": 5677 }, { - "epoch": 1.71, - "grad_norm": 17.061527252197266, - "learning_rate": 8.619825598877418e-06, - "loss": 1.4248, + "epoch": 0.71, + "grad_norm": 9.388689994812012, + "learning_rate": 1.5249968623185376e-05, + "loss": 2.1568, "step": 5678 }, { - "epoch": 1.71, - "grad_norm": 78.73602294921875, - "learning_rate": 8.617820988273029e-06, - "loss": 2.1561, + "epoch": 0.71, + "grad_norm": 22.36805534362793, + "learning_rate": 1.5249131908128688e-05, + "loss": 2.0653, "step": 5679 }, { - "epoch": 1.71, - "grad_norm": 51.467491149902344, - "learning_rate": 8.615816377668639e-06, - "loss": 1.7134, + "epoch": 0.71, + "grad_norm": 6.249520778656006, + "learning_rate": 1.5248295193072e-05, + "loss": 2.0328, "step": 5680 }, { - "epoch": 1.71, - "grad_norm": 17.935325622558594, - "learning_rate": 8.613811767064249e-06, - "loss": 1.576, + "epoch": 0.71, + "grad_norm": 12.663107872009277, + "learning_rate": 1.5247458478015313e-05, + "loss": 1.3492, "step": 5681 }, { - "epoch": 1.71, - "grad_norm": 14.966944694519043, - "learning_rate": 8.611807156459859e-06, - "loss": 2.0311, + "epoch": 0.71, + "grad_norm": 7.885650157928467, + "learning_rate": 1.5246621762958627e-05, + "loss": 0.5711, "step": 5682 }, { - "epoch": 1.71, - "grad_norm": 14.564214706420898, - "learning_rate": 8.609802545855467e-06, - "loss": 1.7562, + "epoch": 0.71, + "grad_norm": 7.987274646759033, + "learning_rate": 1.5245785047901937e-05, + "loss": 1.8003, "step": 5683 }, { - "epoch": 1.71, - "grad_norm": 11.899775505065918, - "learning_rate": 8.607797935251077e-06, - "loss": 1.2177, + "epoch": 0.71, + "grad_norm": 12.844233512878418, + "learning_rate": 1.524494833284525e-05, + "loss": 1.0966, "step": 5684 }, { - "epoch": 1.71, - "grad_norm": 14.998151779174805, - "learning_rate": 8.605793324646689e-06, - "loss": 1.6503, + "epoch": 0.71, + "grad_norm": 20.380704879760742, + "learning_rate": 1.5244111617788564e-05, + "loss": 1.5229, "step": 5685 }, { - "epoch": 1.71, - "grad_norm": 18.94357681274414, - "learning_rate": 8.603788714042297e-06, - "loss": 1.411, + "epoch": 0.71, + "grad_norm": 5.888026714324951, + "learning_rate": 1.5243274902731875e-05, + "loss": 0.4774, "step": 5686 }, { - "epoch": 1.71, - "grad_norm": 14.1841459274292, - "learning_rate": 8.601784103437908e-06, - "loss": 1.5919, + "epoch": 0.71, + "grad_norm": 5.787362098693848, + "learning_rate": 1.5242438187675188e-05, + "loss": 2.1025, "step": 5687 }, { - "epoch": 1.71, - "grad_norm": 32.609100341796875, - "learning_rate": 8.599779492833518e-06, - "loss": 1.7305, + "epoch": 0.71, + "grad_norm": 41.53786087036133, + "learning_rate": 1.5241601472618502e-05, + "loss": 3.4078, "step": 5688 }, { - "epoch": 1.71, - "grad_norm": 35.52297592163086, - "learning_rate": 8.597774882229128e-06, - "loss": 2.0784, + "epoch": 0.71, + "grad_norm": 29.634103775024414, + "learning_rate": 1.5240764757561814e-05, + "loss": 2.0694, "step": 5689 }, { - "epoch": 1.71, - "grad_norm": 13.570123672485352, - "learning_rate": 8.595770271624738e-06, - "loss": 1.0387, + "epoch": 0.71, + "grad_norm": 7.678778171539307, + "learning_rate": 1.5239928042505126e-05, + "loss": 1.2133, "step": 5690 }, { - "epoch": 1.71, - "grad_norm": 12.182998657226562, - "learning_rate": 8.593765661020348e-06, - "loss": 1.042, + "epoch": 0.71, + "grad_norm": 7.2586798667907715, + "learning_rate": 1.523909132744844e-05, + "loss": 0.7017, "step": 5691 }, { - "epoch": 1.71, - "grad_norm": 56.89901351928711, - "learning_rate": 8.591761050415958e-06, - "loss": 2.2758, + "epoch": 0.71, + "grad_norm": 10.308073997497559, + "learning_rate": 1.5238254612391751e-05, + "loss": 1.672, "step": 5692 }, { - "epoch": 1.71, - "grad_norm": 16.903789520263672, - "learning_rate": 8.589756439811568e-06, - "loss": 1.0774, + "epoch": 0.71, + "grad_norm": 8.929986000061035, + "learning_rate": 1.5237417897335063e-05, + "loss": 0.3281, "step": 5693 }, { - "epoch": 1.71, - "grad_norm": 36.77326583862305, - "learning_rate": 8.587751829207176e-06, - "loss": 2.6291, + "epoch": 0.71, + "grad_norm": 18.599157333374023, + "learning_rate": 1.5236581182278375e-05, + "loss": 1.7576, "step": 5694 }, { - "epoch": 1.71, - "grad_norm": 24.694618225097656, - "learning_rate": 8.585747218602787e-06, - "loss": 1.4135, + "epoch": 0.71, + "grad_norm": 46.4240608215332, + "learning_rate": 1.5235744467221689e-05, + "loss": 1.8176, "step": 5695 }, { - "epoch": 1.71, - "grad_norm": 44.71474075317383, - "learning_rate": 8.583742607998397e-06, - "loss": 1.5254, + "epoch": 0.71, + "grad_norm": 12.64277458190918, + "learning_rate": 1.5234907752165003e-05, + "loss": 1.5387, "step": 5696 }, { - "epoch": 1.71, - "grad_norm": 27.899707794189453, - "learning_rate": 8.581737997394007e-06, - "loss": 1.8512, + "epoch": 0.71, + "grad_norm": 9.800168991088867, + "learning_rate": 1.5234071037108313e-05, + "loss": 0.7847, "step": 5697 }, { - "epoch": 1.71, - "grad_norm": 8.52084732055664, - "learning_rate": 8.579733386789617e-06, - "loss": 1.3119, + "epoch": 0.72, + "grad_norm": 9.509275436401367, + "learning_rate": 1.5233234322051627e-05, + "loss": 0.6097, "step": 5698 }, { - "epoch": 1.71, - "grad_norm": 12.418741226196289, - "learning_rate": 8.577728776185227e-06, - "loss": 1.7178, + "epoch": 0.72, + "grad_norm": 16.417707443237305, + "learning_rate": 1.523239760699494e-05, + "loss": 1.5009, "step": 5699 }, { - "epoch": 1.71, - "grad_norm": 11.793305397033691, - "learning_rate": 8.575724165580837e-06, - "loss": 1.0568, + "epoch": 0.72, + "grad_norm": 10.363656997680664, + "learning_rate": 1.523156089193825e-05, + "loss": 0.9842, "step": 5700 }, { - "epoch": 1.71, - "grad_norm": 9.70165729522705, - "learning_rate": 8.573719554976447e-06, - "loss": 1.6653, + "epoch": 0.72, + "grad_norm": 6.770571231842041, + "learning_rate": 1.5230724176881564e-05, + "loss": 0.5578, "step": 5701 }, { - "epoch": 1.71, - "grad_norm": 15.392131805419922, - "learning_rate": 8.571714944372055e-06, - "loss": 2.2157, + "epoch": 0.72, + "grad_norm": 39.45873260498047, + "learning_rate": 1.5229887461824878e-05, + "loss": 1.904, "step": 5702 }, { - "epoch": 1.71, - "grad_norm": 35.93635177612305, - "learning_rate": 8.569710333767667e-06, - "loss": 1.2638, + "epoch": 0.72, + "grad_norm": 9.785941123962402, + "learning_rate": 1.522905074676819e-05, + "loss": 0.9517, "step": 5703 }, { - "epoch": 1.71, - "grad_norm": 12.030016899108887, - "learning_rate": 8.567705723163277e-06, - "loss": 1.3019, + "epoch": 0.72, + "grad_norm": 8.684469223022461, + "learning_rate": 1.5228214031711502e-05, + "loss": 0.7899, "step": 5704 }, { - "epoch": 1.72, - "grad_norm": 39.288814544677734, - "learning_rate": 8.565701112558886e-06, - "loss": 2.3544, + "epoch": 0.72, + "grad_norm": 14.175126075744629, + "learning_rate": 1.5227377316654814e-05, + "loss": 1.3251, "step": 5705 }, { - "epoch": 1.72, - "grad_norm": 23.307493209838867, - "learning_rate": 8.563696501954496e-06, - "loss": 1.5503, + "epoch": 0.72, + "grad_norm": 11.509045600891113, + "learning_rate": 1.5226540601598127e-05, + "loss": 1.8926, "step": 5706 }, { - "epoch": 1.72, - "grad_norm": 26.393465042114258, - "learning_rate": 8.561691891350106e-06, - "loss": 2.5251, + "epoch": 0.72, + "grad_norm": 44.07933807373047, + "learning_rate": 1.522570388654144e-05, + "loss": 1.7037, "step": 5707 }, { - "epoch": 1.72, - "grad_norm": 10.045159339904785, - "learning_rate": 8.559687280745716e-06, - "loss": 1.1675, + "epoch": 0.72, + "grad_norm": 27.919408798217773, + "learning_rate": 1.5224867171484751e-05, + "loss": 1.7059, "step": 5708 }, { - "epoch": 1.72, - "grad_norm": 35.52949142456055, - "learning_rate": 8.557682670141326e-06, - "loss": 2.1613, + "epoch": 0.72, + "grad_norm": 16.3609619140625, + "learning_rate": 1.5224030456428065e-05, + "loss": 3.2524, "step": 5709 }, { - "epoch": 1.72, - "grad_norm": 19.822158813476562, - "learning_rate": 8.555678059536936e-06, - "loss": 1.5757, + "epoch": 0.72, + "grad_norm": 7.351317405700684, + "learning_rate": 1.5223193741371378e-05, + "loss": 1.9986, "step": 5710 }, { - "epoch": 1.72, - "grad_norm": 41.30690383911133, - "learning_rate": 8.553673448932546e-06, - "loss": 2.7388, + "epoch": 0.72, + "grad_norm": 8.646843910217285, + "learning_rate": 1.5222357026314689e-05, + "loss": 1.075, "step": 5711 }, { - "epoch": 1.72, - "grad_norm": 20.42682456970215, - "learning_rate": 8.551668838328155e-06, - "loss": 2.3481, + "epoch": 0.72, + "grad_norm": 47.738895416259766, + "learning_rate": 1.5221520311258002e-05, + "loss": 2.951, "step": 5712 }, { - "epoch": 1.72, - "grad_norm": 9.312350273132324, - "learning_rate": 8.549664227723765e-06, - "loss": 0.9268, + "epoch": 0.72, + "grad_norm": 28.56525230407715, + "learning_rate": 1.5220683596201316e-05, + "loss": 1.5992, "step": 5713 }, { - "epoch": 1.72, - "grad_norm": 11.393967628479004, - "learning_rate": 8.547659617119375e-06, - "loss": 1.6853, + "epoch": 0.72, + "grad_norm": 11.733442306518555, + "learning_rate": 1.5219846881144626e-05, + "loss": 1.3493, "step": 5714 }, { - "epoch": 1.72, - "grad_norm": 22.255510330200195, - "learning_rate": 8.545655006514987e-06, - "loss": 1.0106, + "epoch": 0.72, + "grad_norm": 9.42813491821289, + "learning_rate": 1.521901016608794e-05, + "loss": 0.8787, "step": 5715 }, { - "epoch": 1.72, - "grad_norm": 12.787086486816406, - "learning_rate": 8.543650395910595e-06, - "loss": 1.6355, + "epoch": 0.72, + "grad_norm": 14.754581451416016, + "learning_rate": 1.5218173451031254e-05, + "loss": 0.965, "step": 5716 }, { - "epoch": 1.72, - "grad_norm": 39.999839782714844, - "learning_rate": 8.541645785306205e-06, - "loss": 1.8123, + "epoch": 0.72, + "grad_norm": 21.997575759887695, + "learning_rate": 1.5217336735974566e-05, + "loss": 1.7967, "step": 5717 }, { - "epoch": 1.72, - "grad_norm": 7.7507853507995605, - "learning_rate": 8.539641174701815e-06, - "loss": 1.4238, + "epoch": 0.72, + "grad_norm": 9.579668045043945, + "learning_rate": 1.5216500020917878e-05, + "loss": 2.0504, "step": 5718 }, { - "epoch": 1.72, - "grad_norm": 15.439056396484375, - "learning_rate": 8.537636564097425e-06, - "loss": 1.7531, + "epoch": 0.72, + "grad_norm": 15.060192108154297, + "learning_rate": 1.521566330586119e-05, + "loss": 1.9743, "step": 5719 }, { - "epoch": 1.72, - "grad_norm": 17.616294860839844, - "learning_rate": 8.535631953493034e-06, - "loss": 1.8694, + "epoch": 0.72, + "grad_norm": 18.11211395263672, + "learning_rate": 1.5214826590804503e-05, + "loss": 2.1337, "step": 5720 }, { - "epoch": 1.72, - "grad_norm": 10.567305564880371, - "learning_rate": 8.533627342888644e-06, - "loss": 1.0895, + "epoch": 0.72, + "grad_norm": 10.534466743469238, + "learning_rate": 1.5213989875747815e-05, + "loss": 1.7234, "step": 5721 }, { - "epoch": 1.72, - "grad_norm": 14.490511894226074, - "learning_rate": 8.531622732284255e-06, - "loss": 1.4669, + "epoch": 0.72, + "grad_norm": 15.524293899536133, + "learning_rate": 1.5213153160691127e-05, + "loss": 0.9391, "step": 5722 }, { - "epoch": 1.72, - "grad_norm": 9.218794822692871, - "learning_rate": 8.529618121679864e-06, - "loss": 1.0658, + "epoch": 0.72, + "grad_norm": 27.32461929321289, + "learning_rate": 1.521231644563444e-05, + "loss": 3.1592, "step": 5723 }, { - "epoch": 1.72, - "grad_norm": 10.850655555725098, - "learning_rate": 8.527613511075474e-06, - "loss": 1.0692, + "epoch": 0.72, + "grad_norm": 16.06114387512207, + "learning_rate": 1.5211479730577754e-05, + "loss": 2.1683, "step": 5724 }, { - "epoch": 1.72, - "grad_norm": 29.35923194885254, - "learning_rate": 8.525608900471084e-06, - "loss": 1.4617, + "epoch": 0.72, + "grad_norm": 5.4787211418151855, + "learning_rate": 1.5210643015521065e-05, + "loss": 0.2532, "step": 5725 }, { - "epoch": 1.72, - "grad_norm": 19.219905853271484, - "learning_rate": 8.523604289866694e-06, - "loss": 2.7337, + "epoch": 0.72, + "grad_norm": 14.662065505981445, + "learning_rate": 1.5209806300464378e-05, + "loss": 2.8309, "step": 5726 }, { - "epoch": 1.72, - "grad_norm": 17.148277282714844, - "learning_rate": 8.521599679262304e-06, - "loss": 1.4809, + "epoch": 0.72, + "grad_norm": 13.122373580932617, + "learning_rate": 1.5208969585407692e-05, + "loss": 1.8849, "step": 5727 }, { - "epoch": 1.72, - "grad_norm": 55.859092712402344, - "learning_rate": 8.519595068657914e-06, - "loss": 1.3051, + "epoch": 0.72, + "grad_norm": 33.124778747558594, + "learning_rate": 1.5208132870351002e-05, + "loss": 2.8645, "step": 5728 }, { - "epoch": 1.72, - "grad_norm": 12.13316822052002, - "learning_rate": 8.517590458053524e-06, - "loss": 1.3944, + "epoch": 0.72, + "grad_norm": 13.683943748474121, + "learning_rate": 1.5207296155294316e-05, + "loss": 1.1949, "step": 5729 }, { - "epoch": 1.72, - "grad_norm": 26.604719161987305, - "learning_rate": 8.515585847449134e-06, - "loss": 1.6258, + "epoch": 0.72, + "grad_norm": 9.916946411132812, + "learning_rate": 1.520645944023763e-05, + "loss": 2.8687, "step": 5730 }, { - "epoch": 1.72, - "grad_norm": 13.429365158081055, - "learning_rate": 8.513581236844743e-06, - "loss": 1.0267, + "epoch": 0.72, + "grad_norm": 15.309993743896484, + "learning_rate": 1.5205622725180941e-05, + "loss": 2.193, "step": 5731 }, { - "epoch": 1.72, - "grad_norm": 13.195096969604492, - "learning_rate": 8.511576626240353e-06, - "loss": 1.1112, + "epoch": 0.72, + "grad_norm": 11.65296745300293, + "learning_rate": 1.5204786010124253e-05, + "loss": 1.0531, "step": 5732 }, { - "epoch": 1.72, - "grad_norm": 19.855567932128906, - "learning_rate": 8.509572015635963e-06, - "loss": 1.1505, + "epoch": 0.72, + "grad_norm": 27.579500198364258, + "learning_rate": 1.5203949295067565e-05, + "loss": 1.1293, "step": 5733 }, { - "epoch": 1.72, - "grad_norm": 13.332557678222656, - "learning_rate": 8.507567405031573e-06, - "loss": 1.5326, + "epoch": 0.72, + "grad_norm": 6.373600006103516, + "learning_rate": 1.5203112580010879e-05, + "loss": 1.607, "step": 5734 }, { - "epoch": 1.72, - "grad_norm": 11.699455261230469, - "learning_rate": 8.505562794427183e-06, - "loss": 1.3027, + "epoch": 0.72, + "grad_norm": 12.031510353088379, + "learning_rate": 1.5202275864954191e-05, + "loss": 1.845, "step": 5735 }, { - "epoch": 1.72, - "grad_norm": 55.0339469909668, - "learning_rate": 8.503558183822793e-06, - "loss": 1.8905, + "epoch": 0.72, + "grad_norm": 6.446319103240967, + "learning_rate": 1.5201439149897503e-05, + "loss": 1.699, "step": 5736 }, { - "epoch": 1.72, - "grad_norm": 14.223106384277344, - "learning_rate": 8.501553573218403e-06, - "loss": 1.1121, + "epoch": 0.72, + "grad_norm": 18.43304443359375, + "learning_rate": 1.5200602434840817e-05, + "loss": 1.6499, "step": 5737 }, { - "epoch": 1.73, - "grad_norm": 19.943954467773438, - "learning_rate": 8.499548962614013e-06, - "loss": 1.9677, + "epoch": 0.72, + "grad_norm": 21.51383399963379, + "learning_rate": 1.519976571978413e-05, + "loss": 1.3497, "step": 5738 }, { - "epoch": 1.73, - "grad_norm": 30.25005340576172, - "learning_rate": 8.497544352009622e-06, - "loss": 1.7023, + "epoch": 0.72, + "grad_norm": 11.108257293701172, + "learning_rate": 1.519892900472744e-05, + "loss": 0.6843, "step": 5739 }, { - "epoch": 1.73, - "grad_norm": 19.0448055267334, - "learning_rate": 8.495539741405234e-06, - "loss": 1.3845, + "epoch": 0.72, + "grad_norm": 13.190654754638672, + "learning_rate": 1.5198092289670754e-05, + "loss": 2.054, "step": 5740 }, { - "epoch": 1.73, - "grad_norm": 16.908838272094727, - "learning_rate": 8.493535130800844e-06, - "loss": 1.7335, + "epoch": 0.72, + "grad_norm": 12.344770431518555, + "learning_rate": 1.5197255574614068e-05, + "loss": 1.5029, "step": 5741 }, { - "epoch": 1.73, - "grad_norm": 24.786378860473633, - "learning_rate": 8.491530520196452e-06, - "loss": 1.7752, + "epoch": 0.72, + "grad_norm": 10.223804473876953, + "learning_rate": 1.5196418859557378e-05, + "loss": 1.5523, "step": 5742 }, { - "epoch": 1.73, - "grad_norm": 14.871647834777832, - "learning_rate": 8.489525909592062e-06, - "loss": 1.7707, + "epoch": 0.72, + "grad_norm": 16.279478073120117, + "learning_rate": 1.5195582144500692e-05, + "loss": 2.2957, "step": 5743 }, { - "epoch": 1.73, - "grad_norm": 9.237364768981934, - "learning_rate": 8.487521298987672e-06, - "loss": 1.2148, + "epoch": 0.72, + "grad_norm": 11.446799278259277, + "learning_rate": 1.5194745429444005e-05, + "loss": 1.5505, "step": 5744 }, { - "epoch": 1.73, - "grad_norm": 20.291860580444336, - "learning_rate": 8.485516688383282e-06, - "loss": 1.3439, + "epoch": 0.72, + "grad_norm": 29.410280227661133, + "learning_rate": 1.5193908714387317e-05, + "loss": 2.2996, "step": 5745 }, { - "epoch": 1.73, - "grad_norm": 23.027454376220703, - "learning_rate": 8.483512077778892e-06, - "loss": 1.6966, + "epoch": 0.72, + "grad_norm": 34.11577606201172, + "learning_rate": 1.519307199933063e-05, + "loss": 1.5296, "step": 5746 }, { - "epoch": 1.73, - "grad_norm": 14.931220054626465, - "learning_rate": 8.481507467174502e-06, - "loss": 1.8485, + "epoch": 0.72, + "grad_norm": 22.040115356445312, + "learning_rate": 1.5192235284273941e-05, + "loss": 1.6252, "step": 5747 }, { - "epoch": 1.73, - "grad_norm": 8.827281951904297, - "learning_rate": 8.479502856570113e-06, - "loss": 0.8949, + "epoch": 0.72, + "grad_norm": 37.460208892822266, + "learning_rate": 1.5191398569217255e-05, + "loss": 2.2588, "step": 5748 }, { - "epoch": 1.73, - "grad_norm": 13.313647270202637, - "learning_rate": 8.477498245965723e-06, - "loss": 1.3746, + "epoch": 0.72, + "grad_norm": 11.320125579833984, + "learning_rate": 1.5190561854160567e-05, + "loss": 2.044, "step": 5749 }, { - "epoch": 1.73, - "grad_norm": 40.448280334472656, - "learning_rate": 8.475493635361331e-06, - "loss": 2.6908, + "epoch": 0.72, + "grad_norm": 15.098015785217285, + "learning_rate": 1.5189725139103879e-05, + "loss": 2.4267, "step": 5750 }, { - "epoch": 1.73, - "grad_norm": 15.871477127075195, - "learning_rate": 8.473489024756941e-06, - "loss": 1.6084, + "epoch": 0.72, + "grad_norm": 12.515155792236328, + "learning_rate": 1.5188888424047192e-05, + "loss": 1.7228, "step": 5751 }, { - "epoch": 1.73, - "grad_norm": 38.03894805908203, - "learning_rate": 8.471484414152553e-06, - "loss": 1.693, + "epoch": 0.72, + "grad_norm": 44.248958587646484, + "learning_rate": 1.5188051708990506e-05, + "loss": 2.0615, "step": 5752 }, { - "epoch": 1.73, - "grad_norm": 53.909751892089844, - "learning_rate": 8.469479803548161e-06, - "loss": 1.3998, + "epoch": 0.72, + "grad_norm": 17.942691802978516, + "learning_rate": 1.5187214993933816e-05, + "loss": 2.337, "step": 5753 }, { - "epoch": 1.73, - "grad_norm": 23.2731876373291, - "learning_rate": 8.467475192943771e-06, - "loss": 1.1304, + "epoch": 0.72, + "grad_norm": 9.37904167175293, + "learning_rate": 1.518637827887713e-05, + "loss": 1.5438, "step": 5754 }, { - "epoch": 1.73, - "grad_norm": 16.913698196411133, - "learning_rate": 8.465470582339381e-06, - "loss": 1.2357, + "epoch": 0.72, + "grad_norm": 21.882314682006836, + "learning_rate": 1.5185541563820444e-05, + "loss": 2.6402, "step": 5755 }, { - "epoch": 1.73, - "grad_norm": 25.4580135345459, - "learning_rate": 8.463465971734992e-06, - "loss": 1.669, + "epoch": 0.72, + "grad_norm": 19.666954040527344, + "learning_rate": 1.5184704848763754e-05, + "loss": 1.7297, "step": 5756 }, { - "epoch": 1.73, - "grad_norm": 17.684297561645508, - "learning_rate": 8.4614613611306e-06, - "loss": 1.8284, + "epoch": 0.72, + "grad_norm": 14.530044555664062, + "learning_rate": 1.5183868133707067e-05, + "loss": 1.3628, "step": 5757 }, { - "epoch": 1.73, - "grad_norm": 11.82136058807373, - "learning_rate": 8.459456750526212e-06, - "loss": 1.5129, + "epoch": 0.72, + "grad_norm": 34.499019622802734, + "learning_rate": 1.518303141865038e-05, + "loss": 2.3917, "step": 5758 }, { - "epoch": 1.73, - "grad_norm": 19.27686882019043, - "learning_rate": 8.457452139921822e-06, - "loss": 2.0577, + "epoch": 0.72, + "grad_norm": 10.190333366394043, + "learning_rate": 1.5182194703593693e-05, + "loss": 3.1606, "step": 5759 }, { - "epoch": 1.73, - "grad_norm": 19.104829788208008, - "learning_rate": 8.45544752931743e-06, - "loss": 1.8845, - "step": 5760 - }, - { - "epoch": 1.73, - "eval_loss": 0.19915775954723358, - "eval_runtime": 43.571, - "eval_samples_per_second": 33.945, - "eval_steps_per_second": 33.945, + "epoch": 0.72, + "grad_norm": 20.318904876708984, + "learning_rate": 1.5181357988537005e-05, + "loss": 2.2891, "step": 5760 }, { - "epoch": 1.73, - "grad_norm": 30.6223201751709, - "learning_rate": 8.45344291871304e-06, - "loss": 1.4588, + "epoch": 0.72, + "grad_norm": 40.29243469238281, + "learning_rate": 1.5180521273480317e-05, + "loss": 0.8837, "step": 5761 }, { - "epoch": 1.73, - "grad_norm": 20.711313247680664, - "learning_rate": 8.45143830810865e-06, - "loss": 1.9552, + "epoch": 0.72, + "grad_norm": 9.452630996704102, + "learning_rate": 1.517968455842363e-05, + "loss": 1.8831, "step": 5762 }, { - "epoch": 1.73, - "grad_norm": 9.937006950378418, - "learning_rate": 8.44943369750426e-06, - "loss": 1.4769, + "epoch": 0.72, + "grad_norm": 5.90389347076416, + "learning_rate": 1.5178847843366943e-05, + "loss": 0.5783, "step": 5763 }, { - "epoch": 1.73, - "grad_norm": 11.456831932067871, - "learning_rate": 8.44742908689987e-06, - "loss": 1.486, + "epoch": 0.72, + "grad_norm": 26.681222915649414, + "learning_rate": 1.5178011128310255e-05, + "loss": 1.5196, "step": 5764 }, { - "epoch": 1.73, - "grad_norm": 11.295050621032715, - "learning_rate": 8.44542447629548e-06, - "loss": 1.1309, + "epoch": 0.72, + "grad_norm": 14.583979606628418, + "learning_rate": 1.5177174413253568e-05, + "loss": 1.7832, "step": 5765 }, { - "epoch": 1.73, - "grad_norm": 10.123924255371094, - "learning_rate": 8.44341986569109e-06, - "loss": 0.8497, + "epoch": 0.72, + "grad_norm": 6.687341690063477, + "learning_rate": 1.5176337698196882e-05, + "loss": 0.9506, "step": 5766 }, { - "epoch": 1.73, - "grad_norm": 23.44426727294922, - "learning_rate": 8.4414152550867e-06, - "loss": 1.9237, + "epoch": 0.72, + "grad_norm": 7.795732498168945, + "learning_rate": 1.5175500983140192e-05, + "loss": 0.8163, "step": 5767 }, { - "epoch": 1.73, - "grad_norm": 23.85992431640625, - "learning_rate": 8.439410644482309e-06, - "loss": 2.8773, + "epoch": 0.72, + "grad_norm": 62.16205978393555, + "learning_rate": 1.5174664268083506e-05, + "loss": 1.8991, "step": 5768 }, { - "epoch": 1.73, - "grad_norm": 12.136680603027344, - "learning_rate": 8.43740603387792e-06, - "loss": 1.9736, + "epoch": 0.72, + "grad_norm": 43.09776306152344, + "learning_rate": 1.517382755302682e-05, + "loss": 1.6252, "step": 5769 }, { - "epoch": 1.73, - "grad_norm": 13.099090576171875, - "learning_rate": 8.43540142327353e-06, - "loss": 1.9405, + "epoch": 0.72, + "grad_norm": 7.008608818054199, + "learning_rate": 1.517299083797013e-05, + "loss": 0.7518, "step": 5770 }, { - "epoch": 1.74, - "grad_norm": 18.678722381591797, - "learning_rate": 8.43339681266914e-06, - "loss": 2.2382, + "epoch": 0.72, + "grad_norm": 4.863678932189941, + "learning_rate": 1.5172154122913443e-05, + "loss": 0.9405, "step": 5771 }, { - "epoch": 1.74, - "grad_norm": 14.240285873413086, - "learning_rate": 8.43139220206475e-06, - "loss": 1.3103, + "epoch": 0.72, + "grad_norm": 11.31428337097168, + "learning_rate": 1.5171317407856755e-05, + "loss": 1.3956, "step": 5772 }, { - "epoch": 1.74, - "grad_norm": 21.930389404296875, - "learning_rate": 8.42938759146036e-06, - "loss": 1.4554, + "epoch": 0.72, + "grad_norm": 18.717117309570312, + "learning_rate": 1.5170480692800069e-05, + "loss": 1.6175, "step": 5773 }, { - "epoch": 1.74, - "grad_norm": 20.750886917114258, - "learning_rate": 8.42738298085597e-06, - "loss": 1.1572, + "epoch": 0.72, + "grad_norm": 23.560714721679688, + "learning_rate": 1.5169643977743381e-05, + "loss": 2.3305, "step": 5774 }, { - "epoch": 1.74, - "grad_norm": 13.55035400390625, - "learning_rate": 8.42537837025158e-06, - "loss": 1.0804, + "epoch": 0.72, + "grad_norm": 17.379194259643555, + "learning_rate": 1.5168807262686693e-05, + "loss": 1.2868, "step": 5775 }, { - "epoch": 1.74, - "grad_norm": 12.210824966430664, - "learning_rate": 8.423373759647188e-06, - "loss": 1.526, + "epoch": 0.72, + "grad_norm": 13.342092514038086, + "learning_rate": 1.5167970547630006e-05, + "loss": 2.5117, "step": 5776 }, { - "epoch": 1.74, - "grad_norm": 9.945907592773438, - "learning_rate": 8.4213691490428e-06, - "loss": 1.1015, + "epoch": 0.73, + "grad_norm": 10.251347541809082, + "learning_rate": 1.5167133832573317e-05, + "loss": 1.5929, "step": 5777 }, { - "epoch": 1.74, - "grad_norm": 16.482894897460938, - "learning_rate": 8.41936453843841e-06, - "loss": 1.9265, + "epoch": 0.73, + "grad_norm": 19.485706329345703, + "learning_rate": 1.516629711751663e-05, + "loss": 1.989, "step": 5778 }, { - "epoch": 1.74, - "grad_norm": 16.412376403808594, - "learning_rate": 8.417359927834018e-06, - "loss": 1.9719, + "epoch": 0.73, + "grad_norm": 17.89291763305664, + "learning_rate": 1.5165460402459944e-05, + "loss": 1.2587, "step": 5779 }, { - "epoch": 1.74, - "grad_norm": 49.32164764404297, - "learning_rate": 8.415355317229628e-06, - "loss": 2.7723, + "epoch": 0.73, + "grad_norm": 20.66066551208496, + "learning_rate": 1.5164623687403258e-05, + "loss": 1.8046, "step": 5780 }, { - "epoch": 1.74, - "grad_norm": 27.84273910522461, - "learning_rate": 8.413350706625239e-06, - "loss": 1.1557, + "epoch": 0.73, + "grad_norm": 25.348182678222656, + "learning_rate": 1.5163786972346568e-05, + "loss": 1.994, "step": 5781 }, { - "epoch": 1.74, - "grad_norm": 25.465152740478516, - "learning_rate": 8.411346096020849e-06, - "loss": 1.241, + "epoch": 0.73, + "grad_norm": 18.08916664123535, + "learning_rate": 1.5162950257289882e-05, + "loss": 3.1051, "step": 5782 }, { - "epoch": 1.74, - "grad_norm": 21.347864151000977, - "learning_rate": 8.409341485416459e-06, - "loss": 2.8817, + "epoch": 0.73, + "grad_norm": 26.334613800048828, + "learning_rate": 1.5162113542233195e-05, + "loss": 0.6722, "step": 5783 }, { - "epoch": 1.74, - "grad_norm": 17.83620834350586, - "learning_rate": 8.407336874812069e-06, - "loss": 1.4303, + "epoch": 0.73, + "grad_norm": 3.4977641105651855, + "learning_rate": 1.5161276827176505e-05, + "loss": 0.4504, "step": 5784 }, { - "epoch": 1.74, - "grad_norm": 24.558000564575195, - "learning_rate": 8.405332264207679e-06, - "loss": 0.9856, + "epoch": 0.73, + "grad_norm": 24.30824089050293, + "learning_rate": 1.5160440112119819e-05, + "loss": 1.6743, "step": 5785 }, { - "epoch": 1.74, - "grad_norm": 16.54327392578125, - "learning_rate": 8.403327653603289e-06, - "loss": 0.702, + "epoch": 0.73, + "grad_norm": 8.112894058227539, + "learning_rate": 1.5159603397063131e-05, + "loss": 1.1564, "step": 5786 }, { - "epoch": 1.74, - "grad_norm": 21.651714324951172, - "learning_rate": 8.401323042998897e-06, - "loss": 1.6273, + "epoch": 0.73, + "grad_norm": 4.117024898529053, + "learning_rate": 1.5158766682006445e-05, + "loss": 0.3427, "step": 5787 }, { - "epoch": 1.74, - "grad_norm": 13.498676300048828, - "learning_rate": 8.399318432394507e-06, - "loss": 0.949, + "epoch": 0.73, + "grad_norm": 16.71449851989746, + "learning_rate": 1.5157929966949757e-05, + "loss": 2.5273, "step": 5788 }, { - "epoch": 1.74, - "grad_norm": 10.88339900970459, - "learning_rate": 8.39731382179012e-06, - "loss": 1.7288, + "epoch": 0.73, + "grad_norm": 12.165425300598145, + "learning_rate": 1.5157093251893069e-05, + "loss": 0.888, "step": 5789 }, { - "epoch": 1.74, - "grad_norm": 10.3096284866333, - "learning_rate": 8.395309211185728e-06, - "loss": 1.1514, + "epoch": 0.73, + "grad_norm": 18.057558059692383, + "learning_rate": 1.5156256536836382e-05, + "loss": 1.0489, "step": 5790 }, { - "epoch": 1.74, - "grad_norm": 17.805374145507812, - "learning_rate": 8.393304600581338e-06, - "loss": 1.5093, + "epoch": 0.73, + "grad_norm": 39.79581069946289, + "learning_rate": 1.5155419821779693e-05, + "loss": 2.0024, "step": 5791 }, { - "epoch": 1.74, - "grad_norm": 16.813570022583008, - "learning_rate": 8.391299989976948e-06, - "loss": 0.8186, + "epoch": 0.73, + "grad_norm": 17.75973129272461, + "learning_rate": 1.5154583106723006e-05, + "loss": 2.2979, "step": 5792 }, { - "epoch": 1.74, - "grad_norm": 20.913787841796875, - "learning_rate": 8.389295379372558e-06, - "loss": 1.7035, + "epoch": 0.73, + "grad_norm": 10.372922897338867, + "learning_rate": 1.515374639166632e-05, + "loss": 0.681, "step": 5793 }, { - "epoch": 1.74, - "grad_norm": 52.88319778442383, - "learning_rate": 8.387290768768166e-06, - "loss": 1.7603, + "epoch": 0.73, + "grad_norm": 15.179542541503906, + "learning_rate": 1.5152909676609633e-05, + "loss": 1.9187, "step": 5794 }, { - "epoch": 1.74, - "grad_norm": 15.752192497253418, - "learning_rate": 8.385286158163778e-06, - "loss": 1.2862, + "epoch": 0.73, + "grad_norm": 12.520872116088867, + "learning_rate": 1.5152072961552944e-05, + "loss": 1.2136, "step": 5795 }, { - "epoch": 1.74, - "grad_norm": 8.408364295959473, - "learning_rate": 8.383281547559388e-06, - "loss": 0.9196, + "epoch": 0.73, + "grad_norm": 33.91791915893555, + "learning_rate": 1.5151236246496257e-05, + "loss": 1.4088, "step": 5796 }, { - "epoch": 1.74, - "grad_norm": 18.868669509887695, - "learning_rate": 8.381276936954997e-06, - "loss": 1.5121, + "epoch": 0.73, + "grad_norm": 23.570266723632812, + "learning_rate": 1.515039953143957e-05, + "loss": 1.559, "step": 5797 }, { - "epoch": 1.74, - "grad_norm": 20.32513999938965, - "learning_rate": 8.379272326350607e-06, - "loss": 2.3994, + "epoch": 0.73, + "grad_norm": 18.323856353759766, + "learning_rate": 1.5149562816382881e-05, + "loss": 1.7075, "step": 5798 }, { - "epoch": 1.74, - "grad_norm": 19.855003356933594, - "learning_rate": 8.377267715746217e-06, - "loss": 1.8385, + "epoch": 0.73, + "grad_norm": 8.110604286193848, + "learning_rate": 1.5148726101326195e-05, + "loss": 0.1192, "step": 5799 }, { - "epoch": 1.74, - "grad_norm": 15.905160903930664, - "learning_rate": 8.375263105141827e-06, - "loss": 1.1867, + "epoch": 0.73, + "grad_norm": 17.01755142211914, + "learning_rate": 1.5147889386269507e-05, + "loss": 2.3614, "step": 5800 }, { - "epoch": 1.74, - "grad_norm": 8.156031608581543, - "learning_rate": 8.373258494537437e-06, - "loss": 0.9248, + "epoch": 0.73, + "grad_norm": 12.760758399963379, + "learning_rate": 1.514705267121282e-05, + "loss": 2.2628, "step": 5801 }, { - "epoch": 1.74, - "grad_norm": 20.510618209838867, - "learning_rate": 8.371253883933047e-06, - "loss": 1.6939, + "epoch": 0.73, + "grad_norm": 27.24703598022461, + "learning_rate": 1.5146215956156133e-05, + "loss": 1.8465, "step": 5802 }, { - "epoch": 1.74, - "grad_norm": 39.78818130493164, - "learning_rate": 8.369249273328657e-06, - "loss": 1.3656, + "epoch": 0.73, + "grad_norm": 11.42320728302002, + "learning_rate": 1.5145379241099444e-05, + "loss": 1.4728, "step": 5803 }, { - "epoch": 1.75, - "grad_norm": 33.16683578491211, - "learning_rate": 8.367244662724267e-06, - "loss": 1.5406, + "epoch": 0.73, + "grad_norm": 15.267542839050293, + "learning_rate": 1.5144542526042758e-05, + "loss": 1.6918, "step": 5804 }, { - "epoch": 1.75, - "grad_norm": 21.53314971923828, - "learning_rate": 8.365240052119875e-06, - "loss": 1.0033, + "epoch": 0.73, + "grad_norm": 11.409517288208008, + "learning_rate": 1.5143705810986068e-05, + "loss": 1.9686, "step": 5805 }, { - "epoch": 1.75, - "grad_norm": 24.56745719909668, - "learning_rate": 8.363235441515486e-06, - "loss": 1.7757, + "epoch": 0.73, + "grad_norm": 23.77625274658203, + "learning_rate": 1.5142869095929382e-05, + "loss": 1.0783, "step": 5806 }, { - "epoch": 1.75, - "grad_norm": 15.24342155456543, - "learning_rate": 8.361230830911096e-06, - "loss": 1.292, + "epoch": 0.73, + "grad_norm": 20.124948501586914, + "learning_rate": 1.5142032380872696e-05, + "loss": 0.981, "step": 5807 }, { - "epoch": 1.75, - "grad_norm": 27.435335159301758, - "learning_rate": 8.359226220306706e-06, - "loss": 1.9828, + "epoch": 0.73, + "grad_norm": 12.48090934753418, + "learning_rate": 1.514119566581601e-05, + "loss": 1.0114, "step": 5808 }, { - "epoch": 1.75, - "grad_norm": 20.47377586364746, - "learning_rate": 8.357221609702316e-06, - "loss": 1.9325, + "epoch": 0.73, + "grad_norm": 11.107354164123535, + "learning_rate": 1.514035895075932e-05, + "loss": 0.9579, "step": 5809 }, { - "epoch": 1.75, - "grad_norm": 6.70631742477417, - "learning_rate": 8.355216999097926e-06, - "loss": 1.1601, + "epoch": 0.73, + "grad_norm": 10.767329216003418, + "learning_rate": 1.5139522235702633e-05, + "loss": 2.4955, "step": 5810 }, { - "epoch": 1.75, - "grad_norm": 13.876242637634277, - "learning_rate": 8.353212388493536e-06, - "loss": 1.3469, + "epoch": 0.73, + "grad_norm": 8.370363235473633, + "learning_rate": 1.5138685520645945e-05, + "loss": 2.1176, "step": 5811 }, { - "epoch": 1.75, - "grad_norm": 22.673198699951172, - "learning_rate": 8.351207777889146e-06, - "loss": 2.9192, + "epoch": 0.73, + "grad_norm": 17.333980560302734, + "learning_rate": 1.5137848805589257e-05, + "loss": 1.7624, "step": 5812 }, { - "epoch": 1.75, - "grad_norm": 9.959436416625977, - "learning_rate": 8.349203167284754e-06, - "loss": 1.4124, + "epoch": 0.73, + "grad_norm": 13.138062477111816, + "learning_rate": 1.513701209053257e-05, + "loss": 1.5703, "step": 5813 }, { - "epoch": 1.75, - "grad_norm": 30.46356773376465, - "learning_rate": 8.347198556680366e-06, - "loss": 2.1221, + "epoch": 0.73, + "grad_norm": 9.520326614379883, + "learning_rate": 1.5136175375475883e-05, + "loss": 0.9554, "step": 5814 }, { - "epoch": 1.75, - "grad_norm": 12.394341468811035, - "learning_rate": 8.345193946075976e-06, - "loss": 1.2795, + "epoch": 0.73, + "grad_norm": 21.114534378051758, + "learning_rate": 1.5135338660419196e-05, + "loss": 2.0713, "step": 5815 }, { - "epoch": 1.75, - "grad_norm": 71.86808013916016, - "learning_rate": 8.343189335471585e-06, - "loss": 3.5364, + "epoch": 0.73, + "grad_norm": 10.720865249633789, + "learning_rate": 1.5134501945362507e-05, + "loss": 1.742, "step": 5816 }, { - "epoch": 1.75, - "grad_norm": 13.528796195983887, - "learning_rate": 8.341184724867195e-06, - "loss": 1.1612, + "epoch": 0.73, + "grad_norm": 27.921756744384766, + "learning_rate": 1.513366523030582e-05, + "loss": 1.9351, "step": 5817 }, { - "epoch": 1.75, - "grad_norm": 38.51426696777344, - "learning_rate": 8.339180114262805e-06, - "loss": 1.6762, + "epoch": 0.73, + "grad_norm": 27.85314178466797, + "learning_rate": 1.5132828515249134e-05, + "loss": 1.4395, "step": 5818 }, { - "epoch": 1.75, - "grad_norm": 15.68415355682373, - "learning_rate": 8.337175503658415e-06, - "loss": 2.1607, + "epoch": 0.73, + "grad_norm": 10.19604778289795, + "learning_rate": 1.5131991800192444e-05, + "loss": 0.805, "step": 5819 }, { - "epoch": 1.75, - "grad_norm": 11.136553764343262, - "learning_rate": 8.335170893054025e-06, - "loss": 0.7781, + "epoch": 0.73, + "grad_norm": 71.00228118896484, + "learning_rate": 1.5131155085135758e-05, + "loss": 3.1386, "step": 5820 }, { - "epoch": 1.75, - "grad_norm": 15.836930274963379, - "learning_rate": 8.333166282449635e-06, - "loss": 1.9072, + "epoch": 0.73, + "grad_norm": 19.628244400024414, + "learning_rate": 1.5130318370079072e-05, + "loss": 1.9822, "step": 5821 }, { - "epoch": 1.75, - "grad_norm": 27.627634048461914, - "learning_rate": 8.331161671845245e-06, - "loss": 1.4755, + "epoch": 0.73, + "grad_norm": 15.459436416625977, + "learning_rate": 1.5129481655022385e-05, + "loss": 1.6848, "step": 5822 }, { - "epoch": 1.75, - "grad_norm": 11.358156204223633, - "learning_rate": 8.329157061240855e-06, - "loss": 1.3935, + "epoch": 0.73, + "grad_norm": 7.346459865570068, + "learning_rate": 1.5128644939965695e-05, + "loss": 2.9543, "step": 5823 }, { - "epoch": 1.75, - "grad_norm": 11.681863784790039, - "learning_rate": 8.327152450636464e-06, - "loss": 1.2521, + "epoch": 0.73, + "grad_norm": 35.106449127197266, + "learning_rate": 1.5127808224909009e-05, + "loss": 2.701, "step": 5824 }, { - "epoch": 1.75, - "grad_norm": 37.17042541503906, - "learning_rate": 8.325147840032074e-06, - "loss": 1.7879, + "epoch": 0.73, + "grad_norm": 67.77783966064453, + "learning_rate": 1.5126971509852321e-05, + "loss": 2.7705, "step": 5825 }, { - "epoch": 1.75, - "grad_norm": 17.618371963500977, - "learning_rate": 8.323143229427686e-06, - "loss": 1.283, + "epoch": 0.73, + "grad_norm": 9.533352851867676, + "learning_rate": 1.5126134794795633e-05, + "loss": 2.0982, "step": 5826 }, { - "epoch": 1.75, - "grad_norm": 13.247238159179688, - "learning_rate": 8.321138618823294e-06, - "loss": 0.9504, + "epoch": 0.73, + "grad_norm": 12.30368423461914, + "learning_rate": 1.5125298079738947e-05, + "loss": 1.8838, "step": 5827 }, { - "epoch": 1.75, - "grad_norm": 22.750028610229492, - "learning_rate": 8.319134008218904e-06, - "loss": 1.7133, + "epoch": 0.73, + "grad_norm": 7.546562194824219, + "learning_rate": 1.5124461364682259e-05, + "loss": 1.8823, "step": 5828 }, { - "epoch": 1.75, - "grad_norm": 23.75035285949707, - "learning_rate": 8.317129397614514e-06, - "loss": 1.4273, + "epoch": 0.73, + "grad_norm": 18.230085372924805, + "learning_rate": 1.5123624649625572e-05, + "loss": 1.4497, "step": 5829 }, { - "epoch": 1.75, - "grad_norm": 17.772930145263672, - "learning_rate": 8.315124787010124e-06, - "loss": 1.4838, + "epoch": 0.73, + "grad_norm": 11.704885482788086, + "learning_rate": 1.5122787934568883e-05, + "loss": 0.8476, "step": 5830 }, { - "epoch": 1.75, - "grad_norm": 14.174525260925293, - "learning_rate": 8.313120176405733e-06, - "loss": 1.5128, + "epoch": 0.73, + "grad_norm": 13.442259788513184, + "learning_rate": 1.5121951219512196e-05, + "loss": 2.872, "step": 5831 }, { - "epoch": 1.75, - "grad_norm": 59.5333366394043, - "learning_rate": 8.311115565801344e-06, - "loss": 1.0576, + "epoch": 0.73, + "grad_norm": 14.087244033813477, + "learning_rate": 1.512111450445551e-05, + "loss": 2.4998, "step": 5832 }, { - "epoch": 1.75, - "grad_norm": 20.53312873840332, - "learning_rate": 8.309110955196954e-06, - "loss": 1.0399, + "epoch": 0.73, + "grad_norm": 2.390308141708374, + "learning_rate": 1.512027778939882e-05, + "loss": 0.0709, "step": 5833 }, { - "epoch": 1.75, - "grad_norm": 31.05029296875, - "learning_rate": 8.307106344592565e-06, - "loss": 1.5115, + "epoch": 0.73, + "grad_norm": 41.68463897705078, + "learning_rate": 1.5119441074342134e-05, + "loss": 3.2485, "step": 5834 }, { - "epoch": 1.75, - "grad_norm": 12.972631454467773, - "learning_rate": 8.305101733988173e-06, - "loss": 1.2375, + "epoch": 0.73, + "grad_norm": 16.793846130371094, + "learning_rate": 1.5118604359285447e-05, + "loss": 0.8453, "step": 5835 }, { - "epoch": 1.75, - "grad_norm": 13.963759422302246, - "learning_rate": 8.303097123383783e-06, - "loss": 1.5626, + "epoch": 0.73, + "grad_norm": 5.7606611251831055, + "learning_rate": 1.5117767644228761e-05, + "loss": 1.9174, "step": 5836 }, { - "epoch": 1.75, - "grad_norm": 11.903511047363281, - "learning_rate": 8.301092512779393e-06, - "loss": 1.4446, + "epoch": 0.73, + "grad_norm": 10.618186950683594, + "learning_rate": 1.5116930929172071e-05, + "loss": 1.0161, "step": 5837 }, { - "epoch": 1.76, - "grad_norm": 18.711076736450195, - "learning_rate": 8.299087902175003e-06, - "loss": 1.3406, + "epoch": 0.73, + "grad_norm": 21.706491470336914, + "learning_rate": 1.5116094214115385e-05, + "loss": 2.4378, "step": 5838 }, { - "epoch": 1.76, - "grad_norm": 84.2789535522461, - "learning_rate": 8.297083291570613e-06, - "loss": 2.1487, + "epoch": 0.73, + "grad_norm": 11.850515365600586, + "learning_rate": 1.5115257499058697e-05, + "loss": 1.4123, "step": 5839 }, { - "epoch": 1.76, - "grad_norm": 22.546630859375, - "learning_rate": 8.295078680966223e-06, - "loss": 1.1753, + "epoch": 0.73, + "grad_norm": 19.344482421875, + "learning_rate": 1.5114420784002009e-05, + "loss": 2.6149, "step": 5840 }, { - "epoch": 1.76, - "grad_norm": 12.994787216186523, - "learning_rate": 8.293074070361833e-06, - "loss": 1.6728, + "epoch": 0.73, + "grad_norm": 35.159759521484375, + "learning_rate": 1.5113584068945322e-05, + "loss": 1.4108, "step": 5841 }, { - "epoch": 1.76, - "grad_norm": 8.531490325927734, - "learning_rate": 8.291069459757442e-06, - "loss": 0.8257, + "epoch": 0.73, + "grad_norm": 14.665122985839844, + "learning_rate": 1.5112747353888634e-05, + "loss": 2.0348, "step": 5842 }, { - "epoch": 1.76, - "grad_norm": 21.086061477661133, - "learning_rate": 8.289064849153052e-06, - "loss": 1.1679, + "epoch": 0.73, + "grad_norm": 12.41344165802002, + "learning_rate": 1.5111910638831948e-05, + "loss": 2.1254, "step": 5843 }, { - "epoch": 1.76, - "grad_norm": 23.762374877929688, - "learning_rate": 8.287060238548664e-06, - "loss": 3.049, + "epoch": 0.73, + "grad_norm": 42.95769119262695, + "learning_rate": 1.5111073923775258e-05, + "loss": 3.2697, "step": 5844 }, { - "epoch": 1.76, - "grad_norm": 18.857986450195312, - "learning_rate": 8.285055627944272e-06, - "loss": 1.5555, + "epoch": 0.73, + "grad_norm": 25.262025833129883, + "learning_rate": 1.5110237208718572e-05, + "loss": 1.1157, "step": 5845 }, { - "epoch": 1.76, - "grad_norm": 11.890861511230469, - "learning_rate": 8.283051017339882e-06, - "loss": 1.1098, + "epoch": 0.73, + "grad_norm": 19.907249450683594, + "learning_rate": 1.5109400493661886e-05, + "loss": 2.3685, "step": 5846 }, { - "epoch": 1.76, - "grad_norm": 8.958333969116211, - "learning_rate": 8.281046406735492e-06, - "loss": 0.8418, + "epoch": 0.73, + "grad_norm": 16.224332809448242, + "learning_rate": 1.5108563778605196e-05, + "loss": 2.0695, "step": 5847 }, { - "epoch": 1.76, - "grad_norm": 15.934253692626953, - "learning_rate": 8.279041796131102e-06, - "loss": 1.3008, + "epoch": 0.73, + "grad_norm": 9.057453155517578, + "learning_rate": 1.510772706354851e-05, + "loss": 2.1087, "step": 5848 }, { - "epoch": 1.76, - "grad_norm": 21.336219787597656, - "learning_rate": 8.277037185526712e-06, - "loss": 1.1645, + "epoch": 0.73, + "grad_norm": 33.40324401855469, + "learning_rate": 1.5106890348491823e-05, + "loss": 0.9799, "step": 5849 }, { - "epoch": 1.76, - "grad_norm": 12.621298789978027, - "learning_rate": 8.27503257492232e-06, - "loss": 2.0196, + "epoch": 0.73, + "grad_norm": 8.36249828338623, + "learning_rate": 1.5106053633435135e-05, + "loss": 0.6396, "step": 5850 }, { - "epoch": 1.76, - "grad_norm": 13.761375427246094, - "learning_rate": 8.273027964317933e-06, - "loss": 2.1065, + "epoch": 0.73, + "grad_norm": 13.222352027893066, + "learning_rate": 1.5105216918378447e-05, + "loss": 2.3163, "step": 5851 }, { - "epoch": 1.76, - "grad_norm": 35.57234191894531, - "learning_rate": 8.271023353713543e-06, - "loss": 1.1288, + "epoch": 0.73, + "grad_norm": 11.83804988861084, + "learning_rate": 1.510438020332176e-05, + "loss": 1.0425, "step": 5852 }, { - "epoch": 1.76, - "grad_norm": 10.00034236907959, - "learning_rate": 8.269018743109151e-06, - "loss": 1.3954, + "epoch": 0.73, + "grad_norm": 8.717399597167969, + "learning_rate": 1.5103543488265073e-05, + "loss": 0.9513, "step": 5853 }, { - "epoch": 1.76, - "grad_norm": 37.05139923095703, - "learning_rate": 8.267014132504761e-06, - "loss": 1.4784, + "epoch": 0.73, + "grad_norm": 16.511960983276367, + "learning_rate": 1.5102706773208385e-05, + "loss": 1.6451, "step": 5854 }, { - "epoch": 1.76, - "grad_norm": 22.268341064453125, - "learning_rate": 8.265009521900371e-06, - "loss": 1.3877, + "epoch": 0.73, + "grad_norm": 20.263565063476562, + "learning_rate": 1.5101870058151698e-05, + "loss": 0.8851, "step": 5855 }, { - "epoch": 1.76, - "grad_norm": 13.753389358520508, - "learning_rate": 8.263004911295981e-06, - "loss": 1.5012, + "epoch": 0.73, + "grad_norm": 11.619148254394531, + "learning_rate": 1.510103334309501e-05, + "loss": 3.0982, "step": 5856 }, { - "epoch": 1.76, - "grad_norm": 18.250154495239258, - "learning_rate": 8.261000300691591e-06, - "loss": 1.7476, + "epoch": 0.74, + "grad_norm": 17.539045333862305, + "learning_rate": 1.5100196628038324e-05, + "loss": 1.0885, "step": 5857 }, { - "epoch": 1.76, - "grad_norm": 14.758729934692383, - "learning_rate": 8.258995690087201e-06, - "loss": 1.5117, + "epoch": 0.74, + "grad_norm": 21.50510597229004, + "learning_rate": 1.5099359912981634e-05, + "loss": 1.0739, "step": 5858 }, { - "epoch": 1.76, - "grad_norm": 20.81943130493164, - "learning_rate": 8.256991079482812e-06, - "loss": 1.363, + "epoch": 0.74, + "grad_norm": 33.4230842590332, + "learning_rate": 1.5098523197924948e-05, + "loss": 2.9656, "step": 5859 }, { - "epoch": 1.76, - "grad_norm": 26.64165496826172, - "learning_rate": 8.254986468878422e-06, - "loss": 1.7073, + "epoch": 0.74, + "grad_norm": 15.180930137634277, + "learning_rate": 1.5097686482868261e-05, + "loss": 1.5781, "step": 5860 }, { - "epoch": 1.76, - "grad_norm": 14.987895965576172, - "learning_rate": 8.25298185827403e-06, - "loss": 0.7878, + "epoch": 0.74, + "grad_norm": 13.385412216186523, + "learning_rate": 1.5096849767811572e-05, + "loss": 2.7543, "step": 5861 }, { - "epoch": 1.76, - "grad_norm": 17.91753387451172, - "learning_rate": 8.25097724766964e-06, - "loss": 1.4102, + "epoch": 0.74, + "grad_norm": 18.107589721679688, + "learning_rate": 1.5096013052754885e-05, + "loss": 2.256, "step": 5862 }, { - "epoch": 1.76, - "grad_norm": 6.570464134216309, - "learning_rate": 8.248972637065252e-06, - "loss": 0.7017, + "epoch": 0.74, + "grad_norm": 25.71607780456543, + "learning_rate": 1.5095176337698199e-05, + "loss": 2.8298, "step": 5863 }, { - "epoch": 1.76, - "grad_norm": 21.44278335571289, - "learning_rate": 8.24696802646086e-06, - "loss": 1.4487, + "epoch": 0.74, + "grad_norm": 92.62855529785156, + "learning_rate": 1.5094339622641511e-05, + "loss": 0.4783, "step": 5864 }, { - "epoch": 1.76, - "grad_norm": 15.875109672546387, - "learning_rate": 8.24496341585647e-06, - "loss": 1.178, + "epoch": 0.74, + "grad_norm": 34.5122184753418, + "learning_rate": 1.5093502907584823e-05, + "loss": 1.4871, "step": 5865 }, { - "epoch": 1.76, - "grad_norm": 36.79011154174805, - "learning_rate": 8.24295880525208e-06, - "loss": 1.9915, + "epoch": 0.74, + "grad_norm": 9.95934772491455, + "learning_rate": 1.5092666192528137e-05, + "loss": 1.7922, "step": 5866 }, { - "epoch": 1.76, - "grad_norm": 21.93402862548828, - "learning_rate": 8.24095419464769e-06, - "loss": 1.4122, + "epoch": 0.74, + "grad_norm": 25.549142837524414, + "learning_rate": 1.5091829477471449e-05, + "loss": 1.5132, "step": 5867 }, { - "epoch": 1.76, - "grad_norm": 23.80208969116211, - "learning_rate": 8.2389495840433e-06, - "loss": 2.1488, + "epoch": 0.74, + "grad_norm": 5.921211242675781, + "learning_rate": 1.509099276241476e-05, + "loss": 0.9918, "step": 5868 }, { - "epoch": 1.76, - "grad_norm": 10.25802993774414, - "learning_rate": 8.23694497343891e-06, - "loss": 1.7158, + "epoch": 0.74, + "grad_norm": 47.82905197143555, + "learning_rate": 1.5090156047358072e-05, + "loss": 2.308, "step": 5869 }, { - "epoch": 1.76, - "grad_norm": 38.02273178100586, - "learning_rate": 8.23494036283452e-06, - "loss": 1.6496, + "epoch": 0.74, + "grad_norm": 6.752317428588867, + "learning_rate": 1.5089319332301386e-05, + "loss": 1.8953, "step": 5870 }, { - "epoch": 1.77, - "grad_norm": 27.261823654174805, - "learning_rate": 8.232935752230131e-06, - "loss": 2.1812, + "epoch": 0.74, + "grad_norm": 10.293950080871582, + "learning_rate": 1.50884826172447e-05, + "loss": 1.0804, "step": 5871 }, { - "epoch": 1.77, - "grad_norm": 10.888236999511719, - "learning_rate": 8.23093114162574e-06, - "loss": 1.3974, + "epoch": 0.74, + "grad_norm": 16.059457778930664, + "learning_rate": 1.508764590218801e-05, + "loss": 3.0737, "step": 5872 }, { - "epoch": 1.77, - "grad_norm": 33.661293029785156, - "learning_rate": 8.22892653102135e-06, - "loss": 2.1975, + "epoch": 0.74, + "grad_norm": 13.257723808288574, + "learning_rate": 1.5086809187131324e-05, + "loss": 1.2577, "step": 5873 }, { - "epoch": 1.77, - "grad_norm": 38.529293060302734, - "learning_rate": 8.22692192041696e-06, - "loss": 1.8204, + "epoch": 0.74, + "grad_norm": 7.22626256942749, + "learning_rate": 1.5085972472074637e-05, + "loss": 1.2732, "step": 5874 }, { - "epoch": 1.77, - "grad_norm": 28.78758430480957, - "learning_rate": 8.22491730981257e-06, - "loss": 2.9932, + "epoch": 0.74, + "grad_norm": 16.098403930664062, + "learning_rate": 1.5085135757017948e-05, + "loss": 2.8811, "step": 5875 }, { - "epoch": 1.77, - "grad_norm": 10.32004165649414, - "learning_rate": 8.22291269920818e-06, - "loss": 2.0646, + "epoch": 0.74, + "grad_norm": 11.780889511108398, + "learning_rate": 1.5084299041961261e-05, + "loss": 0.93, "step": 5876 }, { - "epoch": 1.77, - "grad_norm": 18.924707412719727, - "learning_rate": 8.22090808860379e-06, - "loss": 2.3191, + "epoch": 0.74, + "grad_norm": 14.249544143676758, + "learning_rate": 1.5083462326904575e-05, + "loss": 0.5134, "step": 5877 }, { - "epoch": 1.77, - "grad_norm": 14.681024551391602, - "learning_rate": 8.2189034779994e-06, - "loss": 1.1271, + "epoch": 0.74, + "grad_norm": 12.100791931152344, + "learning_rate": 1.5082625611847887e-05, + "loss": 1.4711, "step": 5878 }, { - "epoch": 1.77, - "grad_norm": 13.027987480163574, - "learning_rate": 8.216898867395008e-06, - "loss": 1.6397, + "epoch": 0.74, + "grad_norm": 7.681851387023926, + "learning_rate": 1.5081788896791199e-05, + "loss": 0.7961, "step": 5879 }, { - "epoch": 1.77, - "grad_norm": 37.9376335144043, - "learning_rate": 8.214894256790618e-06, - "loss": 1.7215, - "step": 5880 - }, - { - "epoch": 1.77, - "eval_loss": 0.19819287955760956, - "eval_runtime": 43.4174, - "eval_samples_per_second": 34.065, - "eval_steps_per_second": 34.065, + "epoch": 0.74, + "grad_norm": 16.879785537719727, + "learning_rate": 1.5080952181734512e-05, + "loss": 2.2265, "step": 5880 }, { - "epoch": 1.77, - "grad_norm": 37.3797492980957, - "learning_rate": 8.21288964618623e-06, - "loss": 2.3832, + "epoch": 0.74, + "grad_norm": 4.811009883880615, + "learning_rate": 1.5080115466677824e-05, + "loss": 1.4758, "step": 5881 }, { - "epoch": 1.77, - "grad_norm": 20.478801727294922, - "learning_rate": 8.210885035581838e-06, - "loss": 1.8481, + "epoch": 0.74, + "grad_norm": 8.104880332946777, + "learning_rate": 1.5079278751621136e-05, + "loss": 0.3154, "step": 5882 }, { - "epoch": 1.77, - "grad_norm": 22.586971282958984, - "learning_rate": 8.208880424977449e-06, - "loss": 1.8847, + "epoch": 0.74, + "grad_norm": 12.702399253845215, + "learning_rate": 1.5078442036564448e-05, + "loss": 1.6427, "step": 5883 }, { - "epoch": 1.77, - "grad_norm": 25.06772232055664, - "learning_rate": 8.206875814373059e-06, - "loss": 2.011, + "epoch": 0.74, + "grad_norm": 12.479487419128418, + "learning_rate": 1.5077605321507762e-05, + "loss": 1.8192, "step": 5884 }, { - "epoch": 1.77, - "grad_norm": 13.59852123260498, - "learning_rate": 8.204871203768669e-06, - "loss": 1.5316, + "epoch": 0.74, + "grad_norm": 14.454097747802734, + "learning_rate": 1.5076768606451076e-05, + "loss": 0.5032, "step": 5885 }, { - "epoch": 1.77, - "grad_norm": 15.501117706298828, - "learning_rate": 8.202866593164279e-06, - "loss": 1.5476, + "epoch": 0.74, + "grad_norm": 16.55340003967285, + "learning_rate": 1.5075931891394386e-05, + "loss": 1.97, "step": 5886 }, { - "epoch": 1.77, - "grad_norm": 15.47221851348877, - "learning_rate": 8.200861982559889e-06, - "loss": 1.5741, + "epoch": 0.74, + "grad_norm": 7.660234451293945, + "learning_rate": 1.50750951763377e-05, + "loss": 1.47, "step": 5887 }, { - "epoch": 1.77, - "grad_norm": 25.984909057617188, - "learning_rate": 8.198857371955499e-06, - "loss": 1.8725, + "epoch": 0.74, + "grad_norm": 20.445825576782227, + "learning_rate": 1.5074258461281013e-05, + "loss": 1.4376, "step": 5888 }, { - "epoch": 1.77, - "grad_norm": 23.49461555480957, - "learning_rate": 8.196852761351109e-06, - "loss": 2.3657, + "epoch": 0.74, + "grad_norm": 18.65691375732422, + "learning_rate": 1.5073421746224323e-05, + "loss": 2.0051, "step": 5889 }, { - "epoch": 1.77, - "grad_norm": 12.506348609924316, - "learning_rate": 8.194848150746717e-06, - "loss": 1.4631, + "epoch": 0.74, + "grad_norm": 10.39387321472168, + "learning_rate": 1.5072585031167637e-05, + "loss": 0.6718, "step": 5890 }, { - "epoch": 1.77, - "grad_norm": 22.092987060546875, - "learning_rate": 8.192843540142327e-06, - "loss": 1.3425, + "epoch": 0.74, + "grad_norm": 9.236499786376953, + "learning_rate": 1.507174831611095e-05, + "loss": 3.0904, "step": 5891 }, { - "epoch": 1.77, - "grad_norm": 24.98388671875, - "learning_rate": 8.190838929537938e-06, - "loss": 2.064, + "epoch": 0.74, + "grad_norm": 18.530906677246094, + "learning_rate": 1.5070911601054261e-05, + "loss": 3.6638, "step": 5892 }, { - "epoch": 1.77, - "grad_norm": 15.064332962036133, - "learning_rate": 8.188834318933548e-06, - "loss": 1.257, + "epoch": 0.74, + "grad_norm": 7.476229667663574, + "learning_rate": 1.5070074885997575e-05, + "loss": 2.8985, "step": 5893 }, { - "epoch": 1.77, - "grad_norm": 54.17677307128906, - "learning_rate": 8.186829708329158e-06, - "loss": 2.009, + "epoch": 0.74, + "grad_norm": 7.896212100982666, + "learning_rate": 1.5069238170940888e-05, + "loss": 0.7143, "step": 5894 }, { - "epoch": 1.77, - "grad_norm": 18.08331298828125, - "learning_rate": 8.184825097724768e-06, - "loss": 1.2969, + "epoch": 0.74, + "grad_norm": 12.695160865783691, + "learning_rate": 1.50684014558842e-05, + "loss": 2.2646, "step": 5895 }, { - "epoch": 1.77, - "grad_norm": 32.95471954345703, - "learning_rate": 8.182820487120378e-06, - "loss": 1.2109, + "epoch": 0.74, + "grad_norm": 9.66424560546875, + "learning_rate": 1.5067564740827512e-05, + "loss": 1.8797, "step": 5896 }, { - "epoch": 1.77, - "grad_norm": 11.22049331665039, - "learning_rate": 8.180815876515988e-06, - "loss": 1.2192, + "epoch": 0.74, + "grad_norm": 15.434563636779785, + "learning_rate": 1.5066728025770824e-05, + "loss": 2.3841, "step": 5897 }, { - "epoch": 1.77, - "grad_norm": 14.255977630615234, - "learning_rate": 8.178811265911596e-06, - "loss": 1.887, + "epoch": 0.74, + "grad_norm": 16.04581642150879, + "learning_rate": 1.5065891310714138e-05, + "loss": 2.2305, "step": 5898 }, { - "epoch": 1.77, - "grad_norm": 12.499604225158691, - "learning_rate": 8.176806655307206e-06, - "loss": 1.3196, + "epoch": 0.74, + "grad_norm": 14.663934707641602, + "learning_rate": 1.506505459565745e-05, + "loss": 2.8839, "step": 5899 }, { - "epoch": 1.77, - "grad_norm": 15.631196975708008, - "learning_rate": 8.174802044702818e-06, - "loss": 1.3063, + "epoch": 0.74, + "grad_norm": 10.927321434020996, + "learning_rate": 1.5064217880600762e-05, + "loss": 0.4926, "step": 5900 }, { - "epoch": 1.77, - "grad_norm": 13.534256935119629, - "learning_rate": 8.172797434098427e-06, - "loss": 1.6275, + "epoch": 0.74, + "grad_norm": 9.020415306091309, + "learning_rate": 1.5063381165544075e-05, + "loss": 2.0736, "step": 5901 }, { - "epoch": 1.77, - "grad_norm": 10.233455657958984, - "learning_rate": 8.170792823494037e-06, - "loss": 1.1873, + "epoch": 0.74, + "grad_norm": 8.869327545166016, + "learning_rate": 1.5062544450487389e-05, + "loss": 1.334, "step": 5902 }, { - "epoch": 1.77, - "grad_norm": 11.301042556762695, - "learning_rate": 8.168788212889647e-06, - "loss": 1.2987, + "epoch": 0.74, + "grad_norm": 12.613329887390137, + "learning_rate": 1.50617077354307e-05, + "loss": 1.3902, "step": 5903 }, { - "epoch": 1.78, - "grad_norm": 46.812076568603516, - "learning_rate": 8.166783602285257e-06, - "loss": 1.7821, + "epoch": 0.74, + "grad_norm": 3.1661107540130615, + "learning_rate": 1.5060871020374013e-05, + "loss": 0.0721, "step": 5904 }, { - "epoch": 1.78, - "grad_norm": 34.08024597167969, - "learning_rate": 8.164778991680867e-06, - "loss": 1.3355, + "epoch": 0.74, + "grad_norm": 4.516031265258789, + "learning_rate": 1.5060034305317327e-05, + "loss": 0.3199, "step": 5905 }, { - "epoch": 1.78, - "grad_norm": 12.29112720489502, - "learning_rate": 8.162774381076477e-06, - "loss": 1.3053, + "epoch": 0.74, + "grad_norm": 35.51274490356445, + "learning_rate": 1.5059197590260637e-05, + "loss": 1.3607, "step": 5906 }, { - "epoch": 1.78, - "grad_norm": 16.215862274169922, - "learning_rate": 8.160769770472087e-06, - "loss": 1.6378, + "epoch": 0.74, + "grad_norm": 12.607372283935547, + "learning_rate": 1.505836087520395e-05, + "loss": 1.5191, "step": 5907 }, { - "epoch": 1.78, - "grad_norm": 33.6343994140625, - "learning_rate": 8.158765159867697e-06, - "loss": 1.9444, + "epoch": 0.74, + "grad_norm": 16.76181411743164, + "learning_rate": 1.5057524160147264e-05, + "loss": 2.4684, "step": 5908 }, { - "epoch": 1.78, - "grad_norm": 27.786399841308594, - "learning_rate": 8.156760549263306e-06, - "loss": 1.1768, + "epoch": 0.74, + "grad_norm": 17.170143127441406, + "learning_rate": 1.5056687445090576e-05, + "loss": 1.527, "step": 5909 }, { - "epoch": 1.78, - "grad_norm": 22.714481353759766, - "learning_rate": 8.154755938658916e-06, - "loss": 1.5109, + "epoch": 0.74, + "grad_norm": 16.721603393554688, + "learning_rate": 1.5055850730033888e-05, + "loss": 1.5672, "step": 5910 }, { - "epoch": 1.78, - "grad_norm": 14.203557968139648, - "learning_rate": 8.152751328054526e-06, - "loss": 1.2664, + "epoch": 0.74, + "grad_norm": 18.294769287109375, + "learning_rate": 1.50550140149772e-05, + "loss": 1.8744, "step": 5911 }, { - "epoch": 1.78, - "grad_norm": 25.13650894165039, - "learning_rate": 8.150746717450136e-06, - "loss": 1.6833, + "epoch": 0.74, + "grad_norm": 12.959049224853516, + "learning_rate": 1.5054177299920514e-05, + "loss": 3.0204, "step": 5912 }, { - "epoch": 1.78, - "grad_norm": 8.750547409057617, - "learning_rate": 8.148742106845746e-06, - "loss": 0.7758, + "epoch": 0.74, + "grad_norm": 35.87045669555664, + "learning_rate": 1.5053340584863826e-05, + "loss": 2.0355, "step": 5913 }, { - "epoch": 1.78, - "grad_norm": 13.47951889038086, - "learning_rate": 8.146737496241356e-06, - "loss": 0.9706, + "epoch": 0.74, + "grad_norm": 27.6667423248291, + "learning_rate": 1.5052503869807138e-05, + "loss": 3.6111, "step": 5914 }, { - "epoch": 1.78, - "grad_norm": 25.456212997436523, - "learning_rate": 8.144732885636966e-06, - "loss": 2.0665, + "epoch": 0.74, + "grad_norm": 10.446527481079102, + "learning_rate": 1.5051667154750451e-05, + "loss": 1.8251, "step": 5915 }, { - "epoch": 1.78, - "grad_norm": 26.591684341430664, - "learning_rate": 8.142728275032575e-06, - "loss": 2.694, + "epoch": 0.74, + "grad_norm": 23.50820541381836, + "learning_rate": 1.5050830439693765e-05, + "loss": 2.3433, "step": 5916 }, { - "epoch": 1.78, - "grad_norm": 141.93113708496094, - "learning_rate": 8.140723664428185e-06, - "loss": 2.0094, + "epoch": 0.74, + "grad_norm": 12.742378234863281, + "learning_rate": 1.5049993724637075e-05, + "loss": 1.8785, "step": 5917 }, { - "epoch": 1.78, - "grad_norm": 14.334453582763672, - "learning_rate": 8.138719053823796e-06, - "loss": 1.156, + "epoch": 0.74, + "grad_norm": 9.84451675415039, + "learning_rate": 1.5049157009580389e-05, + "loss": 0.8814, "step": 5918 }, { - "epoch": 1.78, - "grad_norm": 31.279136657714844, - "learning_rate": 8.136714443219406e-06, - "loss": 1.2964, + "epoch": 0.74, + "grad_norm": 16.8998966217041, + "learning_rate": 1.5048320294523702e-05, + "loss": 1.2168, "step": 5919 }, { - "epoch": 1.78, - "grad_norm": 13.835858345031738, - "learning_rate": 8.134709832615015e-06, - "loss": 0.992, + "epoch": 0.74, + "grad_norm": 43.36582946777344, + "learning_rate": 1.5047483579467013e-05, + "loss": 1.2332, "step": 5920 }, { - "epoch": 1.78, - "grad_norm": 20.324710845947266, - "learning_rate": 8.132705222010625e-06, - "loss": 2.0425, + "epoch": 0.74, + "grad_norm": 10.693442344665527, + "learning_rate": 1.5046646864410326e-05, + "loss": 1.291, "step": 5921 }, { - "epoch": 1.78, - "grad_norm": 9.181856155395508, - "learning_rate": 8.130700611406235e-06, - "loss": 1.2267, + "epoch": 0.74, + "grad_norm": 11.892716407775879, + "learning_rate": 1.5045810149353638e-05, + "loss": 1.3415, "step": 5922 }, { - "epoch": 1.78, - "grad_norm": 12.382397651672363, - "learning_rate": 8.128696000801845e-06, - "loss": 1.4096, + "epoch": 0.74, + "grad_norm": 9.612030029296875, + "learning_rate": 1.5044973434296952e-05, + "loss": 2.065, "step": 5923 }, { - "epoch": 1.78, - "grad_norm": 19.25230598449707, - "learning_rate": 8.126691390197455e-06, - "loss": 1.86, + "epoch": 0.74, + "grad_norm": 462.9150695800781, + "learning_rate": 1.5044136719240264e-05, + "loss": 3.5542, "step": 5924 }, { - "epoch": 1.78, - "grad_norm": 9.224576950073242, - "learning_rate": 8.124686779593065e-06, - "loss": 1.291, + "epoch": 0.74, + "grad_norm": 34.43170166015625, + "learning_rate": 1.5043300004183576e-05, + "loss": 2.6402, "step": 5925 }, { - "epoch": 1.78, - "grad_norm": 26.03525161743164, - "learning_rate": 8.122682168988675e-06, - "loss": 2.0772, + "epoch": 0.74, + "grad_norm": 8.972990989685059, + "learning_rate": 1.504246328912689e-05, + "loss": 1.6312, "step": 5926 }, { - "epoch": 1.78, - "grad_norm": 12.974356651306152, - "learning_rate": 8.120677558384284e-06, - "loss": 1.524, + "epoch": 0.74, + "grad_norm": 19.01601791381836, + "learning_rate": 1.50416265740702e-05, + "loss": 3.8481, "step": 5927 }, { - "epoch": 1.78, - "grad_norm": 9.397273063659668, - "learning_rate": 8.118672947779894e-06, - "loss": 1.2373, + "epoch": 0.74, + "grad_norm": 7.473775386810303, + "learning_rate": 1.5040789859013513e-05, + "loss": 0.6885, "step": 5928 }, { - "epoch": 1.78, - "grad_norm": 12.543724060058594, - "learning_rate": 8.116668337175504e-06, - "loss": 1.4601, + "epoch": 0.74, + "grad_norm": 5.208780765533447, + "learning_rate": 1.5039953143956827e-05, + "loss": 0.7665, "step": 5929 }, { - "epoch": 1.78, - "grad_norm": 10.495887756347656, - "learning_rate": 8.114663726571114e-06, - "loss": 1.3694, + "epoch": 0.74, + "grad_norm": 23.150463104248047, + "learning_rate": 1.503911642890014e-05, + "loss": 1.9791, "step": 5930 }, { - "epoch": 1.78, - "grad_norm": 12.478328704833984, - "learning_rate": 8.112659115966724e-06, - "loss": 1.8051, + "epoch": 0.74, + "grad_norm": 14.49713134765625, + "learning_rate": 1.5038279713843451e-05, + "loss": 1.5597, "step": 5931 }, { - "epoch": 1.78, - "grad_norm": 18.770479202270508, - "learning_rate": 8.110654505362334e-06, - "loss": 1.3905, + "epoch": 0.74, + "grad_norm": 6.02341365814209, + "learning_rate": 1.5037442998786765e-05, + "loss": 0.8754, "step": 5932 }, { - "epoch": 1.78, - "grad_norm": 14.52056884765625, - "learning_rate": 8.108649894757944e-06, - "loss": 1.4971, + "epoch": 0.74, + "grad_norm": 12.140684127807617, + "learning_rate": 1.5036606283730078e-05, + "loss": 1.5937, "step": 5933 }, { - "epoch": 1.78, - "grad_norm": 25.12417221069336, - "learning_rate": 8.106645284153554e-06, - "loss": 1.9117, + "epoch": 0.74, + "grad_norm": 19.201501846313477, + "learning_rate": 1.5035769568673388e-05, + "loss": 1.6273, "step": 5934 }, { - "epoch": 1.78, - "grad_norm": 40.989715576171875, - "learning_rate": 8.104640673549163e-06, - "loss": 2.4517, + "epoch": 0.74, + "grad_norm": 10.110654830932617, + "learning_rate": 1.5034932853616702e-05, + "loss": 1.313, "step": 5935 }, { - "epoch": 1.78, - "grad_norm": 7.442935943603516, - "learning_rate": 8.102636062944773e-06, - "loss": 1.0286, + "epoch": 0.74, + "grad_norm": 18.283098220825195, + "learning_rate": 1.5034096138560014e-05, + "loss": 1.1007, "step": 5936 }, { - "epoch": 1.79, - "grad_norm": 13.013799667358398, - "learning_rate": 8.100631452340385e-06, - "loss": 1.1536, + "epoch": 0.75, + "grad_norm": 10.114410400390625, + "learning_rate": 1.5033259423503328e-05, + "loss": 0.6061, "step": 5937 }, { - "epoch": 1.79, - "grad_norm": 10.014057159423828, - "learning_rate": 8.098626841735993e-06, - "loss": 1.193, + "epoch": 0.75, + "grad_norm": 3.9301300048828125, + "learning_rate": 1.503242270844664e-05, + "loss": 1.2018, "step": 5938 }, { - "epoch": 1.79, - "grad_norm": 24.215864181518555, - "learning_rate": 8.096622231131603e-06, - "loss": 1.3133, + "epoch": 0.75, + "grad_norm": 30.078611373901367, + "learning_rate": 1.5031585993389952e-05, + "loss": 2.5315, "step": 5939 }, { - "epoch": 1.79, - "grad_norm": 14.128569602966309, - "learning_rate": 8.094617620527213e-06, - "loss": 1.0683, + "epoch": 0.75, + "grad_norm": 2.8266096115112305, + "learning_rate": 1.5030749278333265e-05, + "loss": 0.1006, "step": 5940 }, { - "epoch": 1.79, - "grad_norm": 18.90635108947754, - "learning_rate": 8.092613009922823e-06, - "loss": 1.3165, + "epoch": 0.75, + "grad_norm": 7.4051995277404785, + "learning_rate": 1.5029912563276576e-05, + "loss": 0.9698, "step": 5941 }, { - "epoch": 1.79, - "grad_norm": 7.926889896392822, - "learning_rate": 8.090608399318433e-06, - "loss": 1.3542, + "epoch": 0.75, + "grad_norm": 23.7181396484375, + "learning_rate": 1.502907584821989e-05, + "loss": 3.3637, "step": 5942 }, { - "epoch": 1.79, - "grad_norm": 15.462095260620117, - "learning_rate": 8.088603788714043e-06, - "loss": 1.7081, + "epoch": 0.75, + "grad_norm": 15.384878158569336, + "learning_rate": 1.5028239133163203e-05, + "loss": 0.8438, "step": 5943 }, { - "epoch": 1.79, - "grad_norm": 13.095417976379395, - "learning_rate": 8.086599178109653e-06, - "loss": 1.3562, + "epoch": 0.75, + "grad_norm": 12.596027374267578, + "learning_rate": 1.5027402418106516e-05, + "loss": 1.9872, "step": 5944 }, { - "epoch": 1.79, - "grad_norm": 16.457101821899414, - "learning_rate": 8.084594567505264e-06, - "loss": 1.1891, + "epoch": 0.75, + "grad_norm": 10.537801742553711, + "learning_rate": 1.5026565703049827e-05, + "loss": 2.608, "step": 5945 }, { - "epoch": 1.79, - "grad_norm": 9.37852668762207, - "learning_rate": 8.082589956900872e-06, - "loss": 1.3149, + "epoch": 0.75, + "grad_norm": 13.232535362243652, + "learning_rate": 1.502572898799314e-05, + "loss": 1.8912, "step": 5946 }, { - "epoch": 1.79, - "grad_norm": 10.058578491210938, - "learning_rate": 8.080585346296482e-06, - "loss": 1.2129, + "epoch": 0.75, + "grad_norm": 80.67562866210938, + "learning_rate": 1.5024892272936454e-05, + "loss": 1.1722, "step": 5947 }, { - "epoch": 1.79, - "grad_norm": 15.420865058898926, - "learning_rate": 8.078580735692092e-06, - "loss": 2.2392, + "epoch": 0.75, + "grad_norm": 12.835854530334473, + "learning_rate": 1.5024055557879764e-05, + "loss": 1.1671, "step": 5948 }, { - "epoch": 1.79, - "grad_norm": 17.6695556640625, - "learning_rate": 8.076576125087702e-06, - "loss": 1.1624, + "epoch": 0.75, + "grad_norm": 39.22283172607422, + "learning_rate": 1.5023218842823078e-05, + "loss": 1.5886, "step": 5949 }, { - "epoch": 1.79, - "grad_norm": 12.30250072479248, - "learning_rate": 8.074571514483312e-06, - "loss": 1.4646, + "epoch": 0.75, + "grad_norm": 5.5500922203063965, + "learning_rate": 1.502238212776639e-05, + "loss": 0.7196, "step": 5950 }, { - "epoch": 1.79, - "grad_norm": 26.107072830200195, - "learning_rate": 8.072566903878922e-06, - "loss": 1.4681, + "epoch": 0.75, + "grad_norm": 7.820392608642578, + "learning_rate": 1.5021545412709704e-05, + "loss": 0.0653, "step": 5951 }, { - "epoch": 1.79, - "grad_norm": 16.788949966430664, - "learning_rate": 8.070562293274532e-06, - "loss": 1.3324, + "epoch": 0.75, + "grad_norm": 14.047516822814941, + "learning_rate": 1.5020708697653016e-05, + "loss": 0.8211, "step": 5952 }, { - "epoch": 1.79, - "grad_norm": 43.306427001953125, - "learning_rate": 8.068557682670143e-06, - "loss": 1.6477, + "epoch": 0.75, + "grad_norm": 16.972585678100586, + "learning_rate": 1.5019871982596327e-05, + "loss": 2.8933, "step": 5953 }, { - "epoch": 1.79, - "grad_norm": 14.007320404052734, - "learning_rate": 8.066553072065751e-06, - "loss": 0.9753, + "epoch": 0.75, + "grad_norm": 2.9097461700439453, + "learning_rate": 1.5019035267539641e-05, + "loss": 0.1308, "step": 5954 }, { - "epoch": 1.79, - "grad_norm": 9.967177391052246, - "learning_rate": 8.064548461461363e-06, - "loss": 0.8905, + "epoch": 0.75, + "grad_norm": 42.2338981628418, + "learning_rate": 1.5018198552482951e-05, + "loss": 2.3033, "step": 5955 }, { - "epoch": 1.79, - "grad_norm": 63.55514144897461, - "learning_rate": 8.062543850856973e-06, - "loss": 2.9485, + "epoch": 0.75, + "grad_norm": 4.907975673675537, + "learning_rate": 1.5017361837426265e-05, + "loss": 2.3643, "step": 5956 }, { - "epoch": 1.79, - "grad_norm": 18.12586212158203, - "learning_rate": 8.060539240252581e-06, - "loss": 1.9415, + "epoch": 0.75, + "grad_norm": 28.78272819519043, + "learning_rate": 1.5016525122369579e-05, + "loss": 2.56, "step": 5957 }, { - "epoch": 1.79, - "grad_norm": 9.148773193359375, - "learning_rate": 8.058534629648191e-06, - "loss": 1.476, + "epoch": 0.75, + "grad_norm": 6.502068996429443, + "learning_rate": 1.5015688407312892e-05, + "loss": 0.5838, "step": 5958 }, { - "epoch": 1.79, - "grad_norm": 10.708128929138184, - "learning_rate": 8.056530019043801e-06, - "loss": 1.3661, + "epoch": 0.75, + "grad_norm": 24.910249710083008, + "learning_rate": 1.5014851692256203e-05, + "loss": 1.3385, "step": 5959 }, { - "epoch": 1.79, - "grad_norm": 20.42496681213379, - "learning_rate": 8.054525408439411e-06, - "loss": 1.4746, + "epoch": 0.75, + "grad_norm": 18.505050659179688, + "learning_rate": 1.5014014977199516e-05, + "loss": 1.3337, "step": 5960 }, { - "epoch": 1.79, - "grad_norm": 15.104199409484863, - "learning_rate": 8.052520797835022e-06, - "loss": 1.8083, + "epoch": 0.75, + "grad_norm": 13.536693572998047, + "learning_rate": 1.5013178262142828e-05, + "loss": 3.3743, "step": 5961 }, { - "epoch": 1.79, - "grad_norm": 20.05451011657715, - "learning_rate": 8.050516187230632e-06, - "loss": 1.6234, + "epoch": 0.75, + "grad_norm": 9.997637748718262, + "learning_rate": 1.501234154708614e-05, + "loss": 1.1199, "step": 5962 }, { - "epoch": 1.79, - "grad_norm": 23.32365608215332, - "learning_rate": 8.048511576626242e-06, - "loss": 1.5622, + "epoch": 0.75, + "grad_norm": 10.148368835449219, + "learning_rate": 1.5011504832029454e-05, + "loss": 0.7278, "step": 5963 }, { - "epoch": 1.79, - "grad_norm": 17.466800689697266, - "learning_rate": 8.04650696602185e-06, - "loss": 0.6051, + "epoch": 0.75, + "grad_norm": 13.460878372192383, + "learning_rate": 1.5010668116972766e-05, + "loss": 2.8044, "step": 5964 }, { - "epoch": 1.79, - "grad_norm": 26.45103645324707, - "learning_rate": 8.04450235541746e-06, - "loss": 1.5636, + "epoch": 0.75, + "grad_norm": 23.029918670654297, + "learning_rate": 1.500983140191608e-05, + "loss": 2.7466, "step": 5965 }, { - "epoch": 1.79, - "grad_norm": 25.677671432495117, - "learning_rate": 8.04249774481307e-06, - "loss": 1.1426, + "epoch": 0.75, + "grad_norm": 19.22269058227539, + "learning_rate": 1.5008994686859391e-05, + "loss": 3.4052, "step": 5966 }, { - "epoch": 1.79, - "grad_norm": 17.478527069091797, - "learning_rate": 8.04049313420868e-06, - "loss": 0.8924, + "epoch": 0.75, + "grad_norm": 5.88612699508667, + "learning_rate": 1.5008157971802703e-05, + "loss": 0.1444, "step": 5967 }, { - "epoch": 1.79, - "grad_norm": 15.59758472442627, - "learning_rate": 8.03848852360429e-06, - "loss": 1.4224, + "epoch": 0.75, + "grad_norm": 24.3934268951416, + "learning_rate": 1.5007321256746017e-05, + "loss": 2.4715, "step": 5968 }, { - "epoch": 1.79, - "grad_norm": 20.508508682250977, - "learning_rate": 8.0364839129999e-06, - "loss": 2.0134, + "epoch": 0.75, + "grad_norm": 48.39157485961914, + "learning_rate": 1.5006484541689327e-05, + "loss": 0.5484, "step": 5969 }, { - "epoch": 1.79, - "grad_norm": 14.481962203979492, - "learning_rate": 8.03447930239551e-06, - "loss": 1.176, + "epoch": 0.75, + "grad_norm": 9.383609771728516, + "learning_rate": 1.5005647826632641e-05, + "loss": 0.8422, "step": 5970 }, { - "epoch": 1.8, - "grad_norm": 17.673538208007812, - "learning_rate": 8.03247469179112e-06, - "loss": 1.5274, + "epoch": 0.75, + "grad_norm": 10.745577812194824, + "learning_rate": 1.5004811111575955e-05, + "loss": 1.2341, "step": 5971 }, { - "epoch": 1.8, - "grad_norm": 16.05086898803711, - "learning_rate": 8.030470081186729e-06, - "loss": 1.2769, + "epoch": 0.75, + "grad_norm": 22.75591468811035, + "learning_rate": 1.5003974396519268e-05, + "loss": 2.6592, "step": 5972 }, { - "epoch": 1.8, - "grad_norm": 26.023082733154297, - "learning_rate": 8.028465470582339e-06, - "loss": 0.8421, + "epoch": 0.75, + "grad_norm": 17.623111724853516, + "learning_rate": 1.5003137681462578e-05, + "loss": 1.9956, "step": 5973 }, { - "epoch": 1.8, - "grad_norm": 19.35530662536621, - "learning_rate": 8.026460859977951e-06, - "loss": 2.123, + "epoch": 0.75, + "grad_norm": 7.998566150665283, + "learning_rate": 1.5002300966405892e-05, + "loss": 0.7073, "step": 5974 }, { - "epoch": 1.8, - "grad_norm": 45.65658950805664, - "learning_rate": 8.02445624937356e-06, - "loss": 2.4255, + "epoch": 0.75, + "grad_norm": 5.96798038482666, + "learning_rate": 1.5001464251349204e-05, + "loss": 0.475, "step": 5975 }, { - "epoch": 1.8, - "grad_norm": 50.98988342285156, - "learning_rate": 8.02245163876917e-06, - "loss": 2.1818, + "epoch": 0.75, + "grad_norm": 7.934585094451904, + "learning_rate": 1.5000627536292516e-05, + "loss": 0.9308, "step": 5976 }, { - "epoch": 1.8, - "grad_norm": 25.256816864013672, - "learning_rate": 8.02044702816478e-06, - "loss": 2.182, + "epoch": 0.75, + "grad_norm": 4.579667568206787, + "learning_rate": 1.499979082123583e-05, + "loss": 0.6803, "step": 5977 }, { - "epoch": 1.8, - "grad_norm": 18.960912704467773, - "learning_rate": 8.01844241756039e-06, - "loss": 1.4124, + "epoch": 0.75, + "grad_norm": 38.206138610839844, + "learning_rate": 1.4998954106179142e-05, + "loss": 0.7277, "step": 5978 }, { - "epoch": 1.8, - "grad_norm": 20.172710418701172, - "learning_rate": 8.016437806956e-06, - "loss": 0.64, + "epoch": 0.75, + "grad_norm": 4.93873929977417, + "learning_rate": 1.4998117391122455e-05, + "loss": 0.3812, "step": 5979 }, { - "epoch": 1.8, - "grad_norm": 16.025949478149414, - "learning_rate": 8.01443319635161e-06, - "loss": 1.1168, + "epoch": 0.75, + "grad_norm": 19.105716705322266, + "learning_rate": 1.4997280676065766e-05, + "loss": 0.8092, "step": 5980 }, { - "epoch": 1.8, - "grad_norm": 19.96694564819336, - "learning_rate": 8.01242858574722e-06, - "loss": 0.9764, + "epoch": 0.75, + "grad_norm": 9.995222091674805, + "learning_rate": 1.4996443961009079e-05, + "loss": 1.533, "step": 5981 }, { - "epoch": 1.8, - "grad_norm": 28.706392288208008, - "learning_rate": 8.01042397514283e-06, - "loss": 1.6412, + "epoch": 0.75, + "grad_norm": 8.4161958694458, + "learning_rate": 1.4995607245952393e-05, + "loss": 0.6504, "step": 5982 }, { - "epoch": 1.8, - "grad_norm": 9.621232032775879, - "learning_rate": 8.008419364538438e-06, - "loss": 0.9458, + "epoch": 0.75, + "grad_norm": 43.21015548706055, + "learning_rate": 1.4994770530895703e-05, + "loss": 1.8054, "step": 5983 }, { - "epoch": 1.8, - "grad_norm": 50.64955520629883, - "learning_rate": 8.006414753934048e-06, - "loss": 1.558, + "epoch": 0.75, + "grad_norm": 16.76747703552246, + "learning_rate": 1.4993933815839017e-05, + "loss": 3.7512, "step": 5984 }, { - "epoch": 1.8, - "grad_norm": 54.46828842163086, - "learning_rate": 8.004410143329658e-06, - "loss": 2.4981, + "epoch": 0.75, + "grad_norm": 9.055212020874023, + "learning_rate": 1.499309710078233e-05, + "loss": 1.5094, "step": 5985 }, { - "epoch": 1.8, - "grad_norm": 41.00409698486328, - "learning_rate": 8.002405532725269e-06, - "loss": 1.5329, + "epoch": 0.75, + "grad_norm": 48.513824462890625, + "learning_rate": 1.4992260385725644e-05, + "loss": 2.6326, "step": 5986 }, { - "epoch": 1.8, - "grad_norm": 14.016818046569824, - "learning_rate": 8.000400922120879e-06, - "loss": 1.4779, + "epoch": 0.75, + "grad_norm": 10.663233757019043, + "learning_rate": 1.4991423670668954e-05, + "loss": 1.4526, "step": 5987 }, { - "epoch": 1.8, - "grad_norm": 46.308258056640625, - "learning_rate": 7.998396311516489e-06, - "loss": 1.1857, + "epoch": 0.75, + "grad_norm": 14.359576225280762, + "learning_rate": 1.4990586955612268e-05, + "loss": 2.2563, "step": 5988 }, { - "epoch": 1.8, - "grad_norm": 10.816703796386719, - "learning_rate": 7.996391700912099e-06, - "loss": 1.2238, + "epoch": 0.75, + "grad_norm": 21.671953201293945, + "learning_rate": 1.498975024055558e-05, + "loss": 1.9075, "step": 5989 }, { - "epoch": 1.8, - "grad_norm": 14.56143856048584, - "learning_rate": 7.994387090307709e-06, - "loss": 1.5197, + "epoch": 0.75, + "grad_norm": 5.257259845733643, + "learning_rate": 1.4988913525498892e-05, + "loss": 1.7287, "step": 5990 }, { - "epoch": 1.8, - "grad_norm": 13.148183822631836, - "learning_rate": 7.992382479703317e-06, - "loss": 1.0046, + "epoch": 0.75, + "grad_norm": 22.53580665588379, + "learning_rate": 1.4988076810442205e-05, + "loss": 0.7538, "step": 5991 }, { - "epoch": 1.8, - "grad_norm": 54.134552001953125, - "learning_rate": 7.990377869098929e-06, - "loss": 1.8494, + "epoch": 0.75, + "grad_norm": 24.42538833618164, + "learning_rate": 1.4987240095385517e-05, + "loss": 2.8615, "step": 5992 }, { - "epoch": 1.8, - "grad_norm": 12.82621955871582, - "learning_rate": 7.988373258494539e-06, - "loss": 1.4167, + "epoch": 0.75, + "grad_norm": 7.242316722869873, + "learning_rate": 1.4986403380328831e-05, + "loss": 2.4656, "step": 5993 }, { - "epoch": 1.8, - "grad_norm": 14.938492774963379, - "learning_rate": 7.986368647890148e-06, - "loss": 0.6812, + "epoch": 0.75, + "grad_norm": 15.731229782104492, + "learning_rate": 1.4985566665272141e-05, + "loss": 1.8357, "step": 5994 }, { - "epoch": 1.8, - "grad_norm": 39.79289627075195, - "learning_rate": 7.984364037285758e-06, - "loss": 4.1951, + "epoch": 0.75, + "grad_norm": 17.896705627441406, + "learning_rate": 1.4984729950215455e-05, + "loss": 2.1809, "step": 5995 }, { - "epoch": 1.8, - "grad_norm": 12.317481994628906, - "learning_rate": 7.982359426681368e-06, - "loss": 1.0172, + "epoch": 0.75, + "grad_norm": 3.5809171199798584, + "learning_rate": 1.4983893235158769e-05, + "loss": 0.3103, "step": 5996 }, { - "epoch": 1.8, - "grad_norm": 28.569238662719727, - "learning_rate": 7.980354816076978e-06, - "loss": 2.539, + "epoch": 0.75, + "grad_norm": 11.365817070007324, + "learning_rate": 1.4983056520102079e-05, + "loss": 0.815, "step": 5997 }, { - "epoch": 1.8, - "grad_norm": 28.877849578857422, - "learning_rate": 7.978350205472588e-06, - "loss": 1.4473, + "epoch": 0.75, + "grad_norm": 29.889799118041992, + "learning_rate": 1.4982219805045393e-05, + "loss": 2.3377, "step": 5998 }, { - "epoch": 1.8, - "grad_norm": 42.884979248046875, - "learning_rate": 7.976345594868198e-06, - "loss": 0.6649, + "epoch": 0.75, + "grad_norm": 100.96052551269531, + "learning_rate": 1.4981383089988706e-05, + "loss": 2.4042, "step": 5999 }, { - "epoch": 1.8, - "grad_norm": 8.961115837097168, - "learning_rate": 7.974340984263808e-06, - "loss": 1.1251, + "epoch": 0.75, + "grad_norm": 27.039257049560547, + "learning_rate": 1.498054637493202e-05, + "loss": 3.018, "step": 6000 }, { - "epoch": 1.8, - "eval_loss": 0.19368119537830353, - "eval_runtime": 43.5079, - "eval_samples_per_second": 33.994, - "eval_steps_per_second": 33.994, + "epoch": 0.75, + "eval_loss": 0.11360074579715729, + "eval_runtime": 95.1759, + "eval_samples_per_second": 37.215, + "eval_steps_per_second": 37.215, "step": 6000 }, { - "epoch": 1.8, - "grad_norm": 72.2662124633789, - "learning_rate": 7.972336373659416e-06, - "loss": 2.015, + "epoch": 0.75, + "grad_norm": 4.2123284339904785, + "learning_rate": 1.497970965987533e-05, + "loss": 0.4822, "step": 6001 }, { - "epoch": 1.8, - "grad_norm": 16.20361328125, - "learning_rate": 7.970331763055027e-06, - "loss": 1.2574, + "epoch": 0.75, + "grad_norm": 20.182884216308594, + "learning_rate": 1.4978872944818644e-05, + "loss": 3.4223, "step": 6002 }, { - "epoch": 1.8, - "grad_norm": 20.717201232910156, - "learning_rate": 7.968327152450637e-06, - "loss": 1.5776, + "epoch": 0.75, + "grad_norm": 7.4854559898376465, + "learning_rate": 1.4978036229761956e-05, + "loss": 1.7623, "step": 6003 }, { - "epoch": 1.81, - "grad_norm": 23.648340225219727, - "learning_rate": 7.966322541846248e-06, - "loss": 1.8967, + "epoch": 0.75, + "grad_norm": 10.160518646240234, + "learning_rate": 1.4977199514705268e-05, + "loss": 0.7109, "step": 6004 }, { - "epoch": 1.81, - "grad_norm": 28.407695770263672, - "learning_rate": 7.964317931241857e-06, - "loss": 2.1085, + "epoch": 0.75, + "grad_norm": 7.039008140563965, + "learning_rate": 1.4976362799648581e-05, + "loss": 2.4178, "step": 6005 }, { - "epoch": 1.81, - "grad_norm": 13.047331809997559, - "learning_rate": 7.962313320637467e-06, - "loss": 1.9436, + "epoch": 0.75, + "grad_norm": 16.30356216430664, + "learning_rate": 1.4975526084591893e-05, + "loss": 2.6283, "step": 6006 }, { - "epoch": 1.81, - "grad_norm": 12.131548881530762, - "learning_rate": 7.960308710033077e-06, - "loss": 1.3458, + "epoch": 0.75, + "grad_norm": 7.128363609313965, + "learning_rate": 1.4974689369535207e-05, + "loss": 1.4611, "step": 6007 }, { - "epoch": 1.81, - "grad_norm": 15.92016315460205, - "learning_rate": 7.958304099428687e-06, - "loss": 0.8177, + "epoch": 0.75, + "grad_norm": 4.830732822418213, + "learning_rate": 1.4973852654478517e-05, + "loss": 0.7371, "step": 6008 }, { - "epoch": 1.81, - "grad_norm": 9.055414199829102, - "learning_rate": 7.956299488824295e-06, - "loss": 1.3297, + "epoch": 0.75, + "grad_norm": 16.139087677001953, + "learning_rate": 1.497301593942183e-05, + "loss": 2.0725, "step": 6009 }, { - "epoch": 1.81, - "grad_norm": 13.94730281829834, - "learning_rate": 7.954294878219907e-06, - "loss": 1.4445, + "epoch": 0.75, + "grad_norm": 16.47942352294922, + "learning_rate": 1.4972179224365144e-05, + "loss": 1.6814, "step": 6010 }, { - "epoch": 1.81, - "grad_norm": 17.391616821289062, - "learning_rate": 7.952290267615517e-06, - "loss": 0.9038, + "epoch": 0.75, + "grad_norm": 17.696067810058594, + "learning_rate": 1.4971342509308455e-05, + "loss": 2.2306, "step": 6011 }, { - "epoch": 1.81, - "grad_norm": 198.21731567382812, - "learning_rate": 7.950285657011126e-06, - "loss": 2.3957, + "epoch": 0.75, + "grad_norm": 13.801817893981934, + "learning_rate": 1.4970505794251768e-05, + "loss": 1.6959, "step": 6012 }, { - "epoch": 1.81, - "grad_norm": 12.243670463562012, - "learning_rate": 7.948281046406736e-06, - "loss": 1.4155, + "epoch": 0.75, + "grad_norm": 21.30613899230957, + "learning_rate": 1.4969669079195082e-05, + "loss": 3.468, "step": 6013 }, { - "epoch": 1.81, - "grad_norm": 13.951066017150879, - "learning_rate": 7.946276435802346e-06, - "loss": 1.4097, + "epoch": 0.75, + "grad_norm": 45.791160583496094, + "learning_rate": 1.4968832364138394e-05, + "loss": 3.4723, "step": 6014 }, { - "epoch": 1.81, - "grad_norm": 24.333097457885742, - "learning_rate": 7.944271825197956e-06, - "loss": 2.4542, + "epoch": 0.75, + "grad_norm": 9.92892074584961, + "learning_rate": 1.4967995649081706e-05, + "loss": 2.3938, "step": 6015 }, { - "epoch": 1.81, - "grad_norm": 17.491724014282227, - "learning_rate": 7.942267214593566e-06, - "loss": 1.649, + "epoch": 0.75, + "grad_norm": 22.17215919494629, + "learning_rate": 1.496715893402502e-05, + "loss": 1.9325, "step": 6016 }, { - "epoch": 1.81, - "grad_norm": 11.039308547973633, - "learning_rate": 7.940262603989176e-06, - "loss": 1.1686, + "epoch": 0.76, + "grad_norm": 59.155479431152344, + "learning_rate": 1.4966322218968332e-05, + "loss": 3.5734, "step": 6017 }, { - "epoch": 1.81, - "grad_norm": 18.201019287109375, - "learning_rate": 7.938257993384786e-06, - "loss": 1.5321, + "epoch": 0.76, + "grad_norm": 16.35432243347168, + "learning_rate": 1.4965485503911644e-05, + "loss": 2.029, "step": 6018 }, { - "epoch": 1.81, - "grad_norm": 29.47193717956543, - "learning_rate": 7.936253382780396e-06, - "loss": 1.5978, + "epoch": 0.76, + "grad_norm": 12.570013999938965, + "learning_rate": 1.4964648788854957e-05, + "loss": 1.008, "step": 6019 }, { - "epoch": 1.81, - "grad_norm": 24.67177391052246, - "learning_rate": 7.934248772176005e-06, - "loss": 2.0138, + "epoch": 0.76, + "grad_norm": 34.56005096435547, + "learning_rate": 1.4963812073798269e-05, + "loss": 1.8091, "step": 6020 }, { - "epoch": 1.81, - "grad_norm": 25.735088348388672, - "learning_rate": 7.932244161571615e-06, - "loss": 3.0204, + "epoch": 0.76, + "grad_norm": 33.60686111450195, + "learning_rate": 1.4962975358741583e-05, + "loss": 1.8547, "step": 6021 }, { - "epoch": 1.81, - "grad_norm": 63.19715118408203, - "learning_rate": 7.930239550967225e-06, - "loss": 1.5556, + "epoch": 0.76, + "grad_norm": 17.62992286682129, + "learning_rate": 1.4962138643684893e-05, + "loss": 0.9139, "step": 6022 }, { - "epoch": 1.81, - "grad_norm": 12.563130378723145, - "learning_rate": 7.928234940362835e-06, - "loss": 1.3196, + "epoch": 0.76, + "grad_norm": 10.80237865447998, + "learning_rate": 1.4961301928628207e-05, + "loss": 1.0544, "step": 6023 }, { - "epoch": 1.81, - "grad_norm": 21.176742553710938, - "learning_rate": 7.926230329758445e-06, - "loss": 2.416, + "epoch": 0.76, + "grad_norm": 13.512557983398438, + "learning_rate": 1.496046521357152e-05, + "loss": 1.1676, "step": 6024 }, { - "epoch": 1.81, - "grad_norm": 14.44190502166748, - "learning_rate": 7.924225719154055e-06, - "loss": 1.14, + "epoch": 0.76, + "grad_norm": 8.398104667663574, + "learning_rate": 1.495962849851483e-05, + "loss": 2.2205, "step": 6025 }, { - "epoch": 1.81, - "grad_norm": 10.227465629577637, - "learning_rate": 7.922221108549665e-06, - "loss": 0.9925, + "epoch": 0.76, + "grad_norm": 24.02195167541504, + "learning_rate": 1.4958791783458144e-05, + "loss": 4.6731, "step": 6026 }, { - "epoch": 1.81, - "grad_norm": 16.822582244873047, - "learning_rate": 7.920216497945275e-06, - "loss": 1.413, + "epoch": 0.76, + "grad_norm": 34.829097747802734, + "learning_rate": 1.4957955068401458e-05, + "loss": 2.6504, "step": 6027 }, { - "epoch": 1.81, - "grad_norm": 23.78660774230957, - "learning_rate": 7.918211887340884e-06, - "loss": 2.2229, + "epoch": 0.76, + "grad_norm": 21.19928741455078, + "learning_rate": 1.495711835334477e-05, + "loss": 2.3643, "step": 6028 }, { - "epoch": 1.81, - "grad_norm": 22.946805953979492, - "learning_rate": 7.916207276736495e-06, - "loss": 1.9366, + "epoch": 0.76, + "grad_norm": 11.841145515441895, + "learning_rate": 1.4956281638288082e-05, + "loss": 0.8766, "step": 6029 }, { - "epoch": 1.81, - "grad_norm": 11.842130661010742, - "learning_rate": 7.914202666132105e-06, - "loss": 2.1797, + "epoch": 0.76, + "grad_norm": 5.688900947570801, + "learning_rate": 1.4955444923231395e-05, + "loss": 1.7931, "step": 6030 }, { - "epoch": 1.81, - "grad_norm": 11.936902046203613, - "learning_rate": 7.912198055527714e-06, - "loss": 0.9288, + "epoch": 0.76, + "grad_norm": 12.185578346252441, + "learning_rate": 1.4954608208174707e-05, + "loss": 1.2761, "step": 6031 }, { - "epoch": 1.81, - "grad_norm": 12.97545337677002, - "learning_rate": 7.910193444923324e-06, - "loss": 0.8792, + "epoch": 0.76, + "grad_norm": 19.21368980407715, + "learning_rate": 1.495377149311802e-05, + "loss": 1.5665, "step": 6032 }, { - "epoch": 1.81, - "grad_norm": 22.72028160095215, - "learning_rate": 7.908188834318934e-06, - "loss": 1.8991, + "epoch": 0.76, + "grad_norm": 11.331363677978516, + "learning_rate": 1.4952934778061331e-05, + "loss": 2.9063, "step": 6033 }, { - "epoch": 1.81, - "grad_norm": 68.23728942871094, - "learning_rate": 7.906184223714544e-06, - "loss": 2.0705, + "epoch": 0.76, + "grad_norm": 16.766036987304688, + "learning_rate": 1.4952098063004645e-05, + "loss": 2.0617, "step": 6034 }, { - "epoch": 1.81, - "grad_norm": 17.14826202392578, - "learning_rate": 7.904179613110154e-06, - "loss": 1.9903, + "epoch": 0.76, + "grad_norm": 18.78944206237793, + "learning_rate": 1.4951261347947959e-05, + "loss": 1.3033, "step": 6035 }, { - "epoch": 1.81, - "grad_norm": 33.424381256103516, - "learning_rate": 7.902175002505764e-06, - "loss": 1.5851, + "epoch": 0.76, + "grad_norm": 19.58803939819336, + "learning_rate": 1.4950424632891269e-05, + "loss": 2.1772, "step": 6036 }, { - "epoch": 1.82, - "grad_norm": 21.694814682006836, - "learning_rate": 7.900170391901374e-06, - "loss": 2.1811, + "epoch": 0.76, + "grad_norm": 24.190006256103516, + "learning_rate": 1.4949587917834583e-05, + "loss": 1.8668, "step": 6037 }, { - "epoch": 1.82, - "grad_norm": 11.936399459838867, - "learning_rate": 7.898165781296984e-06, - "loss": 1.4652, + "epoch": 0.76, + "grad_norm": 14.641880989074707, + "learning_rate": 1.4948751202777896e-05, + "loss": 2.1728, "step": 6038 }, { - "epoch": 1.82, - "grad_norm": 10.102071762084961, - "learning_rate": 7.896161170692593e-06, - "loss": 1.9421, + "epoch": 0.76, + "grad_norm": 20.583711624145508, + "learning_rate": 1.4947914487721206e-05, + "loss": 2.317, "step": 6039 }, { - "epoch": 1.82, - "grad_norm": 32.03966522216797, - "learning_rate": 7.894156560088203e-06, - "loss": 1.4121, + "epoch": 0.76, + "grad_norm": 16.76109504699707, + "learning_rate": 1.494707777266452e-05, + "loss": 1.2081, "step": 6040 }, { - "epoch": 1.82, - "grad_norm": 31.259422302246094, - "learning_rate": 7.892151949483815e-06, - "loss": 1.255, + "epoch": 0.76, + "grad_norm": 61.240535736083984, + "learning_rate": 1.4946241057607834e-05, + "loss": 2.282, "step": 6041 }, { - "epoch": 1.82, - "grad_norm": 11.89094352722168, - "learning_rate": 7.890147338879423e-06, - "loss": 1.1638, + "epoch": 0.76, + "grad_norm": 13.91823673248291, + "learning_rate": 1.4945404342551146e-05, + "loss": 3.5434, "step": 6042 }, { - "epoch": 1.82, - "grad_norm": 16.84495735168457, - "learning_rate": 7.888142728275033e-06, - "loss": 1.4586, + "epoch": 0.76, + "grad_norm": 7.848240852355957, + "learning_rate": 1.4944567627494458e-05, + "loss": 2.3143, "step": 6043 }, { - "epoch": 1.82, - "grad_norm": 9.196840286254883, - "learning_rate": 7.886138117670643e-06, - "loss": 2.2115, + "epoch": 0.76, + "grad_norm": 4.802414894104004, + "learning_rate": 1.4943730912437771e-05, + "loss": 0.6465, "step": 6044 }, { - "epoch": 1.82, - "grad_norm": 20.001394271850586, - "learning_rate": 7.884133507066253e-06, - "loss": 1.0957, + "epoch": 0.76, + "grad_norm": 17.290687561035156, + "learning_rate": 1.4942894197381083e-05, + "loss": 1.7776, "step": 6045 }, { - "epoch": 1.82, - "grad_norm": 30.092775344848633, - "learning_rate": 7.882128896461862e-06, - "loss": 1.6151, + "epoch": 0.76, + "grad_norm": 22.302507400512695, + "learning_rate": 1.4942057482324395e-05, + "loss": 0.5936, "step": 6046 }, { - "epoch": 1.82, - "grad_norm": 10.02807331085205, - "learning_rate": 7.880124285857474e-06, - "loss": 0.872, + "epoch": 0.76, + "grad_norm": 5.213108062744141, + "learning_rate": 1.4941220767267707e-05, + "loss": 0.6772, "step": 6047 }, { - "epoch": 1.82, - "grad_norm": 21.552358627319336, - "learning_rate": 7.878119675253084e-06, - "loss": 1.4557, + "epoch": 0.76, + "grad_norm": 12.678808212280273, + "learning_rate": 1.494038405221102e-05, + "loss": 1.4635, "step": 6048 }, { - "epoch": 1.82, - "grad_norm": 21.00843620300293, - "learning_rate": 7.876115064648692e-06, - "loss": 1.379, + "epoch": 0.76, + "grad_norm": 7.262757301330566, + "learning_rate": 1.4939547337154334e-05, + "loss": 3.6633, "step": 6049 }, { - "epoch": 1.82, - "grad_norm": 16.021032333374023, - "learning_rate": 7.874110454044302e-06, - "loss": 1.8451, + "epoch": 0.76, + "grad_norm": 10.369536399841309, + "learning_rate": 1.4938710622097645e-05, + "loss": 0.4444, "step": 6050 }, { - "epoch": 1.82, - "grad_norm": 14.853699684143066, - "learning_rate": 7.872105843439912e-06, - "loss": 1.125, + "epoch": 0.76, + "grad_norm": 14.876389503479004, + "learning_rate": 1.4937873907040958e-05, + "loss": 1.0291, "step": 6051 }, { - "epoch": 1.82, - "grad_norm": 20.046295166015625, - "learning_rate": 7.870101232835522e-06, - "loss": 1.9366, + "epoch": 0.76, + "grad_norm": 8.755902290344238, + "learning_rate": 1.4937037191984272e-05, + "loss": 1.6724, "step": 6052 }, { - "epoch": 1.82, - "grad_norm": 17.23594856262207, - "learning_rate": 7.868096622231132e-06, - "loss": 1.6949, + "epoch": 0.76, + "grad_norm": 15.990140914916992, + "learning_rate": 1.4936200476927582e-05, + "loss": 2.0313, "step": 6053 }, { - "epoch": 1.82, - "grad_norm": 16.25902557373047, - "learning_rate": 7.866092011626742e-06, - "loss": 1.9581, + "epoch": 0.76, + "grad_norm": 4.509998321533203, + "learning_rate": 1.4935363761870896e-05, + "loss": 0.436, "step": 6054 }, { - "epoch": 1.82, - "grad_norm": 25.339801788330078, - "learning_rate": 7.864087401022353e-06, - "loss": 1.4696, + "epoch": 0.76, + "grad_norm": 14.44178295135498, + "learning_rate": 1.493452704681421e-05, + "loss": 2.0875, "step": 6055 }, { - "epoch": 1.82, - "grad_norm": 45.478515625, - "learning_rate": 7.862082790417963e-06, - "loss": 2.0867, + "epoch": 0.76, + "grad_norm": 13.471675872802734, + "learning_rate": 1.4933690331757521e-05, + "loss": 0.8051, "step": 6056 }, { - "epoch": 1.82, - "grad_norm": 19.254344940185547, - "learning_rate": 7.860078179813571e-06, - "loss": 1.6101, + "epoch": 0.76, + "grad_norm": 23.050397872924805, + "learning_rate": 1.4932853616700833e-05, + "loss": 1.3614, "step": 6057 }, { - "epoch": 1.82, - "grad_norm": 23.987213134765625, - "learning_rate": 7.858073569209181e-06, - "loss": 1.1053, + "epoch": 0.76, + "grad_norm": 7.089566707611084, + "learning_rate": 1.4932016901644147e-05, + "loss": 0.6359, "step": 6058 }, { - "epoch": 1.82, - "grad_norm": 33.420997619628906, - "learning_rate": 7.856068958604791e-06, - "loss": 1.3957, + "epoch": 0.76, + "grad_norm": 18.266427993774414, + "learning_rate": 1.4931180186587459e-05, + "loss": 0.9629, "step": 6059 }, { - "epoch": 1.82, - "grad_norm": 53.56259536743164, - "learning_rate": 7.854064348000401e-06, - "loss": 2.8107, + "epoch": 0.76, + "grad_norm": 14.109417915344238, + "learning_rate": 1.4930343471530771e-05, + "loss": 1.1746, "step": 6060 }, { - "epoch": 1.82, - "grad_norm": 14.887017250061035, - "learning_rate": 7.852059737396011e-06, - "loss": 2.4886, + "epoch": 0.76, + "grad_norm": 9.30927848815918, + "learning_rate": 1.4929506756474083e-05, + "loss": 1.5184, "step": 6061 }, { - "epoch": 1.82, - "grad_norm": 15.285018920898438, - "learning_rate": 7.850055126791621e-06, - "loss": 1.1363, + "epoch": 0.76, + "grad_norm": 7.900919437408447, + "learning_rate": 1.4928670041417397e-05, + "loss": 0.5081, "step": 6062 }, { - "epoch": 1.82, - "grad_norm": 16.956491470336914, - "learning_rate": 7.848050516187231e-06, - "loss": 1.4113, + "epoch": 0.76, + "grad_norm": 14.977768898010254, + "learning_rate": 1.492783332636071e-05, + "loss": 1.6299, "step": 6063 }, { - "epoch": 1.82, - "grad_norm": 22.227479934692383, - "learning_rate": 7.846045905582842e-06, - "loss": 1.2002, + "epoch": 0.76, + "grad_norm": 18.095666885375977, + "learning_rate": 1.492699661130402e-05, + "loss": 4.2114, "step": 6064 }, { - "epoch": 1.82, - "grad_norm": 44.9412727355957, - "learning_rate": 7.84404129497845e-06, - "loss": 1.583, + "epoch": 0.76, + "grad_norm": 25.11492156982422, + "learning_rate": 1.4926159896247334e-05, + "loss": 2.605, "step": 6065 }, { - "epoch": 1.82, - "grad_norm": 37.56648254394531, - "learning_rate": 7.842036684374062e-06, - "loss": 2.022, + "epoch": 0.76, + "grad_norm": 44.8865966796875, + "learning_rate": 1.4925323181190648e-05, + "loss": 2.4028, "step": 6066 }, { - "epoch": 1.82, - "grad_norm": 15.846146583557129, - "learning_rate": 7.840032073769672e-06, - "loss": 0.9523, + "epoch": 0.76, + "grad_norm": 8.544513702392578, + "learning_rate": 1.4924486466133958e-05, + "loss": 1.1757, "step": 6067 }, { - "epoch": 1.82, - "grad_norm": 18.317182540893555, - "learning_rate": 7.83802746316528e-06, - "loss": 1.5857, + "epoch": 0.76, + "grad_norm": 14.130455017089844, + "learning_rate": 1.4923649751077272e-05, + "loss": 1.6795, "step": 6068 }, { - "epoch": 1.82, - "grad_norm": 14.531113624572754, - "learning_rate": 7.83602285256089e-06, - "loss": 1.2722, + "epoch": 0.76, + "grad_norm": 107.8146743774414, + "learning_rate": 1.4922813036020585e-05, + "loss": 1.8548, "step": 6069 }, { - "epoch": 1.83, - "grad_norm": 9.223918914794922, - "learning_rate": 7.8340182419565e-06, - "loss": 1.6124, + "epoch": 0.76, + "grad_norm": 18.72151756286621, + "learning_rate": 1.4921976320963897e-05, + "loss": 1.1193, "step": 6070 }, { - "epoch": 1.83, - "grad_norm": 12.137374877929688, - "learning_rate": 7.83201363135211e-06, - "loss": 1.0094, + "epoch": 0.76, + "grad_norm": 28.117448806762695, + "learning_rate": 1.492113960590721e-05, + "loss": 1.3729, "step": 6071 }, { - "epoch": 1.83, - "grad_norm": 8.65196418762207, - "learning_rate": 7.83000902074772e-06, - "loss": 0.9413, + "epoch": 0.76, + "grad_norm": 7.518640995025635, + "learning_rate": 1.4920302890850521e-05, + "loss": 0.8463, "step": 6072 }, { - "epoch": 1.83, - "grad_norm": 16.220958709716797, - "learning_rate": 7.82800441014333e-06, - "loss": 2.1056, + "epoch": 0.76, + "grad_norm": 15.60339641571045, + "learning_rate": 1.4919466175793835e-05, + "loss": 1.315, "step": 6073 }, { - "epoch": 1.83, - "grad_norm": 19.515766143798828, - "learning_rate": 7.82599979953894e-06, - "loss": 1.763, + "epoch": 0.76, + "grad_norm": 19.411062240600586, + "learning_rate": 1.4918629460737147e-05, + "loss": 2.0167, "step": 6074 }, { - "epoch": 1.83, - "grad_norm": 11.028536796569824, - "learning_rate": 7.82399518893455e-06, - "loss": 1.577, + "epoch": 0.76, + "grad_norm": 9.9929780960083, + "learning_rate": 1.4917792745680459e-05, + "loss": 0.8052, "step": 6075 }, { - "epoch": 1.83, - "grad_norm": 14.080721855163574, - "learning_rate": 7.82199057833016e-06, - "loss": 1.2928, + "epoch": 0.76, + "grad_norm": 24.951963424682617, + "learning_rate": 1.4916956030623772e-05, + "loss": 1.8592, "step": 6076 }, { - "epoch": 1.83, - "grad_norm": 22.41744041442871, - "learning_rate": 7.81998596772577e-06, - "loss": 1.6717, + "epoch": 0.76, + "grad_norm": 12.254054069519043, + "learning_rate": 1.4916119315567086e-05, + "loss": 0.5268, "step": 6077 }, { - "epoch": 1.83, - "grad_norm": 29.117725372314453, - "learning_rate": 7.817981357121381e-06, - "loss": 2.0723, + "epoch": 0.76, + "grad_norm": 24.44095802307129, + "learning_rate": 1.4915282600510396e-05, + "loss": 3.7504, "step": 6078 }, { - "epoch": 1.83, - "grad_norm": 10.062914848327637, - "learning_rate": 7.81597674651699e-06, - "loss": 0.9723, + "epoch": 0.76, + "grad_norm": 16.46070098876953, + "learning_rate": 1.491444588545371e-05, + "loss": 2.3923, "step": 6079 }, { - "epoch": 1.83, - "grad_norm": 19.473957061767578, - "learning_rate": 7.8139721359126e-06, - "loss": 1.6513, + "epoch": 0.76, + "grad_norm": 22.783987045288086, + "learning_rate": 1.4913609170397024e-05, + "loss": 1.3608, "step": 6080 }, { - "epoch": 1.83, - "grad_norm": 13.241780281066895, - "learning_rate": 7.81196752530821e-06, - "loss": 1.4396, + "epoch": 0.76, + "grad_norm": 13.54172420501709, + "learning_rate": 1.4912772455340334e-05, + "loss": 1.4165, "step": 6081 }, { - "epoch": 1.83, - "grad_norm": 50.16336441040039, - "learning_rate": 7.80996291470382e-06, - "loss": 1.4343, + "epoch": 0.76, + "grad_norm": 14.175755500793457, + "learning_rate": 1.4911935740283648e-05, + "loss": 1.288, "step": 6082 }, { - "epoch": 1.83, - "grad_norm": 8.716567039489746, - "learning_rate": 7.807958304099428e-06, - "loss": 0.7952, + "epoch": 0.76, + "grad_norm": 36.6352653503418, + "learning_rate": 1.4911099025226961e-05, + "loss": 1.3954, "step": 6083 }, { - "epoch": 1.83, - "grad_norm": 17.933069229125977, - "learning_rate": 7.80595369349504e-06, - "loss": 2.706, + "epoch": 0.76, + "grad_norm": 6.960003852844238, + "learning_rate": 1.4910262310170273e-05, + "loss": 1.4535, "step": 6084 }, { - "epoch": 1.83, - "grad_norm": 30.77072525024414, - "learning_rate": 7.80394908289065e-06, - "loss": 1.9985, + "epoch": 0.76, + "grad_norm": 5.8838067054748535, + "learning_rate": 1.4909425595113585e-05, + "loss": 0.5222, "step": 6085 }, { - "epoch": 1.83, - "grad_norm": 13.682772636413574, - "learning_rate": 7.801944472286258e-06, - "loss": 0.9191, + "epoch": 0.76, + "grad_norm": 29.456592559814453, + "learning_rate": 1.4908588880056897e-05, + "loss": 1.6292, "step": 6086 }, { - "epoch": 1.83, - "grad_norm": 17.457841873168945, - "learning_rate": 7.799939861681868e-06, - "loss": 1.518, + "epoch": 0.76, + "grad_norm": 13.978496551513672, + "learning_rate": 1.490775216500021e-05, + "loss": 1.9543, "step": 6087 }, { - "epoch": 1.83, - "grad_norm": 41.92538833618164, - "learning_rate": 7.797935251077479e-06, - "loss": 2.1136, + "epoch": 0.76, + "grad_norm": 9.037714004516602, + "learning_rate": 1.4906915449943523e-05, + "loss": 0.9129, "step": 6088 }, { - "epoch": 1.83, - "grad_norm": 22.720664978027344, - "learning_rate": 7.795930640473089e-06, - "loss": 1.8965, + "epoch": 0.76, + "grad_norm": 13.83279800415039, + "learning_rate": 1.4906078734886835e-05, + "loss": 1.8087, "step": 6089 }, { - "epoch": 1.83, - "grad_norm": 13.168417930603027, - "learning_rate": 7.793926029868699e-06, - "loss": 1.3237, + "epoch": 0.76, + "grad_norm": 19.00337028503418, + "learning_rate": 1.4905242019830148e-05, + "loss": 2.4818, "step": 6090 }, { - "epoch": 1.83, - "grad_norm": 22.606231689453125, - "learning_rate": 7.791921419264309e-06, - "loss": 1.6433, + "epoch": 0.76, + "grad_norm": 20.148292541503906, + "learning_rate": 1.4904405304773462e-05, + "loss": 2.7005, "step": 6091 }, { - "epoch": 1.83, - "grad_norm": 22.04924964904785, - "learning_rate": 7.789916808659919e-06, - "loss": 1.9051, + "epoch": 0.76, + "grad_norm": 9.858909606933594, + "learning_rate": 1.4903568589716772e-05, + "loss": 1.2669, "step": 6092 }, { - "epoch": 1.83, - "grad_norm": 12.943755149841309, - "learning_rate": 7.787912198055529e-06, - "loss": 1.3566, + "epoch": 0.76, + "grad_norm": 33.501895904541016, + "learning_rate": 1.4902731874660086e-05, + "loss": 2.5563, "step": 6093 }, { - "epoch": 1.83, - "grad_norm": 8.388080596923828, - "learning_rate": 7.785907587451137e-06, - "loss": 1.0533, + "epoch": 0.76, + "grad_norm": 15.421066284179688, + "learning_rate": 1.49018951596034e-05, + "loss": 1.3921, "step": 6094 }, { - "epoch": 1.83, - "grad_norm": 39.58984375, - "learning_rate": 7.783902976846747e-06, - "loss": 2.1753, + "epoch": 0.76, + "grad_norm": 20.976089477539062, + "learning_rate": 1.490105844454671e-05, + "loss": 1.3577, "step": 6095 }, { - "epoch": 1.83, - "grad_norm": 17.842021942138672, - "learning_rate": 7.781898366242357e-06, - "loss": 0.9146, + "epoch": 0.77, + "grad_norm": 10.768942832946777, + "learning_rate": 1.4900221729490023e-05, + "loss": 0.8958, "step": 6096 }, { - "epoch": 1.83, - "grad_norm": 12.273473739624023, - "learning_rate": 7.779893755637968e-06, - "loss": 1.0507, + "epoch": 0.77, + "grad_norm": 6.768867015838623, + "learning_rate": 1.4899385014433337e-05, + "loss": 2.9632, "step": 6097 }, { - "epoch": 1.83, - "grad_norm": 34.619178771972656, - "learning_rate": 7.777889145033578e-06, - "loss": 2.2011, + "epoch": 0.77, + "grad_norm": 4.851020336151123, + "learning_rate": 1.4898548299376649e-05, + "loss": 1.1444, "step": 6098 }, { - "epoch": 1.83, - "grad_norm": 87.12923431396484, - "learning_rate": 7.775884534429188e-06, - "loss": 1.4831, + "epoch": 0.77, + "grad_norm": 21.36313819885254, + "learning_rate": 1.4897711584319961e-05, + "loss": 1.6182, "step": 6099 }, { - "epoch": 1.83, - "grad_norm": 9.995388984680176, - "learning_rate": 7.773879923824798e-06, - "loss": 1.2102, + "epoch": 0.77, + "grad_norm": 10.490818977355957, + "learning_rate": 1.4896874869263273e-05, + "loss": 1.1183, "step": 6100 }, { - "epoch": 1.83, - "grad_norm": 13.057961463928223, - "learning_rate": 7.771875313220408e-06, - "loss": 1.3872, + "epoch": 0.77, + "grad_norm": 9.165608406066895, + "learning_rate": 1.4896038154206587e-05, + "loss": 1.6784, "step": 6101 }, { - "epoch": 1.83, - "grad_norm": 8.191656112670898, - "learning_rate": 7.769870702616016e-06, - "loss": 1.3387, + "epoch": 0.77, + "grad_norm": 22.0075740814209, + "learning_rate": 1.4895201439149899e-05, + "loss": 1.4024, "step": 6102 }, { - "epoch": 1.83, - "grad_norm": 25.005573272705078, - "learning_rate": 7.767866092011628e-06, - "loss": 1.393, + "epoch": 0.77, + "grad_norm": 21.335983276367188, + "learning_rate": 1.489436472409321e-05, + "loss": 2.9008, "step": 6103 }, { - "epoch": 1.84, - "grad_norm": 11.448406219482422, - "learning_rate": 7.765861481407238e-06, - "loss": 1.2107, + "epoch": 0.77, + "grad_norm": 11.496278762817383, + "learning_rate": 1.4893528009036524e-05, + "loss": 2.6356, "step": 6104 }, { - "epoch": 1.84, - "grad_norm": 10.444814682006836, - "learning_rate": 7.763856870802847e-06, - "loss": 1.1085, + "epoch": 0.77, + "grad_norm": 10.158914566040039, + "learning_rate": 1.4892691293979838e-05, + "loss": 3.4004, "step": 6105 }, { - "epoch": 1.84, - "grad_norm": 22.36345863342285, - "learning_rate": 7.761852260198457e-06, - "loss": 1.4649, + "epoch": 0.77, + "grad_norm": 23.894088745117188, + "learning_rate": 1.4891854578923148e-05, + "loss": 2.5214, "step": 6106 }, { - "epoch": 1.84, - "grad_norm": 12.638916015625, - "learning_rate": 7.759847649594067e-06, - "loss": 1.0798, + "epoch": 0.77, + "grad_norm": 5.749575614929199, + "learning_rate": 1.4891017863866462e-05, + "loss": 0.5022, "step": 6107 }, { - "epoch": 1.84, - "grad_norm": 30.909666061401367, - "learning_rate": 7.757843038989677e-06, - "loss": 1.3542, + "epoch": 0.77, + "grad_norm": 17.750463485717773, + "learning_rate": 1.4890181148809775e-05, + "loss": 2.0954, "step": 6108 }, { - "epoch": 1.84, - "grad_norm": 21.984508514404297, - "learning_rate": 7.755838428385287e-06, - "loss": 1.5787, + "epoch": 0.77, + "grad_norm": 9.65576457977295, + "learning_rate": 1.4889344433753086e-05, + "loss": 1.0989, "step": 6109 }, { - "epoch": 1.84, - "grad_norm": 28.869102478027344, - "learning_rate": 7.753833817780897e-06, - "loss": 1.8047, + "epoch": 0.77, + "grad_norm": 16.375839233398438, + "learning_rate": 1.48885077186964e-05, + "loss": 1.2684, "step": 6110 }, { - "epoch": 1.84, - "grad_norm": 8.794825553894043, - "learning_rate": 7.751829207176507e-06, - "loss": 0.4721, + "epoch": 0.77, + "grad_norm": 34.77222442626953, + "learning_rate": 1.4887671003639713e-05, + "loss": 2.8609, "step": 6111 }, { - "epoch": 1.84, - "grad_norm": 20.192941665649414, - "learning_rate": 7.749824596572117e-06, - "loss": 1.8928, + "epoch": 0.77, + "grad_norm": 11.829339981079102, + "learning_rate": 1.4886834288583025e-05, + "loss": 1.7961, "step": 6112 }, { - "epoch": 1.84, - "grad_norm": 8.28946304321289, - "learning_rate": 7.747819985967726e-06, - "loss": 0.8887, + "epoch": 0.77, + "grad_norm": 21.34874153137207, + "learning_rate": 1.4885997573526337e-05, + "loss": 2.5703, "step": 6113 }, { - "epoch": 1.84, - "grad_norm": 7.44697380065918, - "learning_rate": 7.745815375363336e-06, - "loss": 1.0449, + "epoch": 0.77, + "grad_norm": 14.126192092895508, + "learning_rate": 1.4885160858469649e-05, + "loss": 2.9025, "step": 6114 }, { - "epoch": 1.84, - "grad_norm": 47.45841598510742, - "learning_rate": 7.743810764758947e-06, - "loss": 1.8076, + "epoch": 0.77, + "grad_norm": 15.445282936096191, + "learning_rate": 1.4884324143412962e-05, + "loss": 1.7945, "step": 6115 }, { - "epoch": 1.84, - "grad_norm": 22.896045684814453, - "learning_rate": 7.741806154154556e-06, - "loss": 1.3109, + "epoch": 0.77, + "grad_norm": 10.755892753601074, + "learning_rate": 1.4883487428356274e-05, + "loss": 1.4015, "step": 6116 }, { - "epoch": 1.84, - "grad_norm": 51.84198760986328, - "learning_rate": 7.739801543550166e-06, - "loss": 1.9647, + "epoch": 0.77, + "grad_norm": 7.058221817016602, + "learning_rate": 1.4882650713299586e-05, + "loss": 1.9887, "step": 6117 }, { - "epoch": 1.84, - "grad_norm": 16.364362716674805, - "learning_rate": 7.737796932945776e-06, - "loss": 1.6399, + "epoch": 0.77, + "grad_norm": 21.246171951293945, + "learning_rate": 1.48818139982429e-05, + "loss": 3.261, "step": 6118 }, { - "epoch": 1.84, - "grad_norm": 14.368910789489746, - "learning_rate": 7.735792322341386e-06, - "loss": 1.0631, + "epoch": 0.77, + "grad_norm": 13.608878135681152, + "learning_rate": 1.4880977283186214e-05, + "loss": 2.2239, "step": 6119 }, { - "epoch": 1.84, - "grad_norm": 11.560124397277832, - "learning_rate": 7.733787711736994e-06, - "loss": 1.6401, - "step": 6120 - }, - { - "epoch": 1.84, - "eval_loss": 0.1900317370891571, - "eval_runtime": 43.601, - "eval_samples_per_second": 33.921, - "eval_steps_per_second": 33.921, + "epoch": 0.77, + "grad_norm": 20.881624221801758, + "learning_rate": 1.4880140568129524e-05, + "loss": 1.8091, "step": 6120 }, { - "epoch": 1.84, - "grad_norm": 11.354382514953613, - "learning_rate": 7.731783101132606e-06, - "loss": 1.3231, + "epoch": 0.77, + "grad_norm": 7.887094497680664, + "learning_rate": 1.4879303853072838e-05, + "loss": 0.7012, "step": 6121 }, { - "epoch": 1.84, - "grad_norm": 8.842697143554688, - "learning_rate": 7.729778490528216e-06, - "loss": 0.5822, + "epoch": 0.77, + "grad_norm": 19.80340003967285, + "learning_rate": 1.4878467138016151e-05, + "loss": 1.027, "step": 6122 }, { - "epoch": 1.84, - "grad_norm": 44.98017501831055, - "learning_rate": 7.727773879923826e-06, - "loss": 1.6967, + "epoch": 0.77, + "grad_norm": 22.05449104309082, + "learning_rate": 1.4877630422959461e-05, + "loss": 0.8, "step": 6123 }, { - "epoch": 1.84, - "grad_norm": 29.9891414642334, - "learning_rate": 7.725769269319435e-06, - "loss": 1.5777, + "epoch": 0.77, + "grad_norm": 8.357983589172363, + "learning_rate": 1.4876793707902775e-05, + "loss": 0.6011, "step": 6124 }, { - "epoch": 1.84, - "grad_norm": 25.591136932373047, - "learning_rate": 7.723764658715045e-06, - "loss": 0.9665, + "epoch": 0.77, + "grad_norm": 8.36915397644043, + "learning_rate": 1.4875956992846087e-05, + "loss": 0.7543, "step": 6125 }, { - "epoch": 1.84, - "grad_norm": 13.819255828857422, - "learning_rate": 7.721760048110655e-06, - "loss": 2.2739, + "epoch": 0.77, + "grad_norm": 31.40436553955078, + "learning_rate": 1.48751202777894e-05, + "loss": 2.5164, "step": 6126 }, { - "epoch": 1.84, - "grad_norm": 28.475740432739258, - "learning_rate": 7.719755437506265e-06, - "loss": 2.247, + "epoch": 0.77, + "grad_norm": 15.912656784057617, + "learning_rate": 1.4874283562732713e-05, + "loss": 1.6534, "step": 6127 }, { - "epoch": 1.84, - "grad_norm": 12.238512992858887, - "learning_rate": 7.717750826901875e-06, - "loss": 0.8953, + "epoch": 0.77, + "grad_norm": 7.357669353485107, + "learning_rate": 1.4873446847676025e-05, + "loss": 2.3133, "step": 6128 }, { - "epoch": 1.84, - "grad_norm": 35.80490493774414, - "learning_rate": 7.715746216297485e-06, - "loss": 3.4695, + "epoch": 0.77, + "grad_norm": 7.6885881423950195, + "learning_rate": 1.4872610132619338e-05, + "loss": 0.8044, "step": 6129 }, { - "epoch": 1.84, - "grad_norm": 7.781877040863037, - "learning_rate": 7.713741605693095e-06, - "loss": 1.8642, + "epoch": 0.77, + "grad_norm": 11.81246280670166, + "learning_rate": 1.487177341756265e-05, + "loss": 0.8979, "step": 6130 }, { - "epoch": 1.84, - "grad_norm": 13.301283836364746, - "learning_rate": 7.711736995088704e-06, - "loss": 1.3696, + "epoch": 0.77, + "grad_norm": 5.839165210723877, + "learning_rate": 1.4870936702505962e-05, + "loss": 0.5538, "step": 6131 }, { - "epoch": 1.84, - "grad_norm": 10.512940406799316, - "learning_rate": 7.709732384484314e-06, - "loss": 0.6859, + "epoch": 0.77, + "grad_norm": 9.871504783630371, + "learning_rate": 1.4870099987449276e-05, + "loss": 0.5526, "step": 6132 }, { - "epoch": 1.84, - "grad_norm": 25.800260543823242, - "learning_rate": 7.707727773879926e-06, - "loss": 1.9089, + "epoch": 0.77, + "grad_norm": 27.299074172973633, + "learning_rate": 1.486926327239259e-05, + "loss": 2.091, "step": 6133 }, { - "epoch": 1.84, - "grad_norm": 15.625998497009277, - "learning_rate": 7.705723163275534e-06, - "loss": 1.8234, + "epoch": 0.77, + "grad_norm": 60.671478271484375, + "learning_rate": 1.48684265573359e-05, + "loss": 3.327, "step": 6134 }, { - "epoch": 1.84, - "grad_norm": 42.901615142822266, - "learning_rate": 7.703718552671144e-06, - "loss": 2.0355, + "epoch": 0.77, + "grad_norm": 9.930606842041016, + "learning_rate": 1.4867589842279213e-05, + "loss": 1.24, "step": 6135 }, { - "epoch": 1.84, - "grad_norm": 15.231833457946777, - "learning_rate": 7.701713942066754e-06, - "loss": 1.1297, + "epoch": 0.77, + "grad_norm": 23.737319946289062, + "learning_rate": 1.4866753127222527e-05, + "loss": 1.906, "step": 6136 }, { - "epoch": 1.85, - "grad_norm": 21.943153381347656, - "learning_rate": 7.699709331462364e-06, - "loss": 1.2517, + "epoch": 0.77, + "grad_norm": 15.565106391906738, + "learning_rate": 1.4865916412165837e-05, + "loss": 1.4209, "step": 6137 }, { - "epoch": 1.85, - "grad_norm": 22.251258850097656, - "learning_rate": 7.697704720857974e-06, - "loss": 1.5025, + "epoch": 0.77, + "grad_norm": 14.134838104248047, + "learning_rate": 1.4865079697109151e-05, + "loss": 1.0566, "step": 6138 }, { - "epoch": 1.85, - "grad_norm": 15.252081871032715, - "learning_rate": 7.695700110253583e-06, - "loss": 1.0183, + "epoch": 0.77, + "grad_norm": 29.05521583557129, + "learning_rate": 1.4864242982052463e-05, + "loss": 1.4515, "step": 6139 }, { - "epoch": 1.85, - "grad_norm": 26.634531021118164, - "learning_rate": 7.693695499649194e-06, - "loss": 1.5842, + "epoch": 0.77, + "grad_norm": 23.578039169311523, + "learning_rate": 1.4863406266995777e-05, + "loss": 1.5878, "step": 6140 }, { - "epoch": 1.85, - "grad_norm": 29.84535789489746, - "learning_rate": 7.691690889044805e-06, - "loss": 1.862, + "epoch": 0.77, + "grad_norm": 31.573808670043945, + "learning_rate": 1.4862569551939088e-05, + "loss": 3.083, "step": 6141 }, { - "epoch": 1.85, - "grad_norm": 24.699621200561523, - "learning_rate": 7.689686278440413e-06, - "loss": 1.8952, + "epoch": 0.77, + "grad_norm": 14.645096778869629, + "learning_rate": 1.48617328368824e-05, + "loss": 1.0865, "step": 6142 }, { - "epoch": 1.85, - "grad_norm": 26.514081954956055, - "learning_rate": 7.687681667836023e-06, - "loss": 1.948, + "epoch": 0.77, + "grad_norm": 22.972885131835938, + "learning_rate": 1.4860896121825714e-05, + "loss": 1.9469, "step": 6143 }, { - "epoch": 1.85, - "grad_norm": 11.536233901977539, - "learning_rate": 7.685677057231633e-06, - "loss": 2.2692, + "epoch": 0.77, + "grad_norm": 6.411251544952393, + "learning_rate": 1.4860059406769024e-05, + "loss": 0.3834, "step": 6144 }, { - "epoch": 1.85, - "grad_norm": 30.69923973083496, - "learning_rate": 7.683672446627243e-06, - "loss": 1.2475, + "epoch": 0.77, + "grad_norm": 21.848386764526367, + "learning_rate": 1.4859222691712338e-05, + "loss": 1.6994, "step": 6145 }, { - "epoch": 1.85, - "grad_norm": 19.856365203857422, - "learning_rate": 7.681667836022853e-06, - "loss": 1.361, + "epoch": 0.77, + "grad_norm": 57.35751724243164, + "learning_rate": 1.4858385976655652e-05, + "loss": 2.9288, "step": 6146 }, { - "epoch": 1.85, - "grad_norm": 10.169954299926758, - "learning_rate": 7.679663225418463e-06, - "loss": 1.3548, + "epoch": 0.77, + "grad_norm": 10.219704627990723, + "learning_rate": 1.4857549261598965e-05, + "loss": 0.3866, "step": 6147 }, { - "epoch": 1.85, - "grad_norm": 36.704524993896484, - "learning_rate": 7.677658614814073e-06, - "loss": 1.8048, + "epoch": 0.77, + "grad_norm": 17.33302879333496, + "learning_rate": 1.4856712546542276e-05, + "loss": 2.0465, "step": 6148 }, { - "epoch": 1.85, - "grad_norm": 14.586971282958984, - "learning_rate": 7.675654004209683e-06, - "loss": 1.6924, + "epoch": 0.77, + "grad_norm": 6.763572692871094, + "learning_rate": 1.485587583148559e-05, + "loss": 0.2575, "step": 6149 }, { - "epoch": 1.85, - "grad_norm": 31.322877883911133, - "learning_rate": 7.673649393605292e-06, - "loss": 2.1292, + "epoch": 0.77, + "grad_norm": 8.814136505126953, + "learning_rate": 1.4855039116428903e-05, + "loss": 2.5439, "step": 6150 }, { - "epoch": 1.85, - "grad_norm": 13.903470993041992, - "learning_rate": 7.671644783000902e-06, - "loss": 1.5199, + "epoch": 0.77, + "grad_norm": 16.717769622802734, + "learning_rate": 1.4854202401372213e-05, + "loss": 1.1557, "step": 6151 }, { - "epoch": 1.85, - "grad_norm": 206.5603485107422, - "learning_rate": 7.669640172396514e-06, - "loss": 2.1998, + "epoch": 0.77, + "grad_norm": 10.755870819091797, + "learning_rate": 1.4853365686315527e-05, + "loss": 1.4779, "step": 6152 }, { - "epoch": 1.85, - "grad_norm": 42.4091682434082, - "learning_rate": 7.667635561792122e-06, - "loss": 1.8714, + "epoch": 0.77, + "grad_norm": 79.06465911865234, + "learning_rate": 1.4852528971258839e-05, + "loss": 1.947, "step": 6153 }, { - "epoch": 1.85, - "grad_norm": 10.952982902526855, - "learning_rate": 7.665630951187732e-06, - "loss": 1.7223, + "epoch": 0.77, + "grad_norm": 13.590725898742676, + "learning_rate": 1.4851692256202152e-05, + "loss": 2.8569, "step": 6154 }, { - "epoch": 1.85, - "grad_norm": 12.478805541992188, - "learning_rate": 7.663626340583342e-06, - "loss": 0.9292, + "epoch": 0.77, + "grad_norm": 7.518975734710693, + "learning_rate": 1.4850855541145464e-05, + "loss": 1.182, "step": 6155 }, { - "epoch": 1.85, - "grad_norm": 45.16120910644531, - "learning_rate": 7.661621729978952e-06, - "loss": 1.1461, + "epoch": 0.77, + "grad_norm": 8.839305877685547, + "learning_rate": 1.4850018826088776e-05, + "loss": 1.5005, "step": 6156 }, { - "epoch": 1.85, - "grad_norm": 11.00290298461914, - "learning_rate": 7.659617119374562e-06, - "loss": 1.3505, + "epoch": 0.77, + "grad_norm": 10.575740814208984, + "learning_rate": 1.484918211103209e-05, + "loss": 0.9277, "step": 6157 }, { - "epoch": 1.85, - "grad_norm": 12.987632751464844, - "learning_rate": 7.657612508770173e-06, - "loss": 1.1632, + "epoch": 0.77, + "grad_norm": 11.546903610229492, + "learning_rate": 1.48483453959754e-05, + "loss": 1.5759, "step": 6158 }, { - "epoch": 1.85, - "grad_norm": 23.716419219970703, - "learning_rate": 7.655607898165783e-06, - "loss": 1.9068, + "epoch": 0.77, + "grad_norm": 310.0779113769531, + "learning_rate": 1.4847508680918714e-05, + "loss": 0.823, "step": 6159 }, { - "epoch": 1.85, - "grad_norm": 30.917993545532227, - "learning_rate": 7.653603287561393e-06, - "loss": 1.9522, + "epoch": 0.77, + "grad_norm": 16.000261306762695, + "learning_rate": 1.4846671965862027e-05, + "loss": 1.8281, "step": 6160 }, { - "epoch": 1.85, - "grad_norm": 9.036660194396973, - "learning_rate": 7.651598676957001e-06, - "loss": 1.1854, + "epoch": 0.77, + "grad_norm": 13.16162395477295, + "learning_rate": 1.4845835250805341e-05, + "loss": 1.8281, "step": 6161 }, { - "epoch": 1.85, - "grad_norm": 53.36625289916992, - "learning_rate": 7.649594066352611e-06, - "loss": 1.6606, + "epoch": 0.77, + "grad_norm": 37.1568603515625, + "learning_rate": 1.4844998535748651e-05, + "loss": 2.4353, "step": 6162 }, { - "epoch": 1.85, - "grad_norm": 23.467453002929688, - "learning_rate": 7.647589455748221e-06, - "loss": 1.7637, + "epoch": 0.77, + "grad_norm": 7.650110721588135, + "learning_rate": 1.4844161820691965e-05, + "loss": 2.1214, "step": 6163 }, { - "epoch": 1.85, - "grad_norm": 16.767295837402344, - "learning_rate": 7.645584845143831e-06, - "loss": 1.3892, + "epoch": 0.77, + "grad_norm": 19.80661964416504, + "learning_rate": 1.4843325105635279e-05, + "loss": 1.6672, "step": 6164 }, { - "epoch": 1.85, - "grad_norm": 21.95707893371582, - "learning_rate": 7.643580234539441e-06, - "loss": 2.415, + "epoch": 0.77, + "grad_norm": 14.368863105773926, + "learning_rate": 1.4842488390578589e-05, + "loss": 1.7552, "step": 6165 }, { - "epoch": 1.85, - "grad_norm": 14.439309120178223, - "learning_rate": 7.641575623935052e-06, - "loss": 1.575, + "epoch": 0.77, + "grad_norm": 14.653712272644043, + "learning_rate": 1.4841651675521903e-05, + "loss": 0.6312, "step": 6166 }, { - "epoch": 1.85, - "grad_norm": 8.983049392700195, - "learning_rate": 7.639571013330662e-06, - "loss": 1.2169, + "epoch": 0.77, + "grad_norm": 19.684627532958984, + "learning_rate": 1.4840814960465215e-05, + "loss": 0.2277, "step": 6167 }, { - "epoch": 1.85, - "grad_norm": 15.929500579833984, - "learning_rate": 7.63756640272627e-06, - "loss": 1.3968, + "epoch": 0.77, + "grad_norm": 11.68897819519043, + "learning_rate": 1.4839978245408528e-05, + "loss": 0.9268, "step": 6168 }, { - "epoch": 1.85, - "grad_norm": 16.895946502685547, - "learning_rate": 7.63556179212188e-06, - "loss": 1.773, + "epoch": 0.77, + "grad_norm": 13.049945831298828, + "learning_rate": 1.483914153035184e-05, + "loss": 0.775, "step": 6169 }, { - "epoch": 1.86, - "grad_norm": 11.67673110961914, - "learning_rate": 7.633557181517492e-06, - "loss": 0.8359, + "epoch": 0.77, + "grad_norm": 23.58036231994629, + "learning_rate": 1.4838304815295152e-05, + "loss": 1.8576, "step": 6170 }, { - "epoch": 1.86, - "grad_norm": 12.90839958190918, - "learning_rate": 7.6315525709131e-06, - "loss": 1.2435, + "epoch": 0.77, + "grad_norm": 4.773311138153076, + "learning_rate": 1.4837468100238466e-05, + "loss": 1.0645, "step": 6171 }, { - "epoch": 1.86, - "grad_norm": 50.556400299072266, - "learning_rate": 7.62954796030871e-06, - "loss": 3.8311, + "epoch": 0.77, + "grad_norm": 20.70108985900879, + "learning_rate": 1.4836631385181776e-05, + "loss": 3.5102, "step": 6172 }, { - "epoch": 1.86, - "grad_norm": 9.137083053588867, - "learning_rate": 7.6275433497043204e-06, - "loss": 0.9147, + "epoch": 0.77, + "grad_norm": 8.755514144897461, + "learning_rate": 1.483579467012509e-05, + "loss": 1.0022, "step": 6173 }, { - "epoch": 1.86, - "grad_norm": 39.34935760498047, - "learning_rate": 7.6255387390999305e-06, - "loss": 2.6975, + "epoch": 0.77, + "grad_norm": 17.600788116455078, + "learning_rate": 1.4834957955068403e-05, + "loss": 0.7465, "step": 6174 }, { - "epoch": 1.86, - "grad_norm": 14.056702613830566, - "learning_rate": 7.62353412849554e-06, - "loss": 1.3314, + "epoch": 0.77, + "grad_norm": 21.20954132080078, + "learning_rate": 1.4834121240011717e-05, + "loss": 2.1938, "step": 6175 }, { - "epoch": 1.86, - "grad_norm": 14.793180465698242, - "learning_rate": 7.621529517891151e-06, - "loss": 1.8193, + "epoch": 0.78, + "grad_norm": 9.03451156616211, + "learning_rate": 1.4833284524955027e-05, + "loss": 1.047, "step": 6176 }, { - "epoch": 1.86, - "grad_norm": 6.99417781829834, - "learning_rate": 7.619524907286761e-06, - "loss": 0.9592, + "epoch": 0.78, + "grad_norm": 10.841198921203613, + "learning_rate": 1.4832447809898341e-05, + "loss": 2.5945, "step": 6177 }, { - "epoch": 1.86, - "grad_norm": 8.927657127380371, - "learning_rate": 7.61752029668237e-06, - "loss": 0.7251, + "epoch": 0.78, + "grad_norm": 8.859786033630371, + "learning_rate": 1.4831611094841653e-05, + "loss": 1.9457, "step": 6178 }, { - "epoch": 1.86, - "grad_norm": 22.89257049560547, - "learning_rate": 7.61551568607798e-06, - "loss": 2.0944, + "epoch": 0.78, + "grad_norm": 3.40340518951416, + "learning_rate": 1.4830774379784965e-05, + "loss": 0.1393, "step": 6179 }, { - "epoch": 1.86, - "grad_norm": 33.927913665771484, - "learning_rate": 7.613511075473589e-06, - "loss": 2.6198, + "epoch": 0.78, + "grad_norm": 24.379648208618164, + "learning_rate": 1.4829937664728278e-05, + "loss": 1.8299, "step": 6180 }, { - "epoch": 1.86, - "grad_norm": 18.200057983398438, - "learning_rate": 7.611506464869199e-06, - "loss": 1.3679, + "epoch": 0.78, + "grad_norm": 10.48060131072998, + "learning_rate": 1.482910094967159e-05, + "loss": 2.0846, "step": 6181 }, { - "epoch": 1.86, - "grad_norm": 17.662506103515625, - "learning_rate": 7.6095018542648095e-06, - "loss": 2.0992, + "epoch": 0.78, + "grad_norm": 48.35779571533203, + "learning_rate": 1.4828264234614904e-05, + "loss": 3.2575, "step": 6182 }, { - "epoch": 1.86, - "grad_norm": 11.235939979553223, - "learning_rate": 7.60749724366042e-06, - "loss": 1.4116, + "epoch": 0.78, + "grad_norm": 9.917369842529297, + "learning_rate": 1.4827427519558214e-05, + "loss": 0.4835, "step": 6183 }, { - "epoch": 1.86, - "grad_norm": 19.397539138793945, - "learning_rate": 7.60549263305603e-06, - "loss": 1.3618, + "epoch": 0.78, + "grad_norm": 4.46663236618042, + "learning_rate": 1.4826590804501528e-05, + "loss": 0.9349, "step": 6184 }, { - "epoch": 1.86, - "grad_norm": 10.240893363952637, - "learning_rate": 7.60348802245164e-06, - "loss": 1.0369, + "epoch": 0.78, + "grad_norm": 11.6162691116333, + "learning_rate": 1.4825754089444842e-05, + "loss": 1.7973, "step": 6185 }, { - "epoch": 1.86, - "grad_norm": 14.981084823608398, - "learning_rate": 7.601483411847249e-06, - "loss": 0.8672, + "epoch": 0.78, + "grad_norm": 26.446144104003906, + "learning_rate": 1.4824917374388152e-05, + "loss": 1.2248, "step": 6186 }, { - "epoch": 1.86, - "grad_norm": 12.943142890930176, - "learning_rate": 7.599478801242859e-06, - "loss": 1.19, + "epoch": 0.78, + "grad_norm": 6.8402838706970215, + "learning_rate": 1.4824080659331466e-05, + "loss": 0.6867, "step": 6187 }, { - "epoch": 1.86, - "grad_norm": 15.80017375946045, - "learning_rate": 7.597474190638468e-06, - "loss": 1.2749, + "epoch": 0.78, + "grad_norm": 13.780156135559082, + "learning_rate": 1.4823243944274779e-05, + "loss": 2.4299, "step": 6188 }, { - "epoch": 1.86, - "grad_norm": 15.71901798248291, - "learning_rate": 7.595469580034079e-06, - "loss": 1.0749, + "epoch": 0.78, + "grad_norm": 7.482303142547607, + "learning_rate": 1.4822407229218093e-05, + "loss": 0.7451, "step": 6189 }, { - "epoch": 1.86, - "grad_norm": 13.598134994506836, - "learning_rate": 7.593464969429689e-06, - "loss": 1.2334, + "epoch": 0.78, + "grad_norm": 10.03032398223877, + "learning_rate": 1.4821570514161403e-05, + "loss": 0.628, "step": 6190 }, { - "epoch": 1.86, - "grad_norm": 13.935446739196777, - "learning_rate": 7.5914603588252986e-06, - "loss": 1.2212, + "epoch": 0.78, + "grad_norm": 20.40422248840332, + "learning_rate": 1.4820733799104717e-05, + "loss": 1.8446, "step": 6191 }, { - "epoch": 1.86, - "grad_norm": 28.57442855834961, - "learning_rate": 7.589455748220909e-06, - "loss": 2.0672, + "epoch": 0.78, + "grad_norm": 10.26680850982666, + "learning_rate": 1.4819897084048029e-05, + "loss": 2.0607, "step": 6192 }, { - "epoch": 1.86, - "grad_norm": 35.023197174072266, - "learning_rate": 7.587451137616518e-06, - "loss": 1.8612, + "epoch": 0.78, + "grad_norm": 16.696704864501953, + "learning_rate": 1.481906036899134e-05, + "loss": 2.4927, "step": 6193 }, { - "epoch": 1.86, - "grad_norm": 11.907183647155762, - "learning_rate": 7.585446527012128e-06, - "loss": 1.6789, + "epoch": 0.78, + "grad_norm": 21.497238159179688, + "learning_rate": 1.4818223653934654e-05, + "loss": 1.3641, "step": 6194 }, { - "epoch": 1.86, - "grad_norm": 18.430017471313477, - "learning_rate": 7.583441916407739e-06, - "loss": 1.3619, + "epoch": 0.78, + "grad_norm": 5.724583148956299, + "learning_rate": 1.4817386938877966e-05, + "loss": 0.6211, "step": 6195 }, { - "epoch": 1.86, - "grad_norm": 29.030160903930664, - "learning_rate": 7.581437305803349e-06, - "loss": 1.116, + "epoch": 0.78, + "grad_norm": 14.418787956237793, + "learning_rate": 1.481655022382128e-05, + "loss": 1.2559, "step": 6196 }, { - "epoch": 1.86, - "grad_norm": 18.958419799804688, - "learning_rate": 7.579432695198958e-06, - "loss": 1.6912, + "epoch": 0.78, + "grad_norm": 18.260189056396484, + "learning_rate": 1.481571350876459e-05, + "loss": 1.4076, "step": 6197 }, { - "epoch": 1.86, - "grad_norm": 19.51006317138672, - "learning_rate": 7.577428084594568e-06, - "loss": 1.6718, + "epoch": 0.78, + "grad_norm": 16.18914794921875, + "learning_rate": 1.4814876793707904e-05, + "loss": 1.6051, "step": 6198 }, { - "epoch": 1.86, - "grad_norm": 13.113887786865234, - "learning_rate": 7.5754234739901775e-06, - "loss": 1.2158, + "epoch": 0.78, + "grad_norm": 11.067285537719727, + "learning_rate": 1.4814040078651217e-05, + "loss": 1.6897, "step": 6199 }, { - "epoch": 1.86, - "grad_norm": 13.66335678100586, - "learning_rate": 7.573418863385788e-06, - "loss": 0.8829, + "epoch": 0.78, + "grad_norm": 13.584056854248047, + "learning_rate": 1.4813203363594528e-05, + "loss": 2.4539, "step": 6200 }, { - "epoch": 1.86, - "grad_norm": 12.062017440795898, - "learning_rate": 7.5714142527813986e-06, - "loss": 1.7451, + "epoch": 0.78, + "grad_norm": 18.683652877807617, + "learning_rate": 1.4812366648537841e-05, + "loss": 1.0933, "step": 6201 }, { - "epoch": 1.86, - "grad_norm": 12.591232299804688, - "learning_rate": 7.569409642177008e-06, - "loss": 1.4194, + "epoch": 0.78, + "grad_norm": 60.37272262573242, + "learning_rate": 1.4811529933481155e-05, + "loss": 2.5123, "step": 6202 }, { - "epoch": 1.87, - "grad_norm": 9.007003784179688, - "learning_rate": 7.567405031572618e-06, - "loss": 0.6593, + "epoch": 0.78, + "grad_norm": 9.876708030700684, + "learning_rate": 1.4810693218424469e-05, + "loss": 0.7678, "step": 6203 }, { - "epoch": 1.87, - "grad_norm": 15.050545692443848, - "learning_rate": 7.565400420968227e-06, - "loss": 1.2698, + "epoch": 0.78, + "grad_norm": 17.32562255859375, + "learning_rate": 1.4809856503367779e-05, + "loss": 0.7407, "step": 6204 }, { - "epoch": 1.87, - "grad_norm": 53.996299743652344, - "learning_rate": 7.563395810363837e-06, - "loss": 2.5506, + "epoch": 0.78, + "grad_norm": 28.528152465820312, + "learning_rate": 1.4809019788311093e-05, + "loss": 1.6875, "step": 6205 }, { - "epoch": 1.87, - "grad_norm": 16.08456039428711, - "learning_rate": 7.561391199759447e-06, - "loss": 0.7814, + "epoch": 0.78, + "grad_norm": 30.383686065673828, + "learning_rate": 1.4808183073254404e-05, + "loss": 2.2793, "step": 6206 }, { - "epoch": 1.87, - "grad_norm": 19.180896759033203, - "learning_rate": 7.559386589155057e-06, - "loss": 2.2722, + "epoch": 0.78, + "grad_norm": 13.102036476135254, + "learning_rate": 1.4807346358197716e-05, + "loss": 1.1206, "step": 6207 }, { - "epoch": 1.87, - "grad_norm": 12.53034782409668, - "learning_rate": 7.5573819785506675e-06, - "loss": 1.113, + "epoch": 0.78, + "grad_norm": 16.857431411743164, + "learning_rate": 1.480650964314103e-05, + "loss": 2.5142, "step": 6208 }, { - "epoch": 1.87, - "grad_norm": 12.597771644592285, - "learning_rate": 7.5553773679462775e-06, - "loss": 1.7598, + "epoch": 0.78, + "grad_norm": 9.684059143066406, + "learning_rate": 1.4805672928084342e-05, + "loss": 1.1404, "step": 6209 }, { - "epoch": 1.87, - "grad_norm": 8.483002662658691, - "learning_rate": 7.553372757341887e-06, - "loss": 0.8353, + "epoch": 0.78, + "grad_norm": 10.59528923034668, + "learning_rate": 1.4804836213027656e-05, + "loss": 1.5664, "step": 6210 }, { - "epoch": 1.87, - "grad_norm": 9.132856369018555, - "learning_rate": 7.551368146737497e-06, - "loss": 1.0596, + "epoch": 0.78, + "grad_norm": 14.23654556274414, + "learning_rate": 1.4803999497970966e-05, + "loss": 1.0221, "step": 6211 }, { - "epoch": 1.87, - "grad_norm": 12.06258773803711, - "learning_rate": 7.549363536133106e-06, - "loss": 1.3816, + "epoch": 0.78, + "grad_norm": 7.1277923583984375, + "learning_rate": 1.480316278291428e-05, + "loss": 1.7848, "step": 6212 }, { - "epoch": 1.87, - "grad_norm": 12.495207786560059, - "learning_rate": 7.547358925528717e-06, - "loss": 2.0172, + "epoch": 0.78, + "grad_norm": 10.347265243530273, + "learning_rate": 1.4802326067857593e-05, + "loss": 0.6381, "step": 6213 }, { - "epoch": 1.87, - "grad_norm": 15.568368911743164, - "learning_rate": 7.545354314924327e-06, - "loss": 2.3213, + "epoch": 0.78, + "grad_norm": 15.681981086730957, + "learning_rate": 1.4801489352800904e-05, + "loss": 1.7937, "step": 6214 }, { - "epoch": 1.87, - "grad_norm": 57.913516998291016, - "learning_rate": 7.543349704319936e-06, - "loss": 1.5673, + "epoch": 0.78, + "grad_norm": 12.805305480957031, + "learning_rate": 1.4800652637744217e-05, + "loss": 2.8912, "step": 6215 }, { - "epoch": 1.87, - "grad_norm": 11.138136863708496, - "learning_rate": 7.5413450937155464e-06, - "loss": 1.1155, + "epoch": 0.78, + "grad_norm": 11.391907691955566, + "learning_rate": 1.479981592268753e-05, + "loss": 2.3061, "step": 6216 }, { - "epoch": 1.87, - "grad_norm": 36.35742950439453, - "learning_rate": 7.539340483111156e-06, - "loss": 0.877, + "epoch": 0.78, + "grad_norm": 13.441726684570312, + "learning_rate": 1.4798979207630843e-05, + "loss": 1.463, "step": 6217 }, { - "epoch": 1.87, - "grad_norm": 11.55097484588623, - "learning_rate": 7.537335872506766e-06, - "loss": 0.539, + "epoch": 0.78, + "grad_norm": 11.16281509399414, + "learning_rate": 1.4798142492574155e-05, + "loss": 1.4862, "step": 6218 }, { - "epoch": 1.87, - "grad_norm": 18.280834197998047, - "learning_rate": 7.535331261902377e-06, - "loss": 1.5513, + "epoch": 0.78, + "grad_norm": 9.24189567565918, + "learning_rate": 1.4797305777517468e-05, + "loss": 1.9563, "step": 6219 }, { - "epoch": 1.87, - "grad_norm": 30.530216217041016, - "learning_rate": 7.533326651297986e-06, - "loss": 1.9069, + "epoch": 0.78, + "grad_norm": 39.83635330200195, + "learning_rate": 1.479646906246078e-05, + "loss": 2.1532, "step": 6220 }, { - "epoch": 1.87, - "grad_norm": 28.866329193115234, - "learning_rate": 7.531322040693596e-06, - "loss": 1.4386, + "epoch": 0.78, + "grad_norm": 21.487403869628906, + "learning_rate": 1.4795632347404092e-05, + "loss": 1.46, "step": 6221 }, { - "epoch": 1.87, - "grad_norm": 22.6090145111084, - "learning_rate": 7.529317430089206e-06, - "loss": 2.3103, + "epoch": 0.78, + "grad_norm": 16.41718292236328, + "learning_rate": 1.4794795632347406e-05, + "loss": 2.1983, "step": 6222 }, { - "epoch": 1.87, - "grad_norm": 9.455753326416016, - "learning_rate": 7.527312819484815e-06, - "loss": 1.0053, + "epoch": 0.78, + "grad_norm": 30.581064224243164, + "learning_rate": 1.4793958917290718e-05, + "loss": 1.1738, "step": 6223 }, { - "epoch": 1.87, - "grad_norm": 47.22819519042969, - "learning_rate": 7.525308208880425e-06, - "loss": 2.0746, + "epoch": 0.78, + "grad_norm": 11.414182662963867, + "learning_rate": 1.4793122202234032e-05, + "loss": 1.7153, "step": 6224 }, { - "epoch": 1.87, - "grad_norm": 16.05484962463379, - "learning_rate": 7.523303598276035e-06, - "loss": 1.4145, + "epoch": 0.78, + "grad_norm": 12.448118209838867, + "learning_rate": 1.4792285487177342e-05, + "loss": 1.2993, "step": 6225 }, { - "epoch": 1.87, - "grad_norm": 23.540828704833984, - "learning_rate": 7.521298987671646e-06, - "loss": 1.1681, + "epoch": 0.78, + "grad_norm": 7.1149373054504395, + "learning_rate": 1.4791448772120655e-05, + "loss": 0.6568, "step": 6226 }, { - "epoch": 1.87, - "grad_norm": 14.86224365234375, - "learning_rate": 7.519294377067256e-06, - "loss": 1.6844, + "epoch": 0.78, + "grad_norm": 6.569999694824219, + "learning_rate": 1.4790612057063969e-05, + "loss": 0.5046, "step": 6227 }, { - "epoch": 1.87, - "grad_norm": 76.27838134765625, - "learning_rate": 7.517289766462865e-06, - "loss": 1.9908, + "epoch": 0.78, + "grad_norm": 14.83082103729248, + "learning_rate": 1.478977534200728e-05, + "loss": 1.4867, "step": 6228 }, { - "epoch": 1.87, - "grad_norm": 18.715179443359375, - "learning_rate": 7.515285155858475e-06, - "loss": 1.2129, + "epoch": 0.78, + "grad_norm": 19.81695556640625, + "learning_rate": 1.4788938626950593e-05, + "loss": 1.2355, "step": 6229 }, { - "epoch": 1.87, - "grad_norm": 16.809297561645508, - "learning_rate": 7.513280545254085e-06, - "loss": 1.1011, + "epoch": 0.78, + "grad_norm": 7.853550434112549, + "learning_rate": 1.4788101911893907e-05, + "loss": 0.8718, "step": 6230 }, { - "epoch": 1.87, - "grad_norm": 8.646817207336426, - "learning_rate": 7.511275934649694e-06, - "loss": 1.5495, + "epoch": 0.78, + "grad_norm": 16.03679656982422, + "learning_rate": 1.4787265196837219e-05, + "loss": 2.5044, "step": 6231 }, { - "epoch": 1.87, - "grad_norm": 35.9256591796875, - "learning_rate": 7.509271324045305e-06, - "loss": 1.7407, + "epoch": 0.78, + "grad_norm": 10.5372896194458, + "learning_rate": 1.478642848178053e-05, + "loss": 1.4197, "step": 6232 }, { - "epoch": 1.87, - "grad_norm": 8.104480743408203, - "learning_rate": 7.507266713440915e-06, - "loss": 0.5633, + "epoch": 0.78, + "grad_norm": 6.5133466720581055, + "learning_rate": 1.4785591766723844e-05, + "loss": 1.7093, "step": 6233 }, { - "epoch": 1.87, - "grad_norm": 10.297518730163574, - "learning_rate": 7.5052621028365246e-06, - "loss": 1.2832, + "epoch": 0.78, + "grad_norm": 11.621196746826172, + "learning_rate": 1.4784755051667156e-05, + "loss": 1.5063, "step": 6234 }, { - "epoch": 1.87, - "grad_norm": 15.735064506530762, - "learning_rate": 7.503257492232135e-06, - "loss": 1.4141, + "epoch": 0.78, + "grad_norm": 31.792678833007812, + "learning_rate": 1.4783918336610468e-05, + "loss": 1.9922, "step": 6235 }, { - "epoch": 1.87, - "grad_norm": 12.001105308532715, - "learning_rate": 7.501252881627744e-06, - "loss": 1.4383, + "epoch": 0.78, + "grad_norm": 23.694442749023438, + "learning_rate": 1.478308162155378e-05, + "loss": 1.438, "step": 6236 }, { - "epoch": 1.88, - "grad_norm": 15.454205513000488, - "learning_rate": 7.499248271023354e-06, - "loss": 1.0, + "epoch": 0.78, + "grad_norm": 3.3710951805114746, + "learning_rate": 1.4782244906497094e-05, + "loss": 0.1078, "step": 6237 }, { - "epoch": 1.88, - "grad_norm": 19.108205795288086, - "learning_rate": 7.497243660418965e-06, - "loss": 1.6183, + "epoch": 0.78, + "grad_norm": 11.370076179504395, + "learning_rate": 1.4781408191440407e-05, + "loss": 0.801, "step": 6238 }, { - "epoch": 1.88, - "grad_norm": 38.10220718383789, - "learning_rate": 7.495239049814574e-06, - "loss": 2.5108, + "epoch": 0.78, + "grad_norm": 33.836185455322266, + "learning_rate": 1.4780571476383718e-05, + "loss": 1.8408, "step": 6239 }, { - "epoch": 1.88, - "grad_norm": 10.768503189086914, - "learning_rate": 7.493234439210184e-06, - "loss": 1.4355, - "step": 6240 - }, - { - "epoch": 1.88, - "eval_loss": 0.19893887639045715, - "eval_runtime": 43.6319, - "eval_samples_per_second": 33.897, - "eval_steps_per_second": 33.897, + "epoch": 0.78, + "grad_norm": 6.308077812194824, + "learning_rate": 1.4779734761327031e-05, + "loss": 0.2894, "step": 6240 }, { - "epoch": 1.88, - "grad_norm": 7.937902450561523, - "learning_rate": 7.4912298286057935e-06, - "loss": 0.9729, + "epoch": 0.78, + "grad_norm": 7.155177593231201, + "learning_rate": 1.4778898046270345e-05, + "loss": 0.2025, "step": 6241 }, { - "epoch": 1.88, - "grad_norm": 10.797032356262207, - "learning_rate": 7.4892252180014035e-06, - "loss": 0.9792, + "epoch": 0.78, + "grad_norm": 29.36728858947754, + "learning_rate": 1.4778061331213655e-05, + "loss": 1.8414, "step": 6242 }, { - "epoch": 1.88, - "grad_norm": 8.380309104919434, - "learning_rate": 7.487220607397014e-06, - "loss": 0.7776, + "epoch": 0.78, + "grad_norm": 24.564516067504883, + "learning_rate": 1.4777224616156969e-05, + "loss": 2.6769, "step": 6243 }, { - "epoch": 1.88, - "grad_norm": 10.550333023071289, - "learning_rate": 7.485215996792624e-06, - "loss": 1.1951, + "epoch": 0.78, + "grad_norm": 18.393569946289062, + "learning_rate": 1.4776387901100282e-05, + "loss": 2.8948, "step": 6244 }, { - "epoch": 1.88, - "grad_norm": 16.15055274963379, - "learning_rate": 7.483211386188234e-06, - "loss": 0.9995, + "epoch": 0.78, + "grad_norm": 8.56533432006836, + "learning_rate": 1.4775551186043593e-05, + "loss": 2.4428, "step": 6245 }, { - "epoch": 1.88, - "grad_norm": 18.002609252929688, - "learning_rate": 7.481206775583844e-06, - "loss": 1.6159, + "epoch": 0.78, + "grad_norm": 83.42866516113281, + "learning_rate": 1.4774714470986906e-05, + "loss": 2.374, "step": 6246 }, { - "epoch": 1.88, - "grad_norm": 19.96839714050293, - "learning_rate": 7.479202164979453e-06, - "loss": 1.9925, + "epoch": 0.78, + "grad_norm": 40.70931625366211, + "learning_rate": 1.477387775593022e-05, + "loss": 1.0785, "step": 6247 }, { - "epoch": 1.88, - "grad_norm": 22.932233810424805, - "learning_rate": 7.477197554375063e-06, - "loss": 1.5709, + "epoch": 0.78, + "grad_norm": 11.857887268066406, + "learning_rate": 1.4773041040873532e-05, + "loss": 1.0883, "step": 6248 }, { - "epoch": 1.88, - "grad_norm": 10.494919776916504, - "learning_rate": 7.4751929437706724e-06, - "loss": 1.1168, + "epoch": 0.78, + "grad_norm": 11.886443138122559, + "learning_rate": 1.4772204325816844e-05, + "loss": 1.2131, "step": 6249 }, { - "epoch": 1.88, - "grad_norm": 37.70262145996094, - "learning_rate": 7.473188333166283e-06, - "loss": 1.6439, + "epoch": 0.78, + "grad_norm": 20.011035919189453, + "learning_rate": 1.4771367610760156e-05, + "loss": 1.8072, "step": 6250 }, { - "epoch": 1.88, - "grad_norm": 14.26046085357666, - "learning_rate": 7.4711837225618935e-06, - "loss": 1.4012, + "epoch": 0.78, + "grad_norm": 10.199731826782227, + "learning_rate": 1.477053089570347e-05, + "loss": 0.7874, "step": 6251 }, { - "epoch": 1.88, - "grad_norm": 19.355712890625, - "learning_rate": 7.469179111957503e-06, - "loss": 1.3238, + "epoch": 0.78, + "grad_norm": 6.612044334411621, + "learning_rate": 1.4769694180646782e-05, + "loss": 0.6469, "step": 6252 }, { - "epoch": 1.88, - "grad_norm": 43.30799865722656, - "learning_rate": 7.467174501353113e-06, - "loss": 2.3852, + "epoch": 0.78, + "grad_norm": 10.592775344848633, + "learning_rate": 1.4768857465590093e-05, + "loss": 0.3784, "step": 6253 }, { - "epoch": 1.88, - "grad_norm": 17.540157318115234, - "learning_rate": 7.465169890748723e-06, - "loss": 0.9667, + "epoch": 0.78, + "grad_norm": 10.249109268188477, + "learning_rate": 1.4768020750533407e-05, + "loss": 1.2488, "step": 6254 }, { - "epoch": 1.88, - "grad_norm": 14.730655670166016, - "learning_rate": 7.463165280144332e-06, - "loss": 1.454, + "epoch": 0.78, + "grad_norm": 22.15861701965332, + "learning_rate": 1.476718403547672e-05, + "loss": 1.2772, "step": 6255 }, { - "epoch": 1.88, - "grad_norm": 11.64047908782959, - "learning_rate": 7.461160669539943e-06, - "loss": 1.209, + "epoch": 0.79, + "grad_norm": 15.571314811706543, + "learning_rate": 1.4766347320420031e-05, + "loss": 1.6813, "step": 6256 }, { - "epoch": 1.88, - "grad_norm": 8.219613075256348, - "learning_rate": 7.459156058935553e-06, - "loss": 1.3621, + "epoch": 0.79, + "grad_norm": 10.145541191101074, + "learning_rate": 1.4765510605363345e-05, + "loss": 2.0737, "step": 6257 }, { - "epoch": 1.88, - "grad_norm": 15.764402389526367, - "learning_rate": 7.457151448331162e-06, - "loss": 1.7374, + "epoch": 0.79, + "grad_norm": 11.789886474609375, + "learning_rate": 1.4764673890306658e-05, + "loss": 1.0396, "step": 6258 }, { - "epoch": 1.88, - "grad_norm": 17.738845825195312, - "learning_rate": 7.4551468377267724e-06, - "loss": 1.5129, + "epoch": 0.79, + "grad_norm": 29.804969787597656, + "learning_rate": 1.4763837175249969e-05, + "loss": 3.3272, "step": 6259 }, { - "epoch": 1.88, - "grad_norm": 18.939716339111328, - "learning_rate": 7.453142227122382e-06, - "loss": 1.6715, + "epoch": 0.79, + "grad_norm": 16.483253479003906, + "learning_rate": 1.4763000460193282e-05, + "loss": 1.5897, "step": 6260 }, { - "epoch": 1.88, - "grad_norm": 25.509105682373047, - "learning_rate": 7.451137616517992e-06, - "loss": 1.6497, + "epoch": 0.79, + "grad_norm": 11.531888008117676, + "learning_rate": 1.4762163745136596e-05, + "loss": 2.6644, "step": 6261 }, { - "epoch": 1.88, - "grad_norm": 30.341272354125977, - "learning_rate": 7.449133005913603e-06, - "loss": 2.2224, + "epoch": 0.79, + "grad_norm": 6.730110168457031, + "learning_rate": 1.4761327030079908e-05, + "loss": 0.5048, "step": 6262 }, { - "epoch": 1.88, - "grad_norm": 29.086957931518555, - "learning_rate": 7.447128395309212e-06, - "loss": 2.018, + "epoch": 0.79, + "grad_norm": 36.700557708740234, + "learning_rate": 1.476049031502322e-05, + "loss": 3.3426, "step": 6263 }, { - "epoch": 1.88, - "grad_norm": 13.672849655151367, - "learning_rate": 7.445123784704822e-06, - "loss": 1.4355, + "epoch": 0.79, + "grad_norm": 25.597129821777344, + "learning_rate": 1.4759653599966532e-05, + "loss": 2.0085, "step": 6264 }, { - "epoch": 1.88, - "grad_norm": 27.003616333007812, - "learning_rate": 7.443119174100431e-06, - "loss": 1.4442, + "epoch": 0.79, + "grad_norm": 19.021533966064453, + "learning_rate": 1.4758816884909845e-05, + "loss": 1.959, "step": 6265 }, { - "epoch": 1.88, - "grad_norm": 19.808364868164062, - "learning_rate": 7.441114563496041e-06, - "loss": 1.2619, + "epoch": 0.79, + "grad_norm": 169.17330932617188, + "learning_rate": 1.4757980169853157e-05, + "loss": 1.5518, "step": 6266 }, { - "epoch": 1.88, - "grad_norm": 17.011503219604492, - "learning_rate": 7.439109952891651e-06, - "loss": 1.2087, + "epoch": 0.79, + "grad_norm": 27.448816299438477, + "learning_rate": 1.475714345479647e-05, + "loss": 2.5831, "step": 6267 }, { - "epoch": 1.88, - "grad_norm": 16.227067947387695, - "learning_rate": 7.437105342287261e-06, - "loss": 2.4985, + "epoch": 0.79, + "grad_norm": 8.960739135742188, + "learning_rate": 1.4756306739739783e-05, + "loss": 1.958, "step": 6268 }, { - "epoch": 1.88, - "grad_norm": 15.978217124938965, - "learning_rate": 7.435100731682872e-06, - "loss": 1.3265, + "epoch": 0.79, + "grad_norm": 11.165156364440918, + "learning_rate": 1.4755470024683097e-05, + "loss": 1.4424, "step": 6269 }, { - "epoch": 1.89, - "grad_norm": 16.302669525146484, - "learning_rate": 7.433096121078482e-06, - "loss": 0.8265, + "epoch": 0.79, + "grad_norm": 9.504926681518555, + "learning_rate": 1.4754633309626407e-05, + "loss": 0.9984, "step": 6270 }, { - "epoch": 1.89, - "grad_norm": 20.792295455932617, - "learning_rate": 7.431091510474091e-06, - "loss": 1.4503, + "epoch": 0.79, + "grad_norm": 10.795631408691406, + "learning_rate": 1.475379659456972e-05, + "loss": 0.886, "step": 6271 }, { - "epoch": 1.89, - "grad_norm": 9.446883201599121, - "learning_rate": 7.429086899869701e-06, - "loss": 1.8997, + "epoch": 0.79, + "grad_norm": 9.808711051940918, + "learning_rate": 1.4752959879513034e-05, + "loss": 1.8094, "step": 6272 }, { - "epoch": 1.89, - "grad_norm": 24.59245491027832, - "learning_rate": 7.42708228926531e-06, - "loss": 1.1568, + "epoch": 0.79, + "grad_norm": 13.138855934143066, + "learning_rate": 1.4752123164456344e-05, + "loss": 2.1202, "step": 6273 }, { - "epoch": 1.89, - "grad_norm": 13.666716575622559, - "learning_rate": 7.42507767866092e-06, - "loss": 0.9083, + "epoch": 0.79, + "grad_norm": 4.985891819000244, + "learning_rate": 1.4751286449399658e-05, + "loss": 0.0602, "step": 6274 }, { - "epoch": 1.89, - "grad_norm": 17.118921279907227, - "learning_rate": 7.423073068056531e-06, - "loss": 1.5059, + "epoch": 0.79, + "grad_norm": 10.045243263244629, + "learning_rate": 1.4750449734342972e-05, + "loss": 1.6623, "step": 6275 }, { - "epoch": 1.89, - "grad_norm": 56.16904067993164, - "learning_rate": 7.4210684574521405e-06, - "loss": 3.6439, + "epoch": 0.79, + "grad_norm": 13.370321273803711, + "learning_rate": 1.4749613019286284e-05, + "loss": 2.2939, "step": 6276 }, { - "epoch": 1.89, - "grad_norm": 13.676959037780762, - "learning_rate": 7.4190638468477506e-06, - "loss": 1.6671, + "epoch": 0.79, + "grad_norm": 12.222705841064453, + "learning_rate": 1.4748776304229596e-05, + "loss": 1.8068, "step": 6277 }, { - "epoch": 1.89, - "grad_norm": 10.493898391723633, - "learning_rate": 7.41705923624336e-06, - "loss": 1.4543, + "epoch": 0.79, + "grad_norm": 74.30975341796875, + "learning_rate": 1.4747939589172908e-05, + "loss": 4.0831, "step": 6278 }, { - "epoch": 1.89, - "grad_norm": 22.156248092651367, - "learning_rate": 7.41505462563897e-06, - "loss": 1.5765, + "epoch": 0.79, + "grad_norm": 57.3051872253418, + "learning_rate": 1.4747102874116221e-05, + "loss": 2.1201, "step": 6279 }, { - "epoch": 1.89, - "grad_norm": 15.999430656433105, - "learning_rate": 7.41305001503458e-06, - "loss": 1.7229, + "epoch": 0.79, + "grad_norm": 21.052371978759766, + "learning_rate": 1.4746266159059533e-05, + "loss": 2.073, "step": 6280 }, { - "epoch": 1.89, - "grad_norm": 10.846489906311035, - "learning_rate": 7.411045404430191e-06, - "loss": 1.9086, + "epoch": 0.79, + "grad_norm": 29.350799560546875, + "learning_rate": 1.4745429444002845e-05, + "loss": 1.2385, "step": 6281 }, { - "epoch": 1.89, - "grad_norm": 17.6227970123291, - "learning_rate": 7.4090407938258e-06, - "loss": 1.2638, + "epoch": 0.79, + "grad_norm": 6.653513431549072, + "learning_rate": 1.4744592728946159e-05, + "loss": 0.3835, "step": 6282 }, { - "epoch": 1.89, - "grad_norm": 16.64601707458496, - "learning_rate": 7.40703618322141e-06, - "loss": 1.2801, + "epoch": 0.79, + "grad_norm": 13.995912551879883, + "learning_rate": 1.4743756013889472e-05, + "loss": 1.3589, "step": 6283 }, { - "epoch": 1.89, - "grad_norm": 18.028026580810547, - "learning_rate": 7.4050315726170195e-06, - "loss": 1.4945, + "epoch": 0.79, + "grad_norm": 8.13442325592041, + "learning_rate": 1.4742919298832783e-05, + "loss": 2.8616, "step": 6284 }, { - "epoch": 1.89, - "grad_norm": 35.858795166015625, - "learning_rate": 7.4030269620126295e-06, - "loss": 1.969, + "epoch": 0.79, + "grad_norm": 12.456286430358887, + "learning_rate": 1.4742082583776096e-05, + "loss": 1.8573, "step": 6285 }, { - "epoch": 1.89, - "grad_norm": 9.260512351989746, - "learning_rate": 7.401022351408239e-06, - "loss": 1.0171, + "epoch": 0.79, + "grad_norm": 9.940149307250977, + "learning_rate": 1.474124586871941e-05, + "loss": 3.2977, "step": 6286 }, { - "epoch": 1.89, - "grad_norm": 28.248775482177734, - "learning_rate": 7.39901774080385e-06, - "loss": 1.2457, + "epoch": 0.79, + "grad_norm": 48.481712341308594, + "learning_rate": 1.474040915366272e-05, + "loss": 4.0217, "step": 6287 }, { - "epoch": 1.89, - "grad_norm": 23.339683532714844, - "learning_rate": 7.39701313019946e-06, - "loss": 1.8959, + "epoch": 0.79, + "grad_norm": 5.557957172393799, + "learning_rate": 1.4739572438606034e-05, + "loss": 0.6419, "step": 6288 }, { - "epoch": 1.89, - "grad_norm": 16.68250274658203, - "learning_rate": 7.395008519595069e-06, - "loss": 1.1061, + "epoch": 0.79, + "grad_norm": 12.64964485168457, + "learning_rate": 1.4738735723549346e-05, + "loss": 0.9878, "step": 6289 }, { - "epoch": 1.89, - "grad_norm": 22.26504898071289, - "learning_rate": 7.393003908990679e-06, - "loss": 1.7896, + "epoch": 0.79, + "grad_norm": 28.808948516845703, + "learning_rate": 1.473789900849266e-05, + "loss": 1.926, "step": 6290 }, { - "epoch": 1.89, - "grad_norm": 16.31560707092285, - "learning_rate": 7.390999298386289e-06, - "loss": 0.9241, + "epoch": 0.79, + "grad_norm": 8.438365936279297, + "learning_rate": 1.4737062293435971e-05, + "loss": 1.3627, "step": 6291 }, { - "epoch": 1.89, - "grad_norm": 132.97479248046875, - "learning_rate": 7.3889946877818984e-06, - "loss": 2.0392, + "epoch": 0.79, + "grad_norm": 46.933963775634766, + "learning_rate": 1.4736225578379283e-05, + "loss": 3.0685, "step": 6292 }, { - "epoch": 1.89, - "grad_norm": 26.72028350830078, - "learning_rate": 7.386990077177509e-06, - "loss": 1.2421, + "epoch": 0.79, + "grad_norm": 8.370945930480957, + "learning_rate": 1.4735388863322597e-05, + "loss": 1.3335, "step": 6293 }, { - "epoch": 1.89, - "grad_norm": 7.662388324737549, - "learning_rate": 7.3849854665731195e-06, - "loss": 1.324, + "epoch": 0.79, + "grad_norm": 10.044309616088867, + "learning_rate": 1.4734552148265907e-05, + "loss": 1.9237, "step": 6294 }, { - "epoch": 1.89, - "grad_norm": 17.365476608276367, - "learning_rate": 7.382980855968729e-06, - "loss": 1.5321, + "epoch": 0.79, + "grad_norm": 12.739347457885742, + "learning_rate": 1.4733715433209221e-05, + "loss": 0.7081, "step": 6295 }, { - "epoch": 1.89, - "grad_norm": 12.46912956237793, - "learning_rate": 7.380976245364339e-06, - "loss": 1.4461, + "epoch": 0.79, + "grad_norm": 9.426420211791992, + "learning_rate": 1.4732878718152535e-05, + "loss": 1.0961, "step": 6296 }, { - "epoch": 1.89, - "grad_norm": 11.624770164489746, - "learning_rate": 7.378971634759948e-06, - "loss": 1.7608, + "epoch": 0.79, + "grad_norm": 9.006810188293457, + "learning_rate": 1.4732042003095848e-05, + "loss": 1.2268, "step": 6297 }, { - "epoch": 1.89, - "grad_norm": 18.62898826599121, - "learning_rate": 7.376967024155558e-06, - "loss": 1.0102, + "epoch": 0.79, + "grad_norm": 30.43055534362793, + "learning_rate": 1.4731205288039159e-05, + "loss": 1.4886, "step": 6298 }, { - "epoch": 1.89, - "grad_norm": 7.544929504394531, - "learning_rate": 7.374962413551169e-06, - "loss": 0.8058, + "epoch": 0.79, + "grad_norm": 21.447704315185547, + "learning_rate": 1.4730368572982472e-05, + "loss": 2.3547, "step": 6299 }, { - "epoch": 1.89, - "grad_norm": 8.430646896362305, - "learning_rate": 7.372957802946778e-06, - "loss": 1.2943, + "epoch": 0.79, + "grad_norm": 7.026731967926025, + "learning_rate": 1.4729531857925786e-05, + "loss": 0.6442, "step": 6300 }, { - "epoch": 1.89, - "grad_norm": 16.084999084472656, - "learning_rate": 7.370953192342388e-06, - "loss": 1.6322, + "epoch": 0.79, + "grad_norm": 4.663534641265869, + "learning_rate": 1.4728695142869096e-05, + "loss": 0.3037, "step": 6301 }, { - "epoch": 1.89, - "grad_norm": 26.333581924438477, - "learning_rate": 7.368948581737998e-06, - "loss": 1.3346, + "epoch": 0.79, + "grad_norm": 25.217287063598633, + "learning_rate": 1.472785842781241e-05, + "loss": 1.9375, "step": 6302 }, { - "epoch": 1.9, - "grad_norm": 12.999780654907227, - "learning_rate": 7.366943971133608e-06, - "loss": 1.312, + "epoch": 0.79, + "grad_norm": 3.978163719177246, + "learning_rate": 1.4727021712755722e-05, + "loss": 0.4616, "step": 6303 }, { - "epoch": 1.9, - "grad_norm": 20.878276824951172, - "learning_rate": 7.364939360529218e-06, - "loss": 1.235, + "epoch": 0.79, + "grad_norm": 13.031975746154785, + "learning_rate": 1.4726184997699035e-05, + "loss": 1.4241, "step": 6304 }, { - "epoch": 1.9, - "grad_norm": 39.3919677734375, - "learning_rate": 7.362934749924827e-06, - "loss": 1.3573, + "epoch": 0.79, + "grad_norm": 33.387062072753906, + "learning_rate": 1.4725348282642347e-05, + "loss": 2.9831, "step": 6305 }, { - "epoch": 1.9, - "grad_norm": 42.52452087402344, - "learning_rate": 7.360930139320438e-06, - "loss": 2.1336, + "epoch": 0.79, + "grad_norm": 11.21850872039795, + "learning_rate": 1.472451156758566e-05, + "loss": 2.8881, "step": 6306 }, { - "epoch": 1.9, - "grad_norm": 10.164920806884766, - "learning_rate": 7.358925528716048e-06, - "loss": 1.1969, + "epoch": 0.79, + "grad_norm": 22.746912002563477, + "learning_rate": 1.4723674852528973e-05, + "loss": 2.044, "step": 6307 }, { - "epoch": 1.9, - "grad_norm": 14.353740692138672, - "learning_rate": 7.356920918111657e-06, - "loss": 1.6327, + "epoch": 0.79, + "grad_norm": 14.89169692993164, + "learning_rate": 1.4722838137472283e-05, + "loss": 1.5603, "step": 6308 }, { - "epoch": 1.9, - "grad_norm": 11.760319709777832, - "learning_rate": 7.354916307507267e-06, - "loss": 1.8274, + "epoch": 0.79, + "grad_norm": 9.180404663085938, + "learning_rate": 1.4722001422415597e-05, + "loss": 3.4738, "step": 6309 }, { - "epoch": 1.9, - "grad_norm": 65.9858169555664, - "learning_rate": 7.3529116969028766e-06, - "loss": 1.0124, + "epoch": 0.79, + "grad_norm": 5.776002407073975, + "learning_rate": 1.472116470735891e-05, + "loss": 0.4565, "step": 6310 }, { - "epoch": 1.9, - "grad_norm": 13.734026908874512, - "learning_rate": 7.350907086298487e-06, - "loss": 1.2509, + "epoch": 0.79, + "grad_norm": 9.058882713317871, + "learning_rate": 1.4720327992302224e-05, + "loss": 0.7662, "step": 6311 }, { - "epoch": 1.9, - "grad_norm": 21.561870574951172, - "learning_rate": 7.348902475694098e-06, - "loss": 1.3681, + "epoch": 0.79, + "grad_norm": 26.460262298583984, + "learning_rate": 1.4719491277245534e-05, + "loss": 2.7587, "step": 6312 }, { - "epoch": 1.9, - "grad_norm": 51.21876525878906, - "learning_rate": 7.346897865089707e-06, - "loss": 2.3598, + "epoch": 0.79, + "grad_norm": 13.0591402053833, + "learning_rate": 1.4718654562188848e-05, + "loss": 0.6505, "step": 6313 }, { - "epoch": 1.9, - "grad_norm": 35.55440139770508, - "learning_rate": 7.344893254485317e-06, - "loss": 1.6387, + "epoch": 0.79, + "grad_norm": 13.908039093017578, + "learning_rate": 1.4717817847132162e-05, + "loss": 0.9457, "step": 6314 }, { - "epoch": 1.9, - "grad_norm": 9.630316734313965, - "learning_rate": 7.342888643880927e-06, - "loss": 1.0267, + "epoch": 0.79, + "grad_norm": 18.12594223022461, + "learning_rate": 1.4716981132075472e-05, + "loss": 1.9944, "step": 6315 }, { - "epoch": 1.9, - "grad_norm": 17.24791717529297, - "learning_rate": 7.340884033276536e-06, - "loss": 2.4284, + "epoch": 0.79, + "grad_norm": 7.108876705169678, + "learning_rate": 1.4716144417018786e-05, + "loss": 0.989, "step": 6316 }, { - "epoch": 1.9, - "grad_norm": 12.127941131591797, - "learning_rate": 7.338879422672146e-06, - "loss": 1.3257, + "epoch": 0.79, + "grad_norm": 8.037245750427246, + "learning_rate": 1.4715307701962098e-05, + "loss": 0.368, "step": 6317 }, { - "epoch": 1.9, - "grad_norm": 11.26683235168457, - "learning_rate": 7.336874812067757e-06, - "loss": 0.946, + "epoch": 0.79, + "grad_norm": 20.03801155090332, + "learning_rate": 1.4714470986905411e-05, + "loss": 2.1021, "step": 6318 }, { - "epoch": 1.9, - "grad_norm": 7.805086612701416, - "learning_rate": 7.3348702014633665e-06, - "loss": 0.6828, + "epoch": 0.79, + "grad_norm": 10.88740348815918, + "learning_rate": 1.4713634271848723e-05, + "loss": 0.5601, "step": 6319 }, { - "epoch": 1.9, - "grad_norm": 124.16524505615234, - "learning_rate": 7.3328655908589766e-06, - "loss": 2.6128, + "epoch": 0.79, + "grad_norm": 13.821710586547852, + "learning_rate": 1.4712797556792035e-05, + "loss": 2.988, "step": 6320 }, { - "epoch": 1.9, - "grad_norm": 26.658418655395508, - "learning_rate": 7.330860980254586e-06, - "loss": 1.359, + "epoch": 0.79, + "grad_norm": 25.372257232666016, + "learning_rate": 1.4711960841735349e-05, + "loss": 2.3201, "step": 6321 }, { - "epoch": 1.9, - "grad_norm": 63.66525650024414, - "learning_rate": 7.328856369650196e-06, - "loss": 2.1401, + "epoch": 0.79, + "grad_norm": 18.005115509033203, + "learning_rate": 1.4711124126678659e-05, + "loss": 2.3348, "step": 6322 }, { - "epoch": 1.9, - "grad_norm": 19.522897720336914, - "learning_rate": 7.326851759045805e-06, - "loss": 1.6017, + "epoch": 0.79, + "grad_norm": 11.153677940368652, + "learning_rate": 1.4710287411621973e-05, + "loss": 1.6604, "step": 6323 }, { - "epoch": 1.9, - "grad_norm": 18.54653549194336, - "learning_rate": 7.324847148441416e-06, - "loss": 1.902, + "epoch": 0.79, + "grad_norm": 7.8768510818481445, + "learning_rate": 1.4709450696565286e-05, + "loss": 0.9796, "step": 6324 }, { - "epoch": 1.9, - "grad_norm": 40.539337158203125, - "learning_rate": 7.322842537837026e-06, - "loss": 1.6362, + "epoch": 0.79, + "grad_norm": 7.6989874839782715, + "learning_rate": 1.47086139815086e-05, + "loss": 0.8554, "step": 6325 }, { - "epoch": 1.9, - "grad_norm": 24.200485229492188, - "learning_rate": 7.320837927232635e-06, - "loss": 2.5706, + "epoch": 0.79, + "grad_norm": 16.05092430114746, + "learning_rate": 1.470777726645191e-05, + "loss": 0.9129, "step": 6326 }, { - "epoch": 1.9, - "grad_norm": 38.600364685058594, - "learning_rate": 7.3188333166282455e-06, - "loss": 2.5773, + "epoch": 0.79, + "grad_norm": 20.265979766845703, + "learning_rate": 1.4706940551395224e-05, + "loss": 1.0729, "step": 6327 }, { - "epoch": 1.9, - "grad_norm": 13.316827774047852, - "learning_rate": 7.3168287060238555e-06, - "loss": 1.6111, + "epoch": 0.79, + "grad_norm": 17.98265266418457, + "learning_rate": 1.4706103836338536e-05, + "loss": 1.3576, "step": 6328 }, { - "epoch": 1.9, - "grad_norm": 20.68040657043457, - "learning_rate": 7.314824095419465e-06, - "loss": 1.8822, + "epoch": 0.79, + "grad_norm": 10.445375442504883, + "learning_rate": 1.4705267121281848e-05, + "loss": 1.1647, "step": 6329 }, { - "epoch": 1.9, - "grad_norm": 20.918405532836914, - "learning_rate": 7.312819484815076e-06, - "loss": 0.873, + "epoch": 0.79, + "grad_norm": 21.013591766357422, + "learning_rate": 1.4704430406225161e-05, + "loss": 1.1448, "step": 6330 }, { - "epoch": 1.9, - "grad_norm": 19.907466888427734, - "learning_rate": 7.310814874210686e-06, - "loss": 1.2545, + "epoch": 0.79, + "grad_norm": 17.206157684326172, + "learning_rate": 1.4703593691168473e-05, + "loss": 2.1738, "step": 6331 }, { - "epoch": 1.9, - "grad_norm": 16.205663681030273, - "learning_rate": 7.308810263606295e-06, - "loss": 1.2967, + "epoch": 0.79, + "grad_norm": 15.27292251586914, + "learning_rate": 1.4702756976111787e-05, + "loss": 2.8595, "step": 6332 }, { - "epoch": 1.9, - "grad_norm": 13.830738067626953, - "learning_rate": 7.306805653001905e-06, - "loss": 0.9852, + "epoch": 0.79, + "grad_norm": 14.216376304626465, + "learning_rate": 1.4701920261055099e-05, + "loss": 1.8285, "step": 6333 }, { - "epoch": 1.9, - "grad_norm": 12.994755744934082, - "learning_rate": 7.304801042397514e-06, - "loss": 1.0101, + "epoch": 0.79, + "grad_norm": 71.95661163330078, + "learning_rate": 1.4701083545998411e-05, + "loss": 3.5818, "step": 6334 }, { - "epoch": 1.9, - "grad_norm": 10.922076225280762, - "learning_rate": 7.3027964317931244e-06, - "loss": 1.1132, + "epoch": 0.8, + "grad_norm": 8.187115669250488, + "learning_rate": 1.4700246830941725e-05, + "loss": 2.0697, "step": 6335 }, { - "epoch": 1.9, - "grad_norm": 14.2009859085083, - "learning_rate": 7.300791821188735e-06, - "loss": 1.1271, + "epoch": 0.8, + "grad_norm": 11.393028259277344, + "learning_rate": 1.4699410115885035e-05, + "loss": 0.9834, "step": 6336 }, { - "epoch": 1.91, - "grad_norm": 13.78223705291748, - "learning_rate": 7.298787210584345e-06, - "loss": 1.4629, + "epoch": 0.8, + "grad_norm": 25.30308723449707, + "learning_rate": 1.4698573400828349e-05, + "loss": 4.2712, "step": 6337 }, { - "epoch": 1.91, - "grad_norm": 47.76295852661133, - "learning_rate": 7.296782599979955e-06, - "loss": 1.6983, + "epoch": 0.8, + "grad_norm": 18.839860916137695, + "learning_rate": 1.4697736685771662e-05, + "loss": 2.2294, "step": 6338 }, { - "epoch": 1.91, - "grad_norm": 26.619237899780273, - "learning_rate": 7.294777989375565e-06, - "loss": 2.0114, + "epoch": 0.8, + "grad_norm": 37.84192657470703, + "learning_rate": 1.4696899970714976e-05, + "loss": 1.652, "step": 6339 }, { - "epoch": 1.91, - "grad_norm": 10.558842658996582, - "learning_rate": 7.292773378771174e-06, - "loss": 0.7375, + "epoch": 0.8, + "grad_norm": 11.718338966369629, + "learning_rate": 1.4696063255658286e-05, + "loss": 1.5113, "step": 6340 }, { - "epoch": 1.91, - "grad_norm": 11.236355781555176, - "learning_rate": 7.290768768166784e-06, - "loss": 1.406, + "epoch": 0.8, + "grad_norm": 3.9569454193115234, + "learning_rate": 1.46952265406016e-05, + "loss": 0.1657, "step": 6341 }, { - "epoch": 1.91, - "grad_norm": 10.12108325958252, - "learning_rate": 7.288764157562395e-06, - "loss": 1.4366, + "epoch": 0.8, + "grad_norm": 40.034698486328125, + "learning_rate": 1.4694389825544912e-05, + "loss": 1.7666, "step": 6342 }, { - "epoch": 1.91, - "grad_norm": 23.887067794799805, - "learning_rate": 7.286759546958004e-06, - "loss": 1.2171, + "epoch": 0.8, + "grad_norm": 10.887019157409668, + "learning_rate": 1.4693553110488224e-05, + "loss": 0.9377, "step": 6343 }, { - "epoch": 1.91, - "grad_norm": 11.277857780456543, - "learning_rate": 7.284754936353614e-06, - "loss": 1.1178, + "epoch": 0.8, + "grad_norm": 13.77903938293457, + "learning_rate": 1.4692716395431537e-05, + "loss": 0.883, "step": 6344 }, { - "epoch": 1.91, - "grad_norm": 8.617938041687012, - "learning_rate": 7.282750325749224e-06, - "loss": 0.9352, + "epoch": 0.8, + "grad_norm": 10.93370532989502, + "learning_rate": 1.469187968037485e-05, + "loss": 1.1001, "step": 6345 }, { - "epoch": 1.91, - "grad_norm": 7.352804660797119, - "learning_rate": 7.280745715144834e-06, - "loss": 1.1608, + "epoch": 0.8, + "grad_norm": 18.387582778930664, + "learning_rate": 1.4691042965318163e-05, + "loss": 2.2495, "step": 6346 }, { - "epoch": 1.91, - "grad_norm": 19.099529266357422, - "learning_rate": 7.278741104540443e-06, - "loss": 1.3358, + "epoch": 0.8, + "grad_norm": 43.04143142700195, + "learning_rate": 1.4690206250261473e-05, + "loss": 1.8831, "step": 6347 }, { - "epoch": 1.91, - "grad_norm": 7.905605792999268, - "learning_rate": 7.276736493936053e-06, - "loss": 1.8386, + "epoch": 0.8, + "grad_norm": 6.625004291534424, + "learning_rate": 1.4689369535204787e-05, + "loss": 0.6768, "step": 6348 }, { - "epoch": 1.91, - "grad_norm": 23.328462600708008, - "learning_rate": 7.274731883331664e-06, - "loss": 1.5516, + "epoch": 0.8, + "grad_norm": 7.089539051055908, + "learning_rate": 1.46885328201481e-05, + "loss": 0.7392, "step": 6349 }, { - "epoch": 1.91, - "grad_norm": 55.610599517822266, - "learning_rate": 7.272727272727273e-06, - "loss": 1.2527, + "epoch": 0.8, + "grad_norm": 35.005977630615234, + "learning_rate": 1.468769610509141e-05, + "loss": 3.1561, "step": 6350 }, { - "epoch": 1.91, - "grad_norm": 14.633689880371094, - "learning_rate": 7.270722662122883e-06, - "loss": 1.4997, + "epoch": 0.8, + "grad_norm": 10.768105506896973, + "learning_rate": 1.4686859390034724e-05, + "loss": 2.7292, "step": 6351 }, { - "epoch": 1.91, - "grad_norm": 15.966309547424316, - "learning_rate": 7.268718051518493e-06, - "loss": 1.4803, + "epoch": 0.8, + "grad_norm": 12.711040496826172, + "learning_rate": 1.4686022674978038e-05, + "loss": 1.4311, "step": 6352 }, { - "epoch": 1.91, - "grad_norm": 15.084650039672852, - "learning_rate": 7.2667134409141026e-06, - "loss": 0.8466, + "epoch": 0.8, + "grad_norm": 9.029401779174805, + "learning_rate": 1.4685185959921352e-05, + "loss": 1.0504, "step": 6353 }, { - "epoch": 1.91, - "grad_norm": 24.249235153198242, - "learning_rate": 7.264708830309713e-06, - "loss": 1.2289, + "epoch": 0.8, + "grad_norm": 12.517583847045898, + "learning_rate": 1.4684349244864662e-05, + "loss": 2.1899, "step": 6354 }, { - "epoch": 1.91, - "grad_norm": 30.721338272094727, - "learning_rate": 7.262704219705324e-06, - "loss": 1.5891, + "epoch": 0.8, + "grad_norm": 18.929569244384766, + "learning_rate": 1.4683512529807976e-05, + "loss": 4.0764, "step": 6355 }, { - "epoch": 1.91, - "grad_norm": 16.246719360351562, - "learning_rate": 7.260699609100933e-06, - "loss": 1.4035, + "epoch": 0.8, + "grad_norm": 15.242326736450195, + "learning_rate": 1.4682675814751287e-05, + "loss": 1.8787, "step": 6356 }, { - "epoch": 1.91, - "grad_norm": 21.17243194580078, - "learning_rate": 7.258694998496543e-06, - "loss": 1.8486, + "epoch": 0.8, + "grad_norm": 19.440195083618164, + "learning_rate": 1.46818390996946e-05, + "loss": 1.7888, "step": 6357 }, { - "epoch": 1.91, - "grad_norm": 16.293087005615234, - "learning_rate": 7.256690387892152e-06, - "loss": 0.9408, + "epoch": 0.8, + "grad_norm": 16.57932472229004, + "learning_rate": 1.4681002384637913e-05, + "loss": 1.6356, "step": 6358 }, { - "epoch": 1.91, - "grad_norm": 96.68749237060547, - "learning_rate": 7.254685777287762e-06, - "loss": 2.8577, + "epoch": 0.8, + "grad_norm": 18.87995719909668, + "learning_rate": 1.4680165669581225e-05, + "loss": 2.8658, "step": 6359 }, { - "epoch": 1.91, - "grad_norm": 17.761423110961914, - "learning_rate": 7.2526811666833715e-06, - "loss": 2.8659, - "step": 6360 - }, - { - "epoch": 1.91, - "eval_loss": 0.17964056134223938, - "eval_runtime": 43.926, - "eval_samples_per_second": 33.67, - "eval_steps_per_second": 33.67, + "epoch": 0.8, + "grad_norm": 16.52043914794922, + "learning_rate": 1.4679328954524539e-05, + "loss": 1.8064, "step": 6360 }, { - "epoch": 1.91, - "grad_norm": 23.259288787841797, - "learning_rate": 7.250676556078982e-06, - "loss": 1.1937, + "epoch": 0.8, + "grad_norm": 10.4945068359375, + "learning_rate": 1.4678492239467849e-05, + "loss": 1.7405, "step": 6361 }, { - "epoch": 1.91, - "grad_norm": 28.89937400817871, - "learning_rate": 7.2486719454745925e-06, - "loss": 1.9101, + "epoch": 0.8, + "grad_norm": 36.318603515625, + "learning_rate": 1.4677655524411163e-05, + "loss": 2.7147, "step": 6362 }, { - "epoch": 1.91, - "grad_norm": 13.6145601272583, - "learning_rate": 7.246667334870202e-06, - "loss": 1.4405, + "epoch": 0.8, + "grad_norm": 10.358726501464844, + "learning_rate": 1.4676818809354476e-05, + "loss": 0.9465, "step": 6363 }, { - "epoch": 1.91, - "grad_norm": 28.463932037353516, - "learning_rate": 7.244662724265812e-06, - "loss": 1.7418, + "epoch": 0.8, + "grad_norm": 11.27572250366211, + "learning_rate": 1.4675982094297787e-05, + "loss": 3.3759, "step": 6364 }, { - "epoch": 1.91, - "grad_norm": 29.748435974121094, - "learning_rate": 7.242658113661422e-06, - "loss": 1.7554, + "epoch": 0.8, + "grad_norm": 8.469902038574219, + "learning_rate": 1.46751453792411e-05, + "loss": 0.8402, "step": 6365 }, { - "epoch": 1.91, - "grad_norm": 10.47834300994873, - "learning_rate": 7.240653503057031e-06, - "loss": 0.9418, + "epoch": 0.8, + "grad_norm": 17.69377899169922, + "learning_rate": 1.4674308664184414e-05, + "loss": 2.8532, "step": 6366 }, { - "epoch": 1.91, - "grad_norm": 66.0386962890625, - "learning_rate": 7.238648892452642e-06, - "loss": 1.4975, + "epoch": 0.8, + "grad_norm": 10.347311019897461, + "learning_rate": 1.4673471949127727e-05, + "loss": 1.5435, "step": 6367 }, { - "epoch": 1.91, - "grad_norm": 17.965469360351562, - "learning_rate": 7.236644281848252e-06, - "loss": 1.2229, + "epoch": 0.8, + "grad_norm": 10.75600814819336, + "learning_rate": 1.4672635234071038e-05, + "loss": 0.7196, "step": 6368 }, { - "epoch": 1.91, - "grad_norm": 12.599654197692871, - "learning_rate": 7.234639671243861e-06, - "loss": 1.606, + "epoch": 0.8, + "grad_norm": 76.24691009521484, + "learning_rate": 1.4671798519014351e-05, + "loss": 2.5328, "step": 6369 }, { - "epoch": 1.92, - "grad_norm": 17.390832901000977, - "learning_rate": 7.2326350606394715e-06, - "loss": 1.387, + "epoch": 0.8, + "grad_norm": 32.28241729736328, + "learning_rate": 1.4670961803957663e-05, + "loss": 1.665, "step": 6370 }, { - "epoch": 1.92, - "grad_norm": 15.174606323242188, - "learning_rate": 7.230630450035081e-06, - "loss": 1.3947, + "epoch": 0.8, + "grad_norm": 12.82848072052002, + "learning_rate": 1.4670125088900975e-05, + "loss": 0.4803, "step": 6371 }, { - "epoch": 1.92, - "grad_norm": 16.567323684692383, - "learning_rate": 7.228625839430691e-06, - "loss": 1.5866, + "epoch": 0.8, + "grad_norm": 8.32317066192627, + "learning_rate": 1.4669288373844289e-05, + "loss": 0.5588, "step": 6372 }, { - "epoch": 1.92, - "grad_norm": 33.17914962768555, - "learning_rate": 7.226621228826302e-06, - "loss": 1.8752, + "epoch": 0.8, + "grad_norm": 18.013607025146484, + "learning_rate": 1.4668451658787601e-05, + "loss": 0.9563, "step": 6373 }, { - "epoch": 1.92, - "grad_norm": 18.26946258544922, - "learning_rate": 7.224616618221911e-06, - "loss": 1.1493, + "epoch": 0.8, + "grad_norm": 10.781624794006348, + "learning_rate": 1.4667614943730915e-05, + "loss": 1.842, "step": 6374 }, { - "epoch": 1.92, - "grad_norm": 26.841148376464844, - "learning_rate": 7.222612007617521e-06, - "loss": 1.86, + "epoch": 0.8, + "grad_norm": 6.22960901260376, + "learning_rate": 1.4666778228674225e-05, + "loss": 1.6661, "step": 6375 }, { - "epoch": 1.92, - "grad_norm": 15.99163818359375, - "learning_rate": 7.220607397013131e-06, - "loss": 1.7793, + "epoch": 0.8, + "grad_norm": 9.189697265625, + "learning_rate": 1.4665941513617538e-05, + "loss": 2.0607, "step": 6376 }, { - "epoch": 1.92, - "grad_norm": 85.62297821044922, - "learning_rate": 7.21860278640874e-06, - "loss": 2.2498, + "epoch": 0.8, + "grad_norm": 51.77899932861328, + "learning_rate": 1.4665104798560852e-05, + "loss": 1.2212, "step": 6377 }, { - "epoch": 1.92, - "grad_norm": 14.444759368896484, - "learning_rate": 7.2165981758043504e-06, - "loss": 1.0914, + "epoch": 0.8, + "grad_norm": 14.942885398864746, + "learning_rate": 1.4664268083504162e-05, + "loss": 2.9491, "step": 6378 }, { - "epoch": 1.92, - "grad_norm": 14.665372848510742, - "learning_rate": 7.214593565199961e-06, - "loss": 2.3208, + "epoch": 0.8, + "grad_norm": 6.897355079650879, + "learning_rate": 1.4663431368447476e-05, + "loss": 0.2136, "step": 6379 }, { - "epoch": 1.92, - "grad_norm": 6.8967604637146, - "learning_rate": 7.212588954595571e-06, - "loss": 1.0709, + "epoch": 0.8, + "grad_norm": 6.80172872543335, + "learning_rate": 1.466259465339079e-05, + "loss": 1.2004, "step": 6380 }, { - "epoch": 1.92, - "grad_norm": 27.484949111938477, - "learning_rate": 7.210584343991181e-06, - "loss": 1.6652, + "epoch": 0.8, + "grad_norm": 18.366886138916016, + "learning_rate": 1.4661757938334102e-05, + "loss": 1.1385, "step": 6381 }, { - "epoch": 1.92, - "grad_norm": 36.965274810791016, - "learning_rate": 7.20857973338679e-06, - "loss": 2.1163, + "epoch": 0.8, + "grad_norm": 38.32312774658203, + "learning_rate": 1.4660921223277414e-05, + "loss": 2.4598, "step": 6382 }, { - "epoch": 1.92, - "grad_norm": 16.792247772216797, - "learning_rate": 7.2065751227824e-06, - "loss": 1.4545, + "epoch": 0.8, + "grad_norm": 10.939009666442871, + "learning_rate": 1.4660084508220727e-05, + "loss": 1.1789, "step": 6383 }, { - "epoch": 1.92, - "grad_norm": 18.809303283691406, - "learning_rate": 7.204570512178009e-06, - "loss": 1.9265, + "epoch": 0.8, + "grad_norm": 19.45528221130371, + "learning_rate": 1.465924779316404e-05, + "loss": 0.8006, "step": 6384 }, { - "epoch": 1.92, - "grad_norm": 23.61911964416504, - "learning_rate": 7.20256590157362e-06, - "loss": 1.4629, + "epoch": 0.8, + "grad_norm": 29.048940658569336, + "learning_rate": 1.4658411078107351e-05, + "loss": 2.3528, "step": 6385 }, { - "epoch": 1.92, - "grad_norm": 15.787665367126465, - "learning_rate": 7.20056129096923e-06, - "loss": 1.5397, + "epoch": 0.8, + "grad_norm": 20.245954513549805, + "learning_rate": 1.4657574363050665e-05, + "loss": 2.627, "step": 6386 }, { - "epoch": 1.92, - "grad_norm": 25.38530158996582, - "learning_rate": 7.1985566803648395e-06, - "loss": 1.9278, + "epoch": 0.8, + "grad_norm": 14.931648254394531, + "learning_rate": 1.4656737647993977e-05, + "loss": 1.7569, "step": 6387 }, { - "epoch": 1.92, - "grad_norm": 52.69976806640625, - "learning_rate": 7.19655206976045e-06, - "loss": 1.0921, + "epoch": 0.8, + "grad_norm": 11.111387252807617, + "learning_rate": 1.465590093293729e-05, + "loss": 1.1751, "step": 6388 }, { - "epoch": 1.92, - "grad_norm": 12.26889419555664, - "learning_rate": 7.19454745915606e-06, - "loss": 0.9164, + "epoch": 0.8, + "grad_norm": 9.465842247009277, + "learning_rate": 1.46550642178806e-05, + "loss": 1.4811, "step": 6389 }, { - "epoch": 1.92, - "grad_norm": 11.600250244140625, - "learning_rate": 7.192542848551669e-06, - "loss": 0.8845, + "epoch": 0.8, + "grad_norm": 7.897397041320801, + "learning_rate": 1.4654227502823914e-05, + "loss": 0.426, "step": 6390 }, { - "epoch": 1.92, - "grad_norm": 21.00747299194336, - "learning_rate": 7.190538237947279e-06, - "loss": 1.6517, + "epoch": 0.8, + "grad_norm": 6.984261512756348, + "learning_rate": 1.4653390787767228e-05, + "loss": 0.326, "step": 6391 }, { - "epoch": 1.92, - "grad_norm": 34.60345458984375, - "learning_rate": 7.18853362734289e-06, - "loss": 1.5433, + "epoch": 0.8, + "grad_norm": 34.08882141113281, + "learning_rate": 1.4652554072710538e-05, + "loss": 1.84, "step": 6392 }, { - "epoch": 1.92, - "grad_norm": 11.045711517333984, - "learning_rate": 7.186529016738499e-06, - "loss": 1.2821, + "epoch": 0.8, + "grad_norm": 12.20494270324707, + "learning_rate": 1.4651717357653852e-05, + "loss": 2.2515, "step": 6393 }, { - "epoch": 1.92, - "grad_norm": 15.181732177734375, - "learning_rate": 7.184524406134109e-06, - "loss": 1.382, + "epoch": 0.8, + "grad_norm": 11.835962295532227, + "learning_rate": 1.4650880642597165e-05, + "loss": 0.2851, "step": 6394 }, { - "epoch": 1.92, - "grad_norm": 15.159256935119629, - "learning_rate": 7.1825197955297185e-06, - "loss": 1.1485, + "epoch": 0.8, + "grad_norm": 22.450462341308594, + "learning_rate": 1.4650043927540477e-05, + "loss": 1.2688, "step": 6395 }, { - "epoch": 1.92, - "grad_norm": 18.364330291748047, - "learning_rate": 7.1805151849253286e-06, - "loss": 1.5439, + "epoch": 0.8, + "grad_norm": 11.962854385375977, + "learning_rate": 1.464920721248379e-05, + "loss": 1.7477, "step": 6396 }, { - "epoch": 1.92, - "grad_norm": 25.671911239624023, - "learning_rate": 7.178510574320938e-06, - "loss": 1.1944, + "epoch": 0.8, + "grad_norm": 21.87105941772461, + "learning_rate": 1.4648370497427103e-05, + "loss": 1.4168, "step": 6397 }, { - "epoch": 1.92, - "grad_norm": 20.186847686767578, - "learning_rate": 7.176505963716549e-06, - "loss": 1.7509, + "epoch": 0.8, + "grad_norm": 17.599401473999023, + "learning_rate": 1.4647533782370415e-05, + "loss": 1.0415, "step": 6398 }, { - "epoch": 1.92, - "grad_norm": 13.25575065612793, - "learning_rate": 7.174501353112159e-06, - "loss": 0.9348, + "epoch": 0.8, + "grad_norm": 18.64591407775879, + "learning_rate": 1.4646697067313727e-05, + "loss": 1.5148, "step": 6399 }, { - "epoch": 1.92, - "grad_norm": 16.35478401184082, - "learning_rate": 7.172496742507769e-06, - "loss": 1.4031, + "epoch": 0.8, + "grad_norm": 14.95875072479248, + "learning_rate": 1.4645860352257039e-05, + "loss": 1.3421, "step": 6400 }, { - "epoch": 1.92, - "grad_norm": 13.498659133911133, - "learning_rate": 7.170492131903378e-06, - "loss": 1.4828, + "epoch": 0.8, + "eval_loss": 0.09361087530851364, + "eval_runtime": 100.2017, + "eval_samples_per_second": 35.349, + "eval_steps_per_second": 35.349, + "step": 6400 + }, + { + "epoch": 0.8, + "grad_norm": 9.143239974975586, + "learning_rate": 1.4645023637200353e-05, + "loss": 2.1652, "step": 6401 }, { - "epoch": 1.92, - "grad_norm": 88.43494415283203, - "learning_rate": 7.168487521298988e-06, - "loss": 1.9878, + "epoch": 0.8, + "grad_norm": 24.113967895507812, + "learning_rate": 1.4644186922143666e-05, + "loss": 3.1909, "step": 6402 }, { - "epoch": 1.93, - "grad_norm": 11.417867660522461, - "learning_rate": 7.1664829106945975e-06, - "loss": 1.5075, + "epoch": 0.8, + "grad_norm": 4.8510355949401855, + "learning_rate": 1.4643350207086976e-05, + "loss": 0.4873, "step": 6403 }, { - "epoch": 1.93, - "grad_norm": 12.581781387329102, - "learning_rate": 7.164478300090208e-06, - "loss": 1.1652, + "epoch": 0.8, + "grad_norm": 10.595499992370605, + "learning_rate": 1.464251349203029e-05, + "loss": 2.3981, "step": 6404 }, { - "epoch": 1.93, - "grad_norm": 19.474443435668945, - "learning_rate": 7.1624736894858185e-06, - "loss": 1.7053, + "epoch": 0.8, + "grad_norm": 15.347944259643555, + "learning_rate": 1.4641676776973604e-05, + "loss": 1.7141, "step": 6405 }, { - "epoch": 1.93, - "grad_norm": 21.434553146362305, - "learning_rate": 7.160469078881428e-06, - "loss": 2.3571, + "epoch": 0.8, + "grad_norm": 7.038366317749023, + "learning_rate": 1.4640840061916914e-05, + "loss": 0.3818, "step": 6406 }, { - "epoch": 1.93, - "grad_norm": 17.52594566345215, - "learning_rate": 7.158464468277038e-06, - "loss": 2.2389, + "epoch": 0.8, + "grad_norm": 16.829757690429688, + "learning_rate": 1.4640003346860228e-05, + "loss": 1.2497, "step": 6407 }, { - "epoch": 1.93, - "grad_norm": 26.64896583557129, - "learning_rate": 7.156459857672647e-06, - "loss": 1.8134, + "epoch": 0.8, + "grad_norm": 18.520206451416016, + "learning_rate": 1.4639166631803541e-05, + "loss": 1.2516, "step": 6408 }, { - "epoch": 1.93, - "grad_norm": 13.084494590759277, - "learning_rate": 7.154455247068257e-06, - "loss": 1.584, + "epoch": 0.8, + "grad_norm": 13.922795295715332, + "learning_rate": 1.4638329916746853e-05, + "loss": 2.1521, "step": 6409 }, { - "epoch": 1.93, - "grad_norm": 12.483121871948242, - "learning_rate": 7.152450636463868e-06, - "loss": 1.8767, + "epoch": 0.8, + "grad_norm": 19.33915901184082, + "learning_rate": 1.4637493201690165e-05, + "loss": 1.6892, "step": 6410 }, { - "epoch": 1.93, - "grad_norm": 8.219230651855469, - "learning_rate": 7.150446025859477e-06, - "loss": 1.177, + "epoch": 0.8, + "grad_norm": 24.729047775268555, + "learning_rate": 1.4636656486633479e-05, + "loss": 0.8628, "step": 6411 }, { - "epoch": 1.93, - "grad_norm": 31.770601272583008, - "learning_rate": 7.148441415255087e-06, - "loss": 1.9082, + "epoch": 0.8, + "grad_norm": 22.032670974731445, + "learning_rate": 1.4635819771576791e-05, + "loss": 1.7879, "step": 6412 }, { - "epoch": 1.93, - "grad_norm": 18.06419563293457, - "learning_rate": 7.1464368046506975e-06, - "loss": 1.2848, + "epoch": 0.8, + "grad_norm": 15.246326446533203, + "learning_rate": 1.4634983056520103e-05, + "loss": 2.3433, "step": 6413 }, { - "epoch": 1.93, - "grad_norm": 6.39808464050293, - "learning_rate": 7.144432194046307e-06, - "loss": 0.4952, + "epoch": 0.8, + "grad_norm": 10.587986946105957, + "learning_rate": 1.4634146341463415e-05, + "loss": 2.016, "step": 6414 }, { - "epoch": 1.93, - "grad_norm": 16.47151756286621, - "learning_rate": 7.142427583441917e-06, - "loss": 1.4156, + "epoch": 0.81, + "grad_norm": 15.93343734741211, + "learning_rate": 1.4633309626406728e-05, + "loss": 2.5226, "step": 6415 }, { - "epoch": 1.93, - "grad_norm": 35.63877487182617, - "learning_rate": 7.140422972837528e-06, - "loss": 1.6205, + "epoch": 0.81, + "grad_norm": 18.305713653564453, + "learning_rate": 1.4632472911350042e-05, + "loss": 1.4843, "step": 6416 }, { - "epoch": 1.93, - "grad_norm": 16.693328857421875, - "learning_rate": 7.138418362233137e-06, - "loss": 1.1112, + "epoch": 0.81, + "grad_norm": 27.7530460357666, + "learning_rate": 1.4631636196293352e-05, + "loss": 1.9575, "step": 6417 }, { - "epoch": 1.93, - "grad_norm": 34.685909271240234, - "learning_rate": 7.136413751628747e-06, - "loss": 1.704, + "epoch": 0.81, + "grad_norm": 15.6149263381958, + "learning_rate": 1.4630799481236666e-05, + "loss": 1.0243, "step": 6418 }, { - "epoch": 1.93, - "grad_norm": 17.206161499023438, - "learning_rate": 7.134409141024356e-06, - "loss": 1.1891, + "epoch": 0.81, + "grad_norm": 18.455860137939453, + "learning_rate": 1.462996276617998e-05, + "loss": 2.7726, "step": 6419 }, { - "epoch": 1.93, - "grad_norm": 19.83222770690918, - "learning_rate": 7.132404530419966e-06, - "loss": 2.2121, + "epoch": 0.81, + "grad_norm": 19.104188919067383, + "learning_rate": 1.462912605112329e-05, + "loss": 2.0499, "step": 6420 }, { - "epoch": 1.93, - "grad_norm": 26.431730270385742, - "learning_rate": 7.130399919815576e-06, - "loss": 1.7221, + "epoch": 0.81, + "grad_norm": 12.461531639099121, + "learning_rate": 1.4628289336066604e-05, + "loss": 0.9269, "step": 6421 }, { - "epoch": 1.93, - "grad_norm": 9.490403175354004, - "learning_rate": 7.1283953092111865e-06, - "loss": 1.1032, + "epoch": 0.81, + "grad_norm": 10.216887474060059, + "learning_rate": 1.4627452621009917e-05, + "loss": 1.6536, "step": 6422 }, { - "epoch": 1.93, - "grad_norm": 52.717281341552734, - "learning_rate": 7.126390698606797e-06, - "loss": 1.5446, + "epoch": 0.81, + "grad_norm": 9.225250244140625, + "learning_rate": 1.4626615905953229e-05, + "loss": 1.0244, "step": 6423 }, { - "epoch": 1.93, - "grad_norm": 14.396468162536621, - "learning_rate": 7.124386088002406e-06, - "loss": 0.762, + "epoch": 0.81, + "grad_norm": 15.16700553894043, + "learning_rate": 1.4625779190896541e-05, + "loss": 0.8063, "step": 6424 }, { - "epoch": 1.93, - "grad_norm": 34.430179595947266, - "learning_rate": 7.122381477398016e-06, - "loss": 2.3349, + "epoch": 0.81, + "grad_norm": 41.029144287109375, + "learning_rate": 1.4624942475839855e-05, + "loss": 2.733, "step": 6425 }, { - "epoch": 1.93, - "grad_norm": 7.51977014541626, - "learning_rate": 7.120376866793626e-06, - "loss": 0.1446, + "epoch": 0.81, + "grad_norm": 10.162013053894043, + "learning_rate": 1.4624105760783167e-05, + "loss": 1.6527, "step": 6426 }, { - "epoch": 1.93, - "grad_norm": 11.881677627563477, - "learning_rate": 7.118372256189235e-06, - "loss": 1.2101, + "epoch": 0.81, + "grad_norm": 8.653647422790527, + "learning_rate": 1.4623269045726479e-05, + "loss": 1.5381, "step": 6427 }, { - "epoch": 1.93, - "grad_norm": 17.82876968383789, - "learning_rate": 7.116367645584846e-06, - "loss": 1.2283, + "epoch": 0.81, + "grad_norm": 14.731822967529297, + "learning_rate": 1.462243233066979e-05, + "loss": 0.7498, "step": 6428 }, { - "epoch": 1.93, - "grad_norm": 16.561769485473633, - "learning_rate": 7.114363034980456e-06, - "loss": 2.0483, + "epoch": 0.81, + "grad_norm": 19.83860206604004, + "learning_rate": 1.4621595615613104e-05, + "loss": 1.3507, "step": 6429 }, { - "epoch": 1.93, - "grad_norm": 35.203155517578125, - "learning_rate": 7.1123584243760655e-06, - "loss": 1.7524, + "epoch": 0.81, + "grad_norm": 9.492634773254395, + "learning_rate": 1.4620758900556418e-05, + "loss": 0.7376, "step": 6430 }, { - "epoch": 1.93, - "grad_norm": 24.895706176757812, - "learning_rate": 7.110353813771676e-06, - "loss": 1.6033, + "epoch": 0.81, + "grad_norm": 9.63869857788086, + "learning_rate": 1.4619922185499728e-05, + "loss": 1.3482, "step": 6431 }, { - "epoch": 1.93, - "grad_norm": 23.870237350463867, - "learning_rate": 7.108349203167285e-06, - "loss": 1.4187, + "epoch": 0.81, + "grad_norm": 8.897042274475098, + "learning_rate": 1.4619085470443042e-05, + "loss": 1.8334, "step": 6432 }, { - "epoch": 1.93, - "grad_norm": 24.825237274169922, - "learning_rate": 7.106344592562895e-06, - "loss": 1.594, + "epoch": 0.81, + "grad_norm": 21.0125789642334, + "learning_rate": 1.4618248755386355e-05, + "loss": 2.6799, "step": 6433 }, { - "epoch": 1.93, - "grad_norm": 8.913065910339355, - "learning_rate": 7.104339981958505e-06, - "loss": 1.0598, + "epoch": 0.81, + "grad_norm": 12.40052604675293, + "learning_rate": 1.4617412040329666e-05, + "loss": 2.913, "step": 6434 }, { - "epoch": 1.93, - "grad_norm": 41.398555755615234, - "learning_rate": 7.102335371354115e-06, - "loss": 2.3157, + "epoch": 0.81, + "grad_norm": 13.130826950073242, + "learning_rate": 1.461657532527298e-05, + "loss": 1.4907, "step": 6435 }, { - "epoch": 1.94, - "grad_norm": 18.028615951538086, - "learning_rate": 7.100330760749725e-06, - "loss": 2.0904, + "epoch": 0.81, + "grad_norm": 18.544252395629883, + "learning_rate": 1.4615738610216293e-05, + "loss": 1.7603, "step": 6436 }, { - "epoch": 1.94, - "grad_norm": 11.3342866897583, - "learning_rate": 7.098326150145335e-06, - "loss": 1.7827, + "epoch": 0.81, + "grad_norm": 14.421226501464844, + "learning_rate": 1.4614901895159605e-05, + "loss": 0.8415, "step": 6437 }, { - "epoch": 1.94, - "grad_norm": 20.189563751220703, - "learning_rate": 7.0963215395409445e-06, - "loss": 1.6553, + "epoch": 0.81, + "grad_norm": 10.523085594177246, + "learning_rate": 1.4614065180102917e-05, + "loss": 1.099, "step": 6438 }, { - "epoch": 1.94, - "grad_norm": 12.127889633178711, - "learning_rate": 7.094316928936555e-06, - "loss": 1.4503, + "epoch": 0.81, + "grad_norm": 20.280637741088867, + "learning_rate": 1.4613228465046229e-05, + "loss": 1.1873, "step": 6439 }, { - "epoch": 1.94, - "grad_norm": 37.8315315246582, - "learning_rate": 7.092312318332164e-06, - "loss": 1.7977, + "epoch": 0.81, + "grad_norm": 11.694278717041016, + "learning_rate": 1.4612391749989543e-05, + "loss": 1.9685, "step": 6440 }, { - "epoch": 1.94, - "grad_norm": 13.1519775390625, - "learning_rate": 7.090307707727775e-06, - "loss": 1.6393, + "epoch": 0.81, + "grad_norm": 19.720291137695312, + "learning_rate": 1.4611555034932854e-05, + "loss": 2.8196, "step": 6441 }, { - "epoch": 1.94, - "grad_norm": 35.46371841430664, - "learning_rate": 7.088303097123385e-06, - "loss": 2.3399, + "epoch": 0.81, + "grad_norm": 4.62885856628418, + "learning_rate": 1.4610718319876166e-05, + "loss": 1.571, "step": 6442 }, { - "epoch": 1.94, - "grad_norm": 12.697428703308105, - "learning_rate": 7.086298486518994e-06, - "loss": 2.5277, + "epoch": 0.81, + "grad_norm": 21.804670333862305, + "learning_rate": 1.460988160481948e-05, + "loss": 0.8583, "step": 6443 }, { - "epoch": 1.94, - "grad_norm": 26.87334632873535, - "learning_rate": 7.084293875914604e-06, - "loss": 2.0531, + "epoch": 0.81, + "grad_norm": 7.430726528167725, + "learning_rate": 1.4609044889762794e-05, + "loss": 0.3739, "step": 6444 }, { - "epoch": 1.94, - "grad_norm": 14.13779354095459, - "learning_rate": 7.082289265310213e-06, - "loss": 1.8136, + "epoch": 0.81, + "grad_norm": 7.364584445953369, + "learning_rate": 1.4608208174706104e-05, + "loss": 2.0177, "step": 6445 }, { - "epoch": 1.94, - "grad_norm": 12.27436637878418, - "learning_rate": 7.0802846547058235e-06, - "loss": 1.2326, + "epoch": 0.81, + "grad_norm": 7.745721340179443, + "learning_rate": 1.4607371459649418e-05, + "loss": 1.0005, "step": 6446 }, { - "epoch": 1.94, - "grad_norm": 19.918701171875, - "learning_rate": 7.078280044101434e-06, - "loss": 1.4257, + "epoch": 0.81, + "grad_norm": 15.221183776855469, + "learning_rate": 1.4606534744592731e-05, + "loss": 1.7872, "step": 6447 }, { - "epoch": 1.94, - "grad_norm": 13.029349327087402, - "learning_rate": 7.076275433497044e-06, - "loss": 1.7537, + "epoch": 0.81, + "grad_norm": 11.117356300354004, + "learning_rate": 1.4605698029536042e-05, + "loss": 0.4948, "step": 6448 }, { - "epoch": 1.94, - "grad_norm": 9.116921424865723, - "learning_rate": 7.074270822892654e-06, - "loss": 0.7983, + "epoch": 0.81, + "grad_norm": 22.149002075195312, + "learning_rate": 1.4604861314479355e-05, + "loss": 1.5521, "step": 6449 }, { - "epoch": 1.94, - "grad_norm": 30.727689743041992, - "learning_rate": 7.072266212288264e-06, - "loss": 1.2988, + "epoch": 0.81, + "grad_norm": 17.480669021606445, + "learning_rate": 1.4604024599422669e-05, + "loss": 1.7611, "step": 6450 }, { - "epoch": 1.94, - "grad_norm": 15.236931800842285, - "learning_rate": 7.070261601683873e-06, - "loss": 1.4535, + "epoch": 0.81, + "grad_norm": 10.693264961242676, + "learning_rate": 1.460318788436598e-05, + "loss": 1.9562, "step": 6451 }, { - "epoch": 1.94, - "grad_norm": 38.2266845703125, - "learning_rate": 7.068256991079483e-06, - "loss": 0.8669, + "epoch": 0.81, + "grad_norm": 8.48538589477539, + "learning_rate": 1.4602351169309293e-05, + "loss": 0.6823, "step": 6452 }, { - "epoch": 1.94, - "grad_norm": 42.05986785888672, - "learning_rate": 7.066252380475094e-06, - "loss": 1.7596, + "epoch": 0.81, + "grad_norm": 13.08189868927002, + "learning_rate": 1.4601514454252605e-05, + "loss": 2.1066, "step": 6453 }, { - "epoch": 1.94, - "grad_norm": 28.586145401000977, - "learning_rate": 7.064247769870703e-06, - "loss": 1.823, + "epoch": 0.81, + "grad_norm": 13.800010681152344, + "learning_rate": 1.4600677739195918e-05, + "loss": 1.4282, "step": 6454 }, { - "epoch": 1.94, - "grad_norm": 6.259302616119385, - "learning_rate": 7.062243159266313e-06, - "loss": 0.3312, + "epoch": 0.81, + "grad_norm": 15.999839782714844, + "learning_rate": 1.459984102413923e-05, + "loss": 1.8667, "step": 6455 }, { - "epoch": 1.94, - "grad_norm": 8.961036682128906, - "learning_rate": 7.060238548661923e-06, - "loss": 0.999, + "epoch": 0.81, + "grad_norm": 20.90586280822754, + "learning_rate": 1.4599004309082542e-05, + "loss": 2.6342, "step": 6456 }, { - "epoch": 1.94, - "grad_norm": 19.617019653320312, - "learning_rate": 7.058233938057533e-06, - "loss": 1.9528, + "epoch": 0.81, + "grad_norm": 24.189279556274414, + "learning_rate": 1.4598167594025856e-05, + "loss": 2.6589, "step": 6457 }, { - "epoch": 1.94, - "grad_norm": 30.003917694091797, - "learning_rate": 7.056229327453143e-06, - "loss": 1.5707, + "epoch": 0.81, + "grad_norm": 7.959173202514648, + "learning_rate": 1.459733087896917e-05, + "loss": 1.2131, "step": 6458 }, { - "epoch": 1.94, - "grad_norm": 16.13441276550293, - "learning_rate": 7.054224716848753e-06, - "loss": 2.1332, + "epoch": 0.81, + "grad_norm": 19.223169326782227, + "learning_rate": 1.459649416391248e-05, + "loss": 2.325, "step": 6459 }, { - "epoch": 1.94, - "grad_norm": 11.956790924072266, - "learning_rate": 7.052220106244363e-06, - "loss": 1.4419, + "epoch": 0.81, + "grad_norm": 7.551638126373291, + "learning_rate": 1.4595657448855793e-05, + "loss": 0.3251, "step": 6460 }, { - "epoch": 1.94, - "grad_norm": 17.357301712036133, - "learning_rate": 7.050215495639973e-06, - "loss": 1.1238, + "epoch": 0.81, + "grad_norm": 15.9935941696167, + "learning_rate": 1.4594820733799107e-05, + "loss": 1.999, "step": 6461 }, { - "epoch": 1.94, - "grad_norm": 16.655643463134766, - "learning_rate": 7.048210885035582e-06, - "loss": 2.1496, + "epoch": 0.81, + "grad_norm": 8.858784675598145, + "learning_rate": 1.4593984018742417e-05, + "loss": 0.5507, "step": 6462 }, { - "epoch": 1.94, - "grad_norm": 17.952760696411133, - "learning_rate": 7.046206274431192e-06, - "loss": 1.5901, + "epoch": 0.81, + "grad_norm": 18.052534103393555, + "learning_rate": 1.4593147303685731e-05, + "loss": 1.3051, "step": 6463 }, { - "epoch": 1.94, - "grad_norm": 20.259384155273438, - "learning_rate": 7.044201663826802e-06, - "loss": 1.2221, + "epoch": 0.81, + "grad_norm": 19.27977752685547, + "learning_rate": 1.4592310588629045e-05, + "loss": 1.6754, "step": 6464 }, { - "epoch": 1.94, - "grad_norm": 25.712038040161133, - "learning_rate": 7.0421970532224125e-06, - "loss": 2.141, + "epoch": 0.81, + "grad_norm": 22.184526443481445, + "learning_rate": 1.4591473873572357e-05, + "loss": 1.365, "step": 6465 }, { - "epoch": 1.94, - "grad_norm": 17.299470901489258, - "learning_rate": 7.040192442618023e-06, - "loss": 0.9474, + "epoch": 0.81, + "grad_norm": 15.561197280883789, + "learning_rate": 1.4590637158515669e-05, + "loss": 1.4414, "step": 6466 }, { - "epoch": 1.94, - "grad_norm": 18.08493423461914, - "learning_rate": 7.038187832013632e-06, - "loss": 1.711, + "epoch": 0.81, + "grad_norm": 14.853275299072266, + "learning_rate": 1.458980044345898e-05, + "loss": 3.0884, "step": 6467 }, { - "epoch": 1.94, - "grad_norm": 54.114418029785156, - "learning_rate": 7.036183221409242e-06, - "loss": 0.6773, + "epoch": 0.81, + "grad_norm": 9.874292373657227, + "learning_rate": 1.4588963728402294e-05, + "loss": 0.7533, "step": 6468 }, { - "epoch": 1.94, - "grad_norm": 13.001557350158691, - "learning_rate": 7.034178610804851e-06, - "loss": 1.1428, + "epoch": 0.81, + "grad_norm": 4.604920864105225, + "learning_rate": 1.4588127013345606e-05, + "loss": 0.4188, "step": 6469 }, { - "epoch": 1.95, - "grad_norm": 8.948216438293457, - "learning_rate": 7.032174000200461e-06, - "loss": 1.2129, + "epoch": 0.81, + "grad_norm": 19.53866195678711, + "learning_rate": 1.4587290298288918e-05, + "loss": 3.1209, "step": 6470 }, { - "epoch": 1.95, - "grad_norm": 96.37139892578125, - "learning_rate": 7.030169389596072e-06, - "loss": 1.3139, + "epoch": 0.81, + "grad_norm": 39.01898956298828, + "learning_rate": 1.4586453583232232e-05, + "loss": 1.2419, "step": 6471 }, { - "epoch": 1.95, - "grad_norm": 14.782844543457031, - "learning_rate": 7.0281647789916814e-06, - "loss": 1.4799, + "epoch": 0.81, + "grad_norm": 17.05350685119629, + "learning_rate": 1.4585616868175545e-05, + "loss": 1.5823, "step": 6472 }, { - "epoch": 1.95, - "grad_norm": 14.976436614990234, - "learning_rate": 7.0261601683872915e-06, - "loss": 1.4801, + "epoch": 0.81, + "grad_norm": 15.502795219421387, + "learning_rate": 1.4584780153118856e-05, + "loss": 0.9528, "step": 6473 }, { - "epoch": 1.95, - "grad_norm": 26.140769958496094, - "learning_rate": 7.024155557782902e-06, - "loss": 1.9828, + "epoch": 0.81, + "grad_norm": 36.6273307800293, + "learning_rate": 1.458394343806217e-05, + "loss": 3.6041, "step": 6474 }, { - "epoch": 1.95, - "grad_norm": 14.462868690490723, - "learning_rate": 7.022150947178511e-06, - "loss": 1.3346, + "epoch": 0.81, + "grad_norm": 7.87349271774292, + "learning_rate": 1.4583106723005483e-05, + "loss": 1.4797, "step": 6475 }, { - "epoch": 1.95, - "grad_norm": 12.547101974487305, - "learning_rate": 7.020146336574121e-06, - "loss": 1.8225, + "epoch": 0.81, + "grad_norm": 32.351585388183594, + "learning_rate": 1.4582270007948793e-05, + "loss": 4.1317, "step": 6476 }, { - "epoch": 1.95, - "grad_norm": 13.108290672302246, - "learning_rate": 7.01814172596973e-06, - "loss": 1.7796, + "epoch": 0.81, + "grad_norm": 23.48708724975586, + "learning_rate": 1.4581433292892107e-05, + "loss": 1.7281, "step": 6477 }, { - "epoch": 1.95, - "grad_norm": 20.673439025878906, - "learning_rate": 7.016137115365341e-06, - "loss": 1.7685, + "epoch": 0.81, + "grad_norm": 22.382984161376953, + "learning_rate": 1.458059657783542e-05, + "loss": 0.6815, "step": 6478 }, { - "epoch": 1.95, - "grad_norm": 9.68248462677002, - "learning_rate": 7.014132504760951e-06, - "loss": 1.1545, + "epoch": 0.81, + "grad_norm": 5.5734381675720215, + "learning_rate": 1.4579759862778732e-05, + "loss": 1.6547, "step": 6479 }, { - "epoch": 1.95, - "grad_norm": 11.824162483215332, - "learning_rate": 7.01212789415656e-06, - "loss": 1.1257, - "step": 6480 - }, - { - "epoch": 1.95, - "eval_loss": 0.17668664455413818, - "eval_runtime": 43.6307, - "eval_samples_per_second": 33.898, - "eval_steps_per_second": 33.898, + "epoch": 0.81, + "grad_norm": 17.19704246520996, + "learning_rate": 1.4578923147722044e-05, + "loss": 1.7212, "step": 6480 }, { - "epoch": 1.95, - "grad_norm": 19.132976531982422, - "learning_rate": 7.0101232835521705e-06, - "loss": 1.4466, + "epoch": 0.81, + "grad_norm": 9.862728118896484, + "learning_rate": 1.4578086432665356e-05, + "loss": 0.9488, "step": 6481 }, { - "epoch": 1.95, - "grad_norm": 13.578184127807617, - "learning_rate": 7.00811867294778e-06, - "loss": 1.8283, + "epoch": 0.81, + "grad_norm": 21.093936920166016, + "learning_rate": 1.457724971760867e-05, + "loss": 2.7171, "step": 6482 }, { - "epoch": 1.95, - "grad_norm": 14.636981010437012, - "learning_rate": 7.00611406234339e-06, - "loss": 1.5467, + "epoch": 0.81, + "grad_norm": 20.390226364135742, + "learning_rate": 1.4576413002551982e-05, + "loss": 1.7474, "step": 6483 }, { - "epoch": 1.95, - "grad_norm": 14.252835273742676, - "learning_rate": 7.004109451739001e-06, - "loss": 1.0799, + "epoch": 0.81, + "grad_norm": 8.369909286499023, + "learning_rate": 1.4575576287495294e-05, + "loss": 2.2826, "step": 6484 }, { - "epoch": 1.95, - "grad_norm": 7.7530837059021, - "learning_rate": 7.002104841134611e-06, - "loss": 1.7159, + "epoch": 0.81, + "grad_norm": 12.084911346435547, + "learning_rate": 1.4574739572438608e-05, + "loss": 1.048, "step": 6485 }, { - "epoch": 1.95, - "grad_norm": 19.219745635986328, - "learning_rate": 7.00010023053022e-06, - "loss": 1.3202, + "epoch": 0.81, + "grad_norm": 7.501756191253662, + "learning_rate": 1.4573902857381921e-05, + "loss": 1.6434, "step": 6486 }, { - "epoch": 1.95, - "grad_norm": 10.907150268554688, - "learning_rate": 6.99809561992583e-06, - "loss": 1.3228, + "epoch": 0.81, + "grad_norm": 16.92917251586914, + "learning_rate": 1.4573066142325232e-05, + "loss": 2.7004, "step": 6487 }, { - "epoch": 1.95, - "grad_norm": 7.514638423919678, - "learning_rate": 6.996091009321439e-06, - "loss": 1.1397, + "epoch": 0.81, + "grad_norm": 7.9938836097717285, + "learning_rate": 1.4572229427268545e-05, + "loss": 1.5723, "step": 6488 }, { - "epoch": 1.95, - "grad_norm": 21.773658752441406, - "learning_rate": 6.9940863987170495e-06, - "loss": 0.9278, + "epoch": 0.81, + "grad_norm": 38.44096755981445, + "learning_rate": 1.4571392712211859e-05, + "loss": 0.8792, "step": 6489 }, { - "epoch": 1.95, - "grad_norm": 10.506806373596191, - "learning_rate": 6.99208178811266e-06, - "loss": 1.709, + "epoch": 0.81, + "grad_norm": 19.077499389648438, + "learning_rate": 1.4570555997155169e-05, + "loss": 3.0232, "step": 6490 }, { - "epoch": 1.95, - "grad_norm": 15.891654014587402, - "learning_rate": 6.99007717750827e-06, - "loss": 1.4618, + "epoch": 0.81, + "grad_norm": 13.681564331054688, + "learning_rate": 1.4569719282098483e-05, + "loss": 1.5459, "step": 6491 }, { - "epoch": 1.95, - "grad_norm": 31.406612396240234, - "learning_rate": 6.98807256690388e-06, - "loss": 1.977, + "epoch": 0.81, + "grad_norm": 20.16901397705078, + "learning_rate": 1.4568882567041795e-05, + "loss": 1.6177, "step": 6492 }, { - "epoch": 1.95, - "grad_norm": 20.738195419311523, - "learning_rate": 6.986067956299489e-06, - "loss": 1.4744, + "epoch": 0.81, + "grad_norm": 10.907618522644043, + "learning_rate": 1.4568045851985108e-05, + "loss": 1.2013, "step": 6493 }, { - "epoch": 1.95, - "grad_norm": 17.87335968017578, - "learning_rate": 6.984063345695099e-06, - "loss": 1.8068, + "epoch": 0.81, + "grad_norm": 24.92099952697754, + "learning_rate": 1.456720913692842e-05, + "loss": 1.4485, "step": 6494 }, { - "epoch": 1.95, - "grad_norm": 13.646010398864746, - "learning_rate": 6.982058735090709e-06, - "loss": 0.9828, + "epoch": 0.82, + "grad_norm": 17.624778747558594, + "learning_rate": 1.4566372421871732e-05, + "loss": 1.7824, "step": 6495 }, { - "epoch": 1.95, - "grad_norm": 12.331052780151367, - "learning_rate": 6.980054124486319e-06, - "loss": 1.6284, + "epoch": 0.82, + "grad_norm": 10.988775253295898, + "learning_rate": 1.4565535706815046e-05, + "loss": 1.9106, "step": 6496 }, { - "epoch": 1.95, - "grad_norm": 52.41801452636719, - "learning_rate": 6.978049513881929e-06, - "loss": 0.973, + "epoch": 0.82, + "grad_norm": 6.560670852661133, + "learning_rate": 1.4564698991758358e-05, + "loss": 0.2385, "step": 6497 }, { - "epoch": 1.95, - "grad_norm": 27.965364456176758, - "learning_rate": 6.976044903277539e-06, - "loss": 1.0861, + "epoch": 0.82, + "grad_norm": 18.28946304321289, + "learning_rate": 1.456386227670167e-05, + "loss": 1.6398, "step": 6498 }, { - "epoch": 1.95, - "grad_norm": 25.636484146118164, - "learning_rate": 6.974040292673149e-06, - "loss": 1.8763, + "epoch": 0.82, + "grad_norm": 9.239535331726074, + "learning_rate": 1.4563025561644983e-05, + "loss": 0.9596, "step": 6499 }, { - "epoch": 1.95, - "grad_norm": 15.626718521118164, - "learning_rate": 6.972035682068759e-06, - "loss": 1.2224, + "epoch": 0.82, + "grad_norm": 42.755859375, + "learning_rate": 1.4562188846588297e-05, + "loss": 2.6544, "step": 6500 }, { - "epoch": 1.95, - "grad_norm": 12.968523979187012, - "learning_rate": 6.970031071464368e-06, - "loss": 1.4936, + "epoch": 0.82, + "grad_norm": 23.385692596435547, + "learning_rate": 1.4561352131531607e-05, + "loss": 2.5512, "step": 6501 }, { - "epoch": 1.95, - "grad_norm": 38.640384674072266, - "learning_rate": 6.968026460859979e-06, - "loss": 1.847, + "epoch": 0.82, + "grad_norm": 16.858610153198242, + "learning_rate": 1.4560515416474921e-05, + "loss": 1.8767, "step": 6502 }, { - "epoch": 1.96, - "grad_norm": 35.005760192871094, - "learning_rate": 6.966021850255589e-06, - "loss": 1.8344, + "epoch": 0.82, + "grad_norm": 13.347295761108398, + "learning_rate": 1.4559678701418235e-05, + "loss": 1.7668, "step": 6503 }, { - "epoch": 1.96, - "grad_norm": 59.90297317504883, - "learning_rate": 6.964017239651198e-06, - "loss": 1.5804, + "epoch": 0.82, + "grad_norm": 4.6510329246521, + "learning_rate": 1.4558841986361545e-05, + "loss": 0.4936, "step": 6504 }, { - "epoch": 1.96, - "grad_norm": 8.664277076721191, - "learning_rate": 6.962012629046808e-06, - "loss": 1.1554, + "epoch": 0.82, + "grad_norm": 10.455964088439941, + "learning_rate": 1.4558005271304859e-05, + "loss": 1.5058, "step": 6505 }, { - "epoch": 1.96, - "grad_norm": 15.182535171508789, - "learning_rate": 6.9600080184424175e-06, - "loss": 1.107, + "epoch": 0.82, + "grad_norm": 18.887109756469727, + "learning_rate": 1.455716855624817e-05, + "loss": 0.6563, "step": 6506 }, { - "epoch": 1.96, - "grad_norm": 25.827924728393555, - "learning_rate": 6.958003407838028e-06, - "loss": 1.1151, + "epoch": 0.82, + "grad_norm": 12.829666137695312, + "learning_rate": 1.4556331841191484e-05, + "loss": 2.7943, "step": 6507 }, { - "epoch": 1.96, - "grad_norm": 9.57219409942627, - "learning_rate": 6.9559987972336385e-06, - "loss": 1.3565, + "epoch": 0.82, + "grad_norm": 6.674988269805908, + "learning_rate": 1.4555495126134796e-05, + "loss": 0.8994, "step": 6508 }, { - "epoch": 1.96, - "grad_norm": 113.87895202636719, - "learning_rate": 6.953994186629248e-06, - "loss": 1.4153, + "epoch": 0.82, + "grad_norm": 25.241641998291016, + "learning_rate": 1.4554658411078108e-05, + "loss": 2.5382, "step": 6509 }, { - "epoch": 1.96, - "grad_norm": 17.828977584838867, - "learning_rate": 6.951989576024858e-06, - "loss": 1.8861, + "epoch": 0.82, + "grad_norm": 2.330019474029541, + "learning_rate": 1.4553821696021422e-05, + "loss": 0.0793, "step": 6510 }, { - "epoch": 1.96, - "grad_norm": 27.323060989379883, - "learning_rate": 6.949984965420468e-06, - "loss": 1.4276, + "epoch": 0.82, + "grad_norm": 19.723289489746094, + "learning_rate": 1.4552984980964732e-05, + "loss": 1.4636, "step": 6511 }, { - "epoch": 1.96, - "grad_norm": 11.81863021850586, - "learning_rate": 6.947980354816077e-06, - "loss": 0.785, + "epoch": 0.82, + "grad_norm": 17.301788330078125, + "learning_rate": 1.4552148265908046e-05, + "loss": 1.09, "step": 6512 }, { - "epoch": 1.96, - "grad_norm": 10.938015937805176, - "learning_rate": 6.945975744211687e-06, - "loss": 1.096, + "epoch": 0.82, + "grad_norm": 30.26137351989746, + "learning_rate": 1.455131155085136e-05, + "loss": 2.0605, "step": 6513 }, { - "epoch": 1.96, - "grad_norm": 11.981101036071777, - "learning_rate": 6.9439711336072965e-06, - "loss": 1.3268, + "epoch": 0.82, + "grad_norm": 19.499509811401367, + "learning_rate": 1.4550474835794673e-05, + "loss": 1.6498, "step": 6514 }, { - "epoch": 1.96, - "grad_norm": 19.58521842956543, - "learning_rate": 6.9419665230029074e-06, - "loss": 1.8049, + "epoch": 0.82, + "grad_norm": 12.36483383178711, + "learning_rate": 1.4549638120737983e-05, + "loss": 2.8728, "step": 6515 }, { - "epoch": 1.96, - "grad_norm": 24.513118743896484, - "learning_rate": 6.9399619123985175e-06, - "loss": 1.3997, + "epoch": 0.82, + "grad_norm": 20.424285888671875, + "learning_rate": 1.4548801405681297e-05, + "loss": 0.8148, "step": 6516 }, { - "epoch": 1.96, - "grad_norm": 13.00726318359375, - "learning_rate": 6.937957301794127e-06, - "loss": 2.1062, + "epoch": 0.82, + "grad_norm": 8.933712005615234, + "learning_rate": 1.454796469062461e-05, + "loss": 2.214, "step": 6517 }, { - "epoch": 1.96, - "grad_norm": 16.61869239807129, - "learning_rate": 6.935952691189737e-06, - "loss": 1.4637, + "epoch": 0.82, + "grad_norm": 28.269676208496094, + "learning_rate": 1.454712797556792e-05, + "loss": 1.7172, "step": 6518 }, { - "epoch": 1.96, - "grad_norm": 18.2260684967041, - "learning_rate": 6.933948080585347e-06, - "loss": 1.3907, + "epoch": 0.82, + "grad_norm": 9.066912651062012, + "learning_rate": 1.4546291260511234e-05, + "loss": 1.6103, "step": 6519 }, { - "epoch": 1.96, - "grad_norm": 27.4097957611084, - "learning_rate": 6.931943469980956e-06, - "loss": 2.0212, + "epoch": 0.82, + "grad_norm": 17.551225662231445, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.2698, "step": 6520 }, { - "epoch": 1.96, - "grad_norm": 13.984885215759277, - "learning_rate": 6.929938859376567e-06, - "loss": 0.8537, + "epoch": 0.82, + "grad_norm": 15.091469764709473, + "learning_rate": 1.454461783039786e-05, + "loss": 2.0922, "step": 6521 }, { - "epoch": 1.96, - "grad_norm": 29.465072631835938, - "learning_rate": 6.927934248772177e-06, - "loss": 2.082, + "epoch": 0.82, + "grad_norm": 11.744821548461914, + "learning_rate": 1.4543781115341172e-05, + "loss": 0.7955, "step": 6522 }, { - "epoch": 1.96, - "grad_norm": 13.194611549377441, - "learning_rate": 6.925929638167786e-06, - "loss": 1.1381, + "epoch": 0.82, + "grad_norm": 8.435943603515625, + "learning_rate": 1.4542944400284484e-05, + "loss": 0.6975, "step": 6523 }, { - "epoch": 1.96, - "grad_norm": 7.926638603210449, - "learning_rate": 6.9239250275633965e-06, - "loss": 0.9932, + "epoch": 0.82, + "grad_norm": 13.73911190032959, + "learning_rate": 1.4542107685227798e-05, + "loss": 0.9118, "step": 6524 }, { - "epoch": 1.96, - "grad_norm": 16.14949607849121, - "learning_rate": 6.921920416959006e-06, - "loss": 1.4698, + "epoch": 0.82, + "grad_norm": 8.145610809326172, + "learning_rate": 1.4541270970171108e-05, + "loss": 0.4499, "step": 6525 }, { - "epoch": 1.96, - "grad_norm": 33.72904968261719, - "learning_rate": 6.919915806354616e-06, - "loss": 3.2966, + "epoch": 0.82, + "grad_norm": 11.110485076904297, + "learning_rate": 1.4540434255114421e-05, + "loss": 2.4025, "step": 6526 }, { - "epoch": 1.96, - "grad_norm": 8.642800331115723, - "learning_rate": 6.917911195750227e-06, - "loss": 1.1102, + "epoch": 0.82, + "grad_norm": 91.34126281738281, + "learning_rate": 1.4539597540057735e-05, + "loss": 2.0706, "step": 6527 }, { - "epoch": 1.96, - "grad_norm": 12.295087814331055, - "learning_rate": 6.915906585145836e-06, - "loss": 0.9972, + "epoch": 0.82, + "grad_norm": 16.182498931884766, + "learning_rate": 1.4538760825001049e-05, + "loss": 1.4164, "step": 6528 }, { - "epoch": 1.96, - "grad_norm": 24.457801818847656, - "learning_rate": 6.913901974541446e-06, - "loss": 1.4332, + "epoch": 0.82, + "grad_norm": 9.682636260986328, + "learning_rate": 1.4537924109944359e-05, + "loss": 1.55, "step": 6529 }, { - "epoch": 1.96, - "grad_norm": 14.026710510253906, - "learning_rate": 6.911897363937055e-06, - "loss": 1.8641, + "epoch": 0.82, + "grad_norm": 16.95926856994629, + "learning_rate": 1.4537087394887673e-05, + "loss": 1.9937, "step": 6530 }, { - "epoch": 1.96, - "grad_norm": 12.17983341217041, - "learning_rate": 6.909892753332665e-06, - "loss": 1.4611, + "epoch": 0.82, + "grad_norm": 7.322328567504883, + "learning_rate": 1.4536250679830986e-05, + "loss": 1.2618, "step": 6531 }, { - "epoch": 1.96, - "grad_norm": 12.283072471618652, - "learning_rate": 6.9078881427282755e-06, - "loss": 1.3947, + "epoch": 0.82, + "grad_norm": 30.958463668823242, + "learning_rate": 1.4535413964774297e-05, + "loss": 0.4021, "step": 6532 }, { - "epoch": 1.96, - "grad_norm": 17.96526527404785, - "learning_rate": 6.9058835321238856e-06, - "loss": 1.7153, + "epoch": 0.82, + "grad_norm": 13.310428619384766, + "learning_rate": 1.453457724971761e-05, + "loss": 1.0993, "step": 6533 }, { - "epoch": 1.96, - "grad_norm": 24.099628448486328, - "learning_rate": 6.903878921519496e-06, - "loss": 1.2495, + "epoch": 0.82, + "grad_norm": 15.63068675994873, + "learning_rate": 1.4533740534660922e-05, + "loss": 1.9267, "step": 6534 }, { - "epoch": 1.96, - "grad_norm": 16.107059478759766, - "learning_rate": 6.901874310915106e-06, - "loss": 1.4728, + "epoch": 0.82, + "grad_norm": 21.709959030151367, + "learning_rate": 1.4532903819604236e-05, + "loss": 3.3146, "step": 6535 }, { - "epoch": 1.97, - "grad_norm": 15.214371681213379, - "learning_rate": 6.899869700310715e-06, - "loss": 1.1217, + "epoch": 0.82, + "grad_norm": 13.965682029724121, + "learning_rate": 1.4532067104547548e-05, + "loss": 3.9313, "step": 6536 }, { - "epoch": 1.97, - "grad_norm": 27.67184829711914, - "learning_rate": 6.897865089706325e-06, - "loss": 1.4113, + "epoch": 0.82, + "grad_norm": 14.858061790466309, + "learning_rate": 1.453123038949086e-05, + "loss": 1.5124, "step": 6537 }, { - "epoch": 1.97, - "grad_norm": 20.205726623535156, - "learning_rate": 6.895860479101934e-06, - "loss": 0.8228, + "epoch": 0.82, + "grad_norm": 50.09086608886719, + "learning_rate": 1.4530393674434173e-05, + "loss": 2.489, "step": 6538 }, { - "epoch": 1.97, - "grad_norm": 12.976736068725586, - "learning_rate": 6.893855868497545e-06, - "loss": 1.0787, + "epoch": 0.82, + "grad_norm": 26.609758377075195, + "learning_rate": 1.4529556959377484e-05, + "loss": 1.7796, "step": 6539 }, { - "epoch": 1.97, - "grad_norm": 141.27496337890625, - "learning_rate": 6.891851257893155e-06, - "loss": 2.659, + "epoch": 0.82, + "grad_norm": 10.551162719726562, + "learning_rate": 1.4528720244320797e-05, + "loss": 1.2116, "step": 6540 }, { - "epoch": 1.97, - "grad_norm": 25.662107467651367, - "learning_rate": 6.8898466472887645e-06, - "loss": 2.3812, + "epoch": 0.82, + "grad_norm": 13.381997108459473, + "learning_rate": 1.4527883529264111e-05, + "loss": 0.7883, "step": 6541 }, { - "epoch": 1.97, - "grad_norm": 17.610265731811523, - "learning_rate": 6.887842036684375e-06, - "loss": 1.1345, + "epoch": 0.82, + "grad_norm": 33.09524917602539, + "learning_rate": 1.4527046814207425e-05, + "loss": 2.4682, "step": 6542 }, { - "epoch": 1.97, - "grad_norm": 13.565105438232422, - "learning_rate": 6.885837426079985e-06, - "loss": 1.7477, + "epoch": 0.82, + "grad_norm": 13.487656593322754, + "learning_rate": 1.4526210099150735e-05, + "loss": 0.9119, "step": 6543 }, { - "epoch": 1.97, - "grad_norm": 72.72836303710938, - "learning_rate": 6.883832815475594e-06, - "loss": 1.0895, + "epoch": 0.82, + "grad_norm": 13.092456817626953, + "learning_rate": 1.4525373384094048e-05, + "loss": 3.033, "step": 6544 }, { - "epoch": 1.97, - "grad_norm": 11.967425346374512, - "learning_rate": 6.881828204871205e-06, - "loss": 1.2826, + "epoch": 0.82, + "grad_norm": 59.662330627441406, + "learning_rate": 1.452453666903736e-05, + "loss": 2.0694, "step": 6545 }, { - "epoch": 1.97, - "grad_norm": 27.99849510192871, - "learning_rate": 6.879823594266815e-06, - "loss": 1.4163, + "epoch": 0.82, + "grad_norm": 4.204308986663818, + "learning_rate": 1.4523699953980672e-05, + "loss": 0.1224, "step": 6546 }, { - "epoch": 1.97, - "grad_norm": 17.99080467224121, - "learning_rate": 6.877818983662424e-06, - "loss": 0.7908, + "epoch": 0.82, + "grad_norm": 6.319829940795898, + "learning_rate": 1.4522863238923986e-05, + "loss": 0.9945, "step": 6547 }, { - "epoch": 1.97, - "grad_norm": 8.94594955444336, - "learning_rate": 6.875814373058034e-06, - "loss": 1.5791, + "epoch": 0.82, + "grad_norm": 14.876279830932617, + "learning_rate": 1.4522026523867298e-05, + "loss": 1.4276, "step": 6548 }, { - "epoch": 1.97, - "grad_norm": 18.120492935180664, - "learning_rate": 6.8738097624536435e-06, - "loss": 1.5981, - "step": 6549 + "epoch": 0.82, + "grad_norm": 11.182560920715332, + "learning_rate": 1.4521189808810612e-05, + "loss": 1.499, + "step": 6549 }, { - "epoch": 1.97, - "grad_norm": 11.887974739074707, - "learning_rate": 6.871805151849254e-06, - "loss": 2.1877, + "epoch": 0.82, + "grad_norm": 23.092039108276367, + "learning_rate": 1.4520353093753924e-05, + "loss": 3.6536, "step": 6550 }, { - "epoch": 1.97, - "grad_norm": 14.82646369934082, - "learning_rate": 6.8698005412448645e-06, - "loss": 1.2886, + "epoch": 0.82, + "grad_norm": 16.563962936401367, + "learning_rate": 1.4519516378697236e-05, + "loss": 2.0491, "step": 6551 }, { - "epoch": 1.97, - "grad_norm": 15.733491897583008, - "learning_rate": 6.867795930640474e-06, - "loss": 0.836, + "epoch": 0.82, + "grad_norm": 11.452547073364258, + "learning_rate": 1.451867966364055e-05, + "loss": 1.4192, "step": 6552 }, { - "epoch": 1.97, - "grad_norm": 20.574148178100586, - "learning_rate": 6.865791320036084e-06, - "loss": 0.9531, + "epoch": 0.82, + "grad_norm": 19.012704849243164, + "learning_rate": 1.451784294858386e-05, + "loss": 2.8846, "step": 6553 }, { - "epoch": 1.97, - "grad_norm": 28.415647506713867, - "learning_rate": 6.863786709431693e-06, - "loss": 2.1882, + "epoch": 0.82, + "grad_norm": 27.29177474975586, + "learning_rate": 1.4517006233527173e-05, + "loss": 1.7701, "step": 6554 }, { - "epoch": 1.97, - "grad_norm": 17.647674560546875, - "learning_rate": 6.861782098827303e-06, - "loss": 2.3014, + "epoch": 0.82, + "grad_norm": 6.170631408691406, + "learning_rate": 1.4516169518470487e-05, + "loss": 0.7061, "step": 6555 }, { - "epoch": 1.97, - "grad_norm": 49.27579879760742, - "learning_rate": 6.859777488222913e-06, - "loss": 1.5323, + "epoch": 0.82, + "grad_norm": 8.111652374267578, + "learning_rate": 1.45153328034138e-05, + "loss": 0.8509, "step": 6556 }, { - "epoch": 1.97, - "grad_norm": 19.76232147216797, - "learning_rate": 6.8577728776185225e-06, - "loss": 1.6397, + "epoch": 0.82, + "grad_norm": 31.29617691040039, + "learning_rate": 1.451449608835711e-05, + "loss": 1.1399, "step": 6557 }, { - "epoch": 1.97, - "grad_norm": 11.850191116333008, - "learning_rate": 6.8557682670141334e-06, - "loss": 1.027, + "epoch": 0.82, + "grad_norm": 9.659380912780762, + "learning_rate": 1.4513659373300424e-05, + "loss": 1.1278, "step": 6558 }, { - "epoch": 1.97, - "grad_norm": 26.760108947753906, - "learning_rate": 6.8537636564097435e-06, - "loss": 1.7687, + "epoch": 0.82, + "grad_norm": 9.498473167419434, + "learning_rate": 1.4512822658243736e-05, + "loss": 1.8005, "step": 6559 }, { - "epoch": 1.97, - "grad_norm": 11.1150484085083, - "learning_rate": 6.851759045805353e-06, - "loss": 1.418, + "epoch": 0.82, + "grad_norm": 11.763265609741211, + "learning_rate": 1.4511985943187048e-05, + "loss": 1.965, "step": 6560 }, { - "epoch": 1.97, - "grad_norm": 23.722135543823242, - "learning_rate": 6.849754435200963e-06, - "loss": 1.0756, + "epoch": 0.82, + "grad_norm": 15.806600570678711, + "learning_rate": 1.4511149228130362e-05, + "loss": 1.2199, "step": 6561 }, { - "epoch": 1.97, - "grad_norm": 17.482393264770508, - "learning_rate": 6.847749824596572e-06, - "loss": 1.2066, + "epoch": 0.82, + "grad_norm": 13.062655448913574, + "learning_rate": 1.4510312513073674e-05, + "loss": 0.9208, "step": 6562 }, { - "epoch": 1.97, - "grad_norm": 17.04688835144043, - "learning_rate": 6.845745213992182e-06, - "loss": 1.4078, + "epoch": 0.82, + "grad_norm": 6.248857498168945, + "learning_rate": 1.4509475798016987e-05, + "loss": 0.5726, "step": 6563 }, { - "epoch": 1.97, - "grad_norm": 15.257699012756348, - "learning_rate": 6.843740603387793e-06, - "loss": 1.8723, + "epoch": 0.82, + "grad_norm": 19.46588706970215, + "learning_rate": 1.4508639082960298e-05, + "loss": 1.0639, "step": 6564 }, { - "epoch": 1.97, - "grad_norm": 13.324009895324707, - "learning_rate": 6.841735992783402e-06, - "loss": 1.3293, + "epoch": 0.82, + "grad_norm": 17.309402465820312, + "learning_rate": 1.4507802367903611e-05, + "loss": 1.03, "step": 6565 }, { - "epoch": 1.97, - "grad_norm": 18.284709930419922, - "learning_rate": 6.839731382179012e-06, - "loss": 1.8599, + "epoch": 0.82, + "grad_norm": 2.397465229034424, + "learning_rate": 1.4506965652846925e-05, + "loss": 0.0929, "step": 6566 }, { - "epoch": 1.97, - "grad_norm": 13.653274536132812, - "learning_rate": 6.837726771574622e-06, - "loss": 1.7576, + "epoch": 0.82, + "grad_norm": 13.140107154846191, + "learning_rate": 1.4506128937790235e-05, + "loss": 0.843, "step": 6567 }, { - "epoch": 1.97, - "grad_norm": 38.0052490234375, - "learning_rate": 6.835722160970232e-06, - "loss": 1.656, + "epoch": 0.82, + "grad_norm": 9.642655372619629, + "learning_rate": 1.4505292222733549e-05, + "loss": 0.7449, "step": 6568 }, { - "epoch": 1.98, - "grad_norm": 15.570902824401855, - "learning_rate": 6.833717550365842e-06, - "loss": 1.3572, + "epoch": 0.82, + "grad_norm": 18.0646915435791, + "learning_rate": 1.4504455507676863e-05, + "loss": 2.3933, "step": 6569 }, { - "epoch": 1.98, - "grad_norm": 10.287487030029297, - "learning_rate": 6.831712939761453e-06, - "loss": 1.0965, + "epoch": 0.82, + "grad_norm": 6.070803165435791, + "learning_rate": 1.4503618792620176e-05, + "loss": 0.9627, "step": 6570 }, { - "epoch": 1.98, - "grad_norm": 21.403841018676758, - "learning_rate": 6.829708329157062e-06, - "loss": 1.657, + "epoch": 0.82, + "grad_norm": 10.14867877960205, + "learning_rate": 1.4502782077563487e-05, + "loss": 1.2322, "step": 6571 }, { - "epoch": 1.98, - "grad_norm": 11.690693855285645, - "learning_rate": 6.827703718552672e-06, - "loss": 1.9188, + "epoch": 0.82, + "grad_norm": 24.438552856445312, + "learning_rate": 1.45019453625068e-05, + "loss": 0.8593, "step": 6572 }, { - "epoch": 1.98, - "grad_norm": 15.413796424865723, - "learning_rate": 6.825699107948281e-06, - "loss": 1.1102, + "epoch": 0.82, + "grad_norm": 10.15174388885498, + "learning_rate": 1.4501108647450112e-05, + "loss": 0.7729, "step": 6573 }, { - "epoch": 1.98, - "grad_norm": 14.91450023651123, - "learning_rate": 6.823694497343891e-06, - "loss": 1.0896, + "epoch": 0.83, + "grad_norm": 35.63993835449219, + "learning_rate": 1.4500271932393424e-05, + "loss": 1.5794, "step": 6574 }, { - "epoch": 1.98, - "grad_norm": 12.081074714660645, - "learning_rate": 6.821689886739501e-06, - "loss": 1.7378, + "epoch": 0.83, + "grad_norm": 13.550148963928223, + "learning_rate": 1.4499435217336738e-05, + "loss": 1.2075, "step": 6575 }, { - "epoch": 1.98, - "grad_norm": 16.04603385925293, - "learning_rate": 6.8196852761351116e-06, - "loss": 1.4417, + "epoch": 0.83, + "grad_norm": 24.06031036376953, + "learning_rate": 1.449859850228005e-05, + "loss": 3.1595, "step": 6576 }, { - "epoch": 1.98, - "grad_norm": 22.271512985229492, - "learning_rate": 6.817680665530722e-06, - "loss": 1.7555, + "epoch": 0.83, + "grad_norm": 31.2019100189209, + "learning_rate": 1.4497761787223363e-05, + "loss": 1.8097, "step": 6577 }, { - "epoch": 1.98, - "grad_norm": 13.81209945678711, - "learning_rate": 6.815676054926331e-06, - "loss": 1.1311, + "epoch": 0.83, + "grad_norm": 8.17690372467041, + "learning_rate": 1.4496925072166674e-05, + "loss": 0.6208, "step": 6578 }, { - "epoch": 1.98, - "grad_norm": 16.152433395385742, - "learning_rate": 6.813671444321941e-06, - "loss": 1.1026, + "epoch": 0.83, + "grad_norm": 19.666261672973633, + "learning_rate": 1.4496088357109987e-05, + "loss": 1.6847, "step": 6579 }, { - "epoch": 1.98, - "grad_norm": 12.994856834411621, - "learning_rate": 6.811666833717551e-06, - "loss": 1.1081, + "epoch": 0.83, + "grad_norm": 8.355585098266602, + "learning_rate": 1.4495251642053301e-05, + "loss": 1.4162, "step": 6580 }, { - "epoch": 1.98, - "grad_norm": 12.3396577835083, - "learning_rate": 6.80966222311316e-06, - "loss": 1.3073, + "epoch": 0.83, + "grad_norm": 17.03511619567871, + "learning_rate": 1.4494414926996611e-05, + "loss": 1.1782, "step": 6581 }, { - "epoch": 1.98, - "grad_norm": 8.903776168823242, - "learning_rate": 6.807657612508771e-06, - "loss": 0.9238, + "epoch": 0.83, + "grad_norm": 11.48168659210205, + "learning_rate": 1.4493578211939925e-05, + "loss": 1.9596, "step": 6582 }, { - "epoch": 1.98, - "grad_norm": 15.200712203979492, - "learning_rate": 6.805653001904381e-06, - "loss": 1.3466, + "epoch": 0.83, + "grad_norm": 9.669479370117188, + "learning_rate": 1.4492741496883238e-05, + "loss": 0.6663, "step": 6583 }, { - "epoch": 1.98, - "grad_norm": 19.97858428955078, - "learning_rate": 6.8036483912999905e-06, - "loss": 1.7572, + "epoch": 0.83, + "grad_norm": 12.308369636535645, + "learning_rate": 1.449190478182655e-05, + "loss": 1.0629, "step": 6584 }, { - "epoch": 1.98, - "grad_norm": 24.35478401184082, - "learning_rate": 6.801643780695601e-06, - "loss": 1.929, + "epoch": 0.83, + "grad_norm": 15.225822448730469, + "learning_rate": 1.4491068066769862e-05, + "loss": 1.1809, "step": 6585 }, { - "epoch": 1.98, - "grad_norm": 30.171939849853516, - "learning_rate": 6.79963917009121e-06, - "loss": 1.5492, + "epoch": 0.83, + "grad_norm": 15.169183731079102, + "learning_rate": 1.4490231351713176e-05, + "loss": 0.9538, "step": 6586 }, { - "epoch": 1.98, - "grad_norm": 12.768831253051758, - "learning_rate": 6.79763455948682e-06, - "loss": 1.0463, + "epoch": 0.83, + "grad_norm": 5.91043758392334, + "learning_rate": 1.4489394636656488e-05, + "loss": 0.4872, "step": 6587 }, { - "epoch": 1.98, - "grad_norm": 21.945470809936523, - "learning_rate": 6.795629948882431e-06, - "loss": 2.2842, + "epoch": 0.83, + "grad_norm": 19.87512969970703, + "learning_rate": 1.44885579215998e-05, + "loss": 1.5201, "step": 6588 }, { - "epoch": 1.98, - "grad_norm": 22.516357421875, - "learning_rate": 6.79362533827804e-06, - "loss": 2.0211, + "epoch": 0.83, + "grad_norm": 8.933917045593262, + "learning_rate": 1.4487721206543114e-05, + "loss": 0.555, "step": 6589 }, { - "epoch": 1.98, - "grad_norm": 8.880302429199219, - "learning_rate": 6.79162072767365e-06, - "loss": 1.2581, + "epoch": 0.83, + "grad_norm": 7.488944053649902, + "learning_rate": 1.4486884491486426e-05, + "loss": 0.2726, "step": 6590 }, { - "epoch": 1.98, - "grad_norm": 9.477561950683594, - "learning_rate": 6.7896161170692594e-06, - "loss": 0.7168, + "epoch": 0.83, + "grad_norm": 12.712186813354492, + "learning_rate": 1.448604777642974e-05, + "loss": 1.8749, "step": 6591 }, { - "epoch": 1.98, - "grad_norm": 21.51420783996582, - "learning_rate": 6.7876115064648695e-06, - "loss": 1.3912, + "epoch": 0.83, + "grad_norm": 14.857038497924805, + "learning_rate": 1.448521106137305e-05, + "loss": 1.1538, "step": 6592 }, { - "epoch": 1.98, - "grad_norm": 50.1191291809082, - "learning_rate": 6.78560689586048e-06, - "loss": 3.0844, + "epoch": 0.83, + "grad_norm": 12.747661590576172, + "learning_rate": 1.4484374346316363e-05, + "loss": 0.7235, "step": 6593 }, { - "epoch": 1.98, - "grad_norm": 19.533275604248047, - "learning_rate": 6.78360228525609e-06, - "loss": 2.1365, + "epoch": 0.83, + "grad_norm": 39.1489372253418, + "learning_rate": 1.4483537631259677e-05, + "loss": 1.893, "step": 6594 }, { - "epoch": 1.98, - "grad_norm": 11.014450073242188, - "learning_rate": 6.7815976746517e-06, - "loss": 0.7238, + "epoch": 0.83, + "grad_norm": 9.506407737731934, + "learning_rate": 1.4482700916202987e-05, + "loss": 2.2091, "step": 6595 }, { - "epoch": 1.98, - "grad_norm": 35.398197174072266, - "learning_rate": 6.77959306404731e-06, - "loss": 2.9231, + "epoch": 0.83, + "grad_norm": 21.36495590209961, + "learning_rate": 1.44818642011463e-05, + "loss": 2.8877, "step": 6596 }, { - "epoch": 1.98, - "grad_norm": 59.179141998291016, - "learning_rate": 6.777588453442919e-06, - "loss": 1.4903, + "epoch": 0.83, + "grad_norm": 12.15341854095459, + "learning_rate": 1.4481027486089614e-05, + "loss": 2.097, "step": 6597 }, { - "epoch": 1.98, - "grad_norm": 26.32767677307129, - "learning_rate": 6.775583842838529e-06, - "loss": 1.0562, + "epoch": 0.83, + "grad_norm": 29.05472755432129, + "learning_rate": 1.4480190771032926e-05, + "loss": 1.5282, "step": 6598 }, { - "epoch": 1.98, - "grad_norm": 14.91978931427002, - "learning_rate": 6.773579232234138e-06, - "loss": 1.8945, + "epoch": 0.83, + "grad_norm": 12.839364051818848, + "learning_rate": 1.4479354055976238e-05, + "loss": 1.5512, "step": 6599 }, { - "epoch": 1.98, - "grad_norm": 43.29436111450195, - "learning_rate": 6.7715746216297485e-06, - "loss": 2.4119, - "step": 6600 - }, - { - "epoch": 1.98, - "eval_loss": 0.17498844861984253, - "eval_runtime": 43.9202, - "eval_samples_per_second": 33.675, - "eval_steps_per_second": 33.675, + "epoch": 0.83, + "grad_norm": 7.66991662979126, + "learning_rate": 1.4478517340919552e-05, + "loss": 1.5824, "step": 6600 }, { - "epoch": 1.98, - "grad_norm": 20.57763671875, - "learning_rate": 6.7695700110253594e-06, - "loss": 1.2593, + "epoch": 0.83, + "grad_norm": 15.398689270019531, + "learning_rate": 1.4477680625862864e-05, + "loss": 1.4943, "step": 6601 }, { - "epoch": 1.98, - "grad_norm": 11.902490615844727, - "learning_rate": 6.767565400420969e-06, - "loss": 0.8341, + "epoch": 0.83, + "grad_norm": 24.575775146484375, + "learning_rate": 1.4476843910806176e-05, + "loss": 1.7269, "step": 6602 }, { - "epoch": 1.99, - "grad_norm": 11.146263122558594, - "learning_rate": 6.765560789816579e-06, - "loss": 1.3651, + "epoch": 0.83, + "grad_norm": 11.358213424682617, + "learning_rate": 1.4476007195749488e-05, + "loss": 1.1742, "step": 6603 }, { - "epoch": 1.99, - "grad_norm": 11.864297866821289, - "learning_rate": 6.763556179212189e-06, - "loss": 0.9559, + "epoch": 0.83, + "grad_norm": 20.26782989501953, + "learning_rate": 1.4475170480692801e-05, + "loss": 2.2997, "step": 6604 }, { - "epoch": 1.99, - "grad_norm": 12.255757331848145, - "learning_rate": 6.761551568607798e-06, - "loss": 1.1177, + "epoch": 0.83, + "grad_norm": 31.358434677124023, + "learning_rate": 1.4474333765636113e-05, + "loss": 0.9536, "step": 6605 }, { - "epoch": 1.99, - "grad_norm": 6.7448506355285645, - "learning_rate": 6.759546958003408e-06, - "loss": 0.6869, + "epoch": 0.83, + "grad_norm": 13.94992733001709, + "learning_rate": 1.4473497050579425e-05, + "loss": 1.7772, "step": 6606 }, { - "epoch": 1.99, - "grad_norm": 15.666464805603027, - "learning_rate": 6.757542347399019e-06, - "loss": 1.7579, + "epoch": 0.83, + "grad_norm": 11.810144424438477, + "learning_rate": 1.4472660335522739e-05, + "loss": 0.7789, "step": 6607 }, { - "epoch": 1.99, - "grad_norm": 16.473482131958008, - "learning_rate": 6.755537736794628e-06, - "loss": 0.9031, + "epoch": 0.83, + "grad_norm": 38.90315246582031, + "learning_rate": 1.4471823620466053e-05, + "loss": 3.4662, "step": 6608 }, { - "epoch": 1.99, - "grad_norm": 12.140952110290527, - "learning_rate": 6.753533126190238e-06, - "loss": 0.7175, + "epoch": 0.83, + "grad_norm": 18.91117286682129, + "learning_rate": 1.4470986905409363e-05, + "loss": 2.3911, "step": 6609 }, { - "epoch": 1.99, - "grad_norm": 9.83596134185791, - "learning_rate": 6.751528515585848e-06, - "loss": 0.8216, + "epoch": 0.83, + "grad_norm": 13.002034187316895, + "learning_rate": 1.4470150190352676e-05, + "loss": 1.4097, "step": 6610 }, { - "epoch": 1.99, - "grad_norm": 13.17597770690918, - "learning_rate": 6.749523904981458e-06, - "loss": 1.694, + "epoch": 0.83, + "grad_norm": 12.609968185424805, + "learning_rate": 1.446931347529599e-05, + "loss": 2.7201, "step": 6611 }, { - "epoch": 1.99, - "grad_norm": 17.648849487304688, - "learning_rate": 6.747519294377067e-06, - "loss": 2.1423, + "epoch": 0.83, + "grad_norm": 8.692763328552246, + "learning_rate": 1.44684767602393e-05, + "loss": 0.9019, "step": 6612 }, { - "epoch": 1.99, - "grad_norm": 13.655058860778809, - "learning_rate": 6.745514683772678e-06, - "loss": 0.9178, + "epoch": 0.83, + "grad_norm": 20.65419578552246, + "learning_rate": 1.4467640045182614e-05, + "loss": 1.9726, "step": 6613 }, { - "epoch": 1.99, - "grad_norm": 18.347841262817383, - "learning_rate": 6.743510073168288e-06, - "loss": 1.0027, + "epoch": 0.83, + "grad_norm": 16.71739959716797, + "learning_rate": 1.4466803330125928e-05, + "loss": 2.2029, "step": 6614 }, { - "epoch": 1.99, - "grad_norm": 26.16404914855957, - "learning_rate": 6.741505462563897e-06, - "loss": 2.206, + "epoch": 0.83, + "grad_norm": 10.462072372436523, + "learning_rate": 1.446596661506924e-05, + "loss": 2.1755, "step": 6615 }, { - "epoch": 1.99, - "grad_norm": 7.279219150543213, - "learning_rate": 6.739500851959507e-06, - "loss": 0.7718, + "epoch": 0.83, + "grad_norm": 12.887832641601562, + "learning_rate": 1.4465129900012552e-05, + "loss": 0.7345, "step": 6616 }, { - "epoch": 1.99, - "grad_norm": 12.55292797088623, - "learning_rate": 6.737496241355117e-06, - "loss": 1.8409, + "epoch": 0.83, + "grad_norm": 9.375746726989746, + "learning_rate": 1.4464293184955864e-05, + "loss": 1.3041, "step": 6617 }, { - "epoch": 1.99, - "grad_norm": 42.25106430053711, - "learning_rate": 6.735491630750727e-06, - "loss": 2.3152, + "epoch": 0.83, + "grad_norm": 14.980518341064453, + "learning_rate": 1.4463456469899177e-05, + "loss": 2.3885, "step": 6618 }, { - "epoch": 1.99, - "grad_norm": 14.544368743896484, - "learning_rate": 6.7334870201463376e-06, - "loss": 1.2898, + "epoch": 0.83, + "grad_norm": 33.584938049316406, + "learning_rate": 1.4462619754842489e-05, + "loss": 1.862, "step": 6619 }, { - "epoch": 1.99, - "grad_norm": 21.00879669189453, - "learning_rate": 6.731482409541948e-06, - "loss": 1.4371, + "epoch": 0.83, + "grad_norm": 1.6788455247879028, + "learning_rate": 1.4461783039785801e-05, + "loss": 0.0572, "step": 6620 }, { - "epoch": 1.99, - "grad_norm": 15.112432479858398, - "learning_rate": 6.729477798937557e-06, - "loss": 2.2847, + "epoch": 0.83, + "grad_norm": 9.320537567138672, + "learning_rate": 1.4460946324729115e-05, + "loss": 0.4605, "step": 6621 }, { - "epoch": 1.99, - "grad_norm": 16.390920639038086, - "learning_rate": 6.727473188333167e-06, - "loss": 1.1366, + "epoch": 0.83, + "grad_norm": 7.229696750640869, + "learning_rate": 1.4460109609672428e-05, + "loss": 0.9963, "step": 6622 }, { - "epoch": 1.99, - "grad_norm": 17.105491638183594, - "learning_rate": 6.725468577728776e-06, - "loss": 1.7725, + "epoch": 0.83, + "grad_norm": 18.225360870361328, + "learning_rate": 1.4459272894615739e-05, + "loss": 1.2763, "step": 6623 }, { - "epoch": 1.99, - "grad_norm": 17.488632202148438, - "learning_rate": 6.723463967124386e-06, - "loss": 0.5385, + "epoch": 0.83, + "grad_norm": 3.779670000076294, + "learning_rate": 1.4458436179559052e-05, + "loss": 0.1906, "step": 6624 }, { - "epoch": 1.99, - "grad_norm": 29.78978157043457, - "learning_rate": 6.721459356519997e-06, - "loss": 2.3833, + "epoch": 0.83, + "grad_norm": 39.11140441894531, + "learning_rate": 1.4457599464502366e-05, + "loss": 1.4888, "step": 6625 }, { - "epoch": 1.99, - "grad_norm": 15.036394119262695, - "learning_rate": 6.7194547459156065e-06, - "loss": 1.0212, + "epoch": 0.83, + "grad_norm": 10.29348373413086, + "learning_rate": 1.4456762749445676e-05, + "loss": 0.6129, "step": 6626 }, { - "epoch": 1.99, - "grad_norm": 17.2657527923584, - "learning_rate": 6.7174501353112165e-06, - "loss": 1.4752, + "epoch": 0.83, + "grad_norm": 24.620576858520508, + "learning_rate": 1.445592603438899e-05, + "loss": 1.8498, "step": 6627 }, { - "epoch": 1.99, - "grad_norm": 27.82944679260254, - "learning_rate": 6.715445524706826e-06, - "loss": 1.741, + "epoch": 0.83, + "grad_norm": 20.528329849243164, + "learning_rate": 1.4455089319332304e-05, + "loss": 0.7714, "step": 6628 }, { - "epoch": 1.99, - "grad_norm": 14.460271835327148, - "learning_rate": 6.713440914102436e-06, - "loss": 1.6737, + "epoch": 0.83, + "grad_norm": 12.630378723144531, + "learning_rate": 1.4454252604275615e-05, + "loss": 1.9049, "step": 6629 }, { - "epoch": 1.99, - "grad_norm": 30.571147918701172, - "learning_rate": 6.711436303498046e-06, - "loss": 2.2318, + "epoch": 0.83, + "grad_norm": 21.8435001373291, + "learning_rate": 1.4453415889218927e-05, + "loss": 3.5011, "step": 6630 }, { - "epoch": 1.99, - "grad_norm": 18.02724838256836, - "learning_rate": 6.709431692893657e-06, - "loss": 1.0354, + "epoch": 0.83, + "grad_norm": 59.98054122924805, + "learning_rate": 1.445257917416224e-05, + "loss": 2.5349, "step": 6631 }, { - "epoch": 1.99, - "grad_norm": 12.847180366516113, - "learning_rate": 6.707427082289266e-06, - "loss": 1.5948, + "epoch": 0.83, + "grad_norm": 7.572198867797852, + "learning_rate": 1.4451742459105553e-05, + "loss": 0.6822, "step": 6632 }, { - "epoch": 1.99, - "grad_norm": 13.379613876342773, - "learning_rate": 6.705422471684876e-06, - "loss": 1.2021, + "epoch": 0.83, + "grad_norm": 14.560245513916016, + "learning_rate": 1.4450905744048865e-05, + "loss": 0.4531, "step": 6633 }, { - "epoch": 1.99, - "grad_norm": 20.53904914855957, - "learning_rate": 6.7034178610804854e-06, - "loss": 1.9106, + "epoch": 0.83, + "grad_norm": 12.604669570922852, + "learning_rate": 1.4450069028992177e-05, + "loss": 1.1481, "step": 6634 }, { - "epoch": 1.99, - "grad_norm": 23.03459930419922, - "learning_rate": 6.7014132504760955e-06, - "loss": 1.785, + "epoch": 0.83, + "grad_norm": 35.92240905761719, + "learning_rate": 1.444923231393549e-05, + "loss": 1.5445, "step": 6635 }, { - "epoch": 2.0, - "grad_norm": 19.824228286743164, - "learning_rate": 6.699408639871705e-06, - "loss": 0.8569, + "epoch": 0.83, + "grad_norm": 15.33647346496582, + "learning_rate": 1.4448395598878804e-05, + "loss": 0.9631, "step": 6636 }, { - "epoch": 2.0, - "grad_norm": 59.54680633544922, - "learning_rate": 6.697404029267316e-06, - "loss": 2.1475, + "epoch": 0.83, + "grad_norm": 7.366020202636719, + "learning_rate": 1.4447558883822115e-05, + "loss": 0.3153, "step": 6637 }, { - "epoch": 2.0, - "grad_norm": 10.229207992553711, - "learning_rate": 6.695399418662926e-06, - "loss": 1.1441, + "epoch": 0.83, + "grad_norm": 9.031330108642578, + "learning_rate": 1.4446722168765428e-05, + "loss": 0.7969, "step": 6638 }, { - "epoch": 2.0, - "grad_norm": 28.467979431152344, - "learning_rate": 6.693394808058535e-06, - "loss": 1.7708, + "epoch": 0.83, + "grad_norm": 12.082524299621582, + "learning_rate": 1.4445885453708742e-05, + "loss": 0.9905, "step": 6639 }, { - "epoch": 2.0, - "grad_norm": 10.531692504882812, - "learning_rate": 6.691390197454145e-06, - "loss": 0.5444, + "epoch": 0.83, + "grad_norm": 5.6854753494262695, + "learning_rate": 1.4445048738652052e-05, + "loss": 0.3756, "step": 6640 }, { - "epoch": 2.0, - "grad_norm": 13.020025253295898, - "learning_rate": 6.689385586849755e-06, - "loss": 0.8387, + "epoch": 0.83, + "grad_norm": 57.63182067871094, + "learning_rate": 1.4444212023595366e-05, + "loss": 2.161, "step": 6641 }, { - "epoch": 2.0, - "grad_norm": 12.514997482299805, - "learning_rate": 6.687380976245364e-06, - "loss": 1.3939, + "epoch": 0.83, + "grad_norm": 1.8692203760147095, + "learning_rate": 1.444337530853868e-05, + "loss": 0.0638, "step": 6642 }, { - "epoch": 2.0, - "grad_norm": 27.652118682861328, - "learning_rate": 6.6853763656409745e-06, - "loss": 3.0772, + "epoch": 0.83, + "grad_norm": 22.31800651550293, + "learning_rate": 1.4442538593481991e-05, + "loss": 0.5192, "step": 6643 }, { - "epoch": 2.0, - "grad_norm": 24.15280532836914, - "learning_rate": 6.6833717550365854e-06, - "loss": 1.5129, + "epoch": 0.83, + "grad_norm": 9.42281436920166, + "learning_rate": 1.4441701878425303e-05, + "loss": 0.7522, "step": 6644 }, { - "epoch": 2.0, - "grad_norm": 13.905855178833008, - "learning_rate": 6.681367144432195e-06, - "loss": 0.9144, + "epoch": 0.83, + "grad_norm": 23.117460250854492, + "learning_rate": 1.4440865163368615e-05, + "loss": 0.7487, "step": 6645 }, { - "epoch": 2.0, - "grad_norm": 16.91253662109375, - "learning_rate": 6.679362533827805e-06, - "loss": 1.5355, + "epoch": 0.83, + "grad_norm": 26.78668212890625, + "learning_rate": 1.4440028448311929e-05, + "loss": 3.0379, "step": 6646 }, { - "epoch": 2.0, - "grad_norm": 11.888856887817383, - "learning_rate": 6.677357923223414e-06, - "loss": 1.1719, + "epoch": 0.83, + "grad_norm": 25.992734909057617, + "learning_rate": 1.443919173325524e-05, + "loss": 2.8284, "step": 6647 }, { - "epoch": 2.0, - "grad_norm": 14.441067695617676, - "learning_rate": 6.675353312619024e-06, - "loss": 1.3663, + "epoch": 0.83, + "grad_norm": 11.912419319152832, + "learning_rate": 1.4438355018198553e-05, + "loss": 2.9013, "step": 6648 }, { - "epoch": 2.0, - "grad_norm": 13.287384033203125, - "learning_rate": 6.673348702014633e-06, - "loss": 0.5206, + "epoch": 0.83, + "grad_norm": 11.855568885803223, + "learning_rate": 1.4437518303141866e-05, + "loss": 2.0173, "step": 6649 }, { - "epoch": 2.0, - "grad_norm": 12.121244430541992, - "learning_rate": 6.671344091410244e-06, - "loss": 1.0182, + "epoch": 0.83, + "grad_norm": 24.56494903564453, + "learning_rate": 1.443668158808518e-05, + "loss": 3.4956, "step": 6650 }, { - "epoch": 2.0, - "grad_norm": 44.00876998901367, - "learning_rate": 6.669339480805854e-06, - "loss": 0.9455, + "epoch": 0.83, + "grad_norm": 38.72340774536133, + "learning_rate": 1.443584487302849e-05, + "loss": 3.564, "step": 6651 }, { - "epoch": 2.0, - "grad_norm": 15.983967781066895, - "learning_rate": 6.6673348702014636e-06, - "loss": 0.8615, + "epoch": 0.83, + "grad_norm": 21.563867568969727, + "learning_rate": 1.4435008157971804e-05, + "loss": 2.1634, "step": 6652 }, { - "epoch": 2.0, - "grad_norm": 13.022303581237793, - "learning_rate": 6.665330259597074e-06, - "loss": 1.3803, + "epoch": 0.83, + "grad_norm": 40.92628860473633, + "learning_rate": 1.4434171442915118e-05, + "loss": 1.0617, "step": 6653 }, { - "epoch": 2.0, - "grad_norm": 57.43462371826172, - "learning_rate": 6.663325648992684e-06, - "loss": 1.9661, + "epoch": 0.84, + "grad_norm": 18.901601791381836, + "learning_rate": 1.4433334727858428e-05, + "loss": 1.821, "step": 6654 }, { - "epoch": 2.0, - "grad_norm": 13.801424026489258, - "learning_rate": 6.661321038388293e-06, - "loss": 1.6653, + "epoch": 0.84, + "grad_norm": 25.665586471557617, + "learning_rate": 1.4432498012801742e-05, + "loss": 2.5711, "step": 6655 }, { - "epoch": 2.0, - "grad_norm": 14.12394905090332, - "learning_rate": 6.659316427783904e-06, - "loss": 1.0337, + "epoch": 0.84, + "grad_norm": 5.861196994781494, + "learning_rate": 1.4431661297745053e-05, + "loss": 0.3677, "step": 6656 }, { - "epoch": 2.0, - "grad_norm": 16.065610885620117, - "learning_rate": 6.657311817179514e-06, - "loss": 1.6578, + "epoch": 0.84, + "grad_norm": 7.068995952606201, + "learning_rate": 1.4430824582688367e-05, + "loss": 0.4365, "step": 6657 }, { - "epoch": 2.0, - "grad_norm": 23.195810317993164, - "learning_rate": 6.655307206575123e-06, - "loss": 2.0227, + "epoch": 0.84, + "grad_norm": 29.462949752807617, + "learning_rate": 1.4429987867631679e-05, + "loss": 2.3702, "step": 6658 }, { - "epoch": 2.0, - "grad_norm": 39.60324478149414, - "learning_rate": 6.653302595970733e-06, - "loss": 2.9616, + "epoch": 0.84, + "grad_norm": 23.18093490600586, + "learning_rate": 1.4429151152574991e-05, + "loss": 1.9585, "step": 6659 }, { - "epoch": 2.0, - "grad_norm": 12.803409576416016, - "learning_rate": 6.6512979853663425e-06, - "loss": 1.3529, + "epoch": 0.84, + "grad_norm": 9.60701847076416, + "learning_rate": 1.4428314437518305e-05, + "loss": 1.2616, "step": 6660 }, { - "epoch": 2.0, - "grad_norm": 27.853031158447266, - "learning_rate": 6.649293374761953e-06, - "loss": 1.2641, + "epoch": 0.84, + "grad_norm": 23.689254760742188, + "learning_rate": 1.4427477722461617e-05, + "loss": 2.2917, "step": 6661 }, { - "epoch": 2.0, - "grad_norm": 12.550612449645996, - "learning_rate": 6.6472887641575636e-06, - "loss": 1.032, + "epoch": 0.84, + "grad_norm": 11.679755210876465, + "learning_rate": 1.4426641007404929e-05, + "loss": 0.5695, "step": 6662 }, { - "epoch": 2.0, - "grad_norm": 14.057964324951172, - "learning_rate": 6.645284153553173e-06, - "loss": 1.4831, + "epoch": 0.84, + "grad_norm": 12.400811195373535, + "learning_rate": 1.4425804292348242e-05, + "loss": 1.4716, "step": 6663 }, { - "epoch": 2.0, - "grad_norm": 13.735244750976562, - "learning_rate": 6.643279542948783e-06, - "loss": 1.842, + "epoch": 0.84, + "grad_norm": 9.619452476501465, + "learning_rate": 1.4424967577291556e-05, + "loss": 1.1475, "step": 6664 }, { - "epoch": 2.0, - "grad_norm": 10.447699546813965, - "learning_rate": 6.641274932344393e-06, - "loss": 1.1719, + "epoch": 0.84, + "grad_norm": 18.736326217651367, + "learning_rate": 1.4424130862234866e-05, + "loss": 2.2559, "step": 6665 }, { - "epoch": 2.0, - "grad_norm": 24.760276794433594, - "learning_rate": 6.639270321740002e-06, - "loss": 1.0642, + "epoch": 0.84, + "grad_norm": 21.631404876708984, + "learning_rate": 1.442329414717818e-05, + "loss": 1.1375, "step": 6666 }, { - "epoch": 2.0, - "grad_norm": 17.945220947265625, - "learning_rate": 6.637265711135612e-06, - "loss": 2.1282, + "epoch": 0.84, + "grad_norm": 45.52559280395508, + "learning_rate": 1.4422457432121493e-05, + "loss": 2.2882, "step": 6667 }, { - "epoch": 2.0, - "grad_norm": 40.95531463623047, - "learning_rate": 6.635261100531223e-06, - "loss": 1.8811, + "epoch": 0.84, + "grad_norm": 25.720808029174805, + "learning_rate": 1.4421620717064804e-05, + "loss": 2.2039, "step": 6668 }, { - "epoch": 2.01, - "grad_norm": 13.15553092956543, - "learning_rate": 6.6332564899268325e-06, - "loss": 1.134, + "epoch": 0.84, + "grad_norm": 10.99753475189209, + "learning_rate": 1.4420784002008117e-05, + "loss": 0.6437, "step": 6669 }, { - "epoch": 2.01, - "grad_norm": 20.673992156982422, - "learning_rate": 6.6312518793224425e-06, - "loss": 1.2125, + "epoch": 0.84, + "grad_norm": 14.956695556640625, + "learning_rate": 1.441994728695143e-05, + "loss": 1.6667, "step": 6670 }, { - "epoch": 2.01, - "grad_norm": 17.39179801940918, - "learning_rate": 6.629247268718052e-06, - "loss": 1.7538, + "epoch": 0.84, + "grad_norm": 17.53224754333496, + "learning_rate": 1.4419110571894743e-05, + "loss": 2.2026, "step": 6671 }, { - "epoch": 2.01, - "grad_norm": 8.030019760131836, - "learning_rate": 6.627242658113662e-06, - "loss": 1.5265, + "epoch": 0.84, + "grad_norm": 40.021331787109375, + "learning_rate": 1.4418273856838055e-05, + "loss": 1.2136, "step": 6672 }, { - "epoch": 2.01, - "grad_norm": 23.085996627807617, - "learning_rate": 6.625238047509271e-06, - "loss": 1.5642, + "epoch": 0.84, + "grad_norm": 26.441740036010742, + "learning_rate": 1.4417437141781367e-05, + "loss": 1.3784, "step": 6673 }, { - "epoch": 2.01, - "grad_norm": 36.12226104736328, - "learning_rate": 6.623233436904882e-06, - "loss": 2.836, + "epoch": 0.84, + "grad_norm": 5.3640336990356445, + "learning_rate": 1.441660042672468e-05, + "loss": 1.3586, "step": 6674 }, { - "epoch": 2.01, - "grad_norm": 16.673858642578125, - "learning_rate": 6.621228826300492e-06, - "loss": 1.0128, + "epoch": 0.84, + "grad_norm": 30.53775978088379, + "learning_rate": 1.441576371166799e-05, + "loss": 3.0845, "step": 6675 }, { - "epoch": 2.01, - "grad_norm": 18.04882049560547, - "learning_rate": 6.619224215696101e-06, - "loss": 1.2667, + "epoch": 0.84, + "grad_norm": 35.42987823486328, + "learning_rate": 1.4414926996611304e-05, + "loss": 3.7773, "step": 6676 }, { - "epoch": 2.01, - "grad_norm": 14.236947059631348, - "learning_rate": 6.6172196050917114e-06, - "loss": 1.0438, + "epoch": 0.84, + "grad_norm": 10.641247749328613, + "learning_rate": 1.4414090281554618e-05, + "loss": 1.1733, "step": 6677 }, { - "epoch": 2.01, - "grad_norm": 64.36849212646484, - "learning_rate": 6.6152149944873215e-06, - "loss": 2.7021, + "epoch": 0.84, + "grad_norm": 17.126296997070312, + "learning_rate": 1.4413253566497932e-05, + "loss": 2.457, "step": 6678 }, { - "epoch": 2.01, - "grad_norm": 23.676084518432617, - "learning_rate": 6.613210383882931e-06, - "loss": 1.0589, + "epoch": 0.84, + "grad_norm": 16.539026260375977, + "learning_rate": 1.4412416851441242e-05, + "loss": 1.8305, "step": 6679 }, { - "epoch": 2.01, - "grad_norm": 19.20448875427246, - "learning_rate": 6.611205773278542e-06, - "loss": 1.8695, + "epoch": 0.84, + "grad_norm": 21.915935516357422, + "learning_rate": 1.4411580136384556e-05, + "loss": 2.6571, "step": 6680 }, { - "epoch": 2.01, - "grad_norm": 10.511244773864746, - "learning_rate": 6.609201162674152e-06, - "loss": 1.1752, + "epoch": 0.84, + "grad_norm": 17.56529426574707, + "learning_rate": 1.441074342132787e-05, + "loss": 1.7428, "step": 6681 }, { - "epoch": 2.01, - "grad_norm": 40.56865692138672, - "learning_rate": 6.607196552069761e-06, - "loss": 1.6004, + "epoch": 0.84, + "grad_norm": 35.25141525268555, + "learning_rate": 1.440990670627118e-05, + "loss": 0.7839, "step": 6682 }, { - "epoch": 2.01, - "grad_norm": 17.472505569458008, - "learning_rate": 6.605191941465371e-06, - "loss": 1.5864, + "epoch": 0.84, + "grad_norm": 8.508606910705566, + "learning_rate": 1.4409069991214493e-05, + "loss": 0.6937, "step": 6683 }, { - "epoch": 2.01, - "grad_norm": 10.834254264831543, - "learning_rate": 6.60318733086098e-06, - "loss": 1.2419, + "epoch": 0.84, + "grad_norm": 6.896846294403076, + "learning_rate": 1.4408233276157805e-05, + "loss": 0.688, "step": 6684 }, { - "epoch": 2.01, - "grad_norm": 16.969730377197266, - "learning_rate": 6.60118272025659e-06, - "loss": 1.2284, + "epoch": 0.84, + "grad_norm": 10.6935396194458, + "learning_rate": 1.4407396561101119e-05, + "loss": 1.5449, "step": 6685 }, { - "epoch": 2.01, - "grad_norm": 7.831116199493408, - "learning_rate": 6.5991781096522e-06, - "loss": 1.4888, + "epoch": 0.84, + "grad_norm": 12.529866218566895, + "learning_rate": 1.440655984604443e-05, + "loss": 1.0339, "step": 6686 }, { - "epoch": 2.01, - "grad_norm": 24.744110107421875, - "learning_rate": 6.597173499047811e-06, - "loss": 1.6571, + "epoch": 0.84, + "grad_norm": 11.404264450073242, + "learning_rate": 1.4405723130987743e-05, + "loss": 1.3663, "step": 6687 }, { - "epoch": 2.01, - "grad_norm": 21.574138641357422, - "learning_rate": 6.595168888443421e-06, - "loss": 2.0631, + "epoch": 0.84, + "grad_norm": 29.173799514770508, + "learning_rate": 1.4404886415931056e-05, + "loss": 3.1138, "step": 6688 }, { - "epoch": 2.01, - "grad_norm": 11.040058135986328, - "learning_rate": 6.593164277839031e-06, - "loss": 1.1278, + "epoch": 0.84, + "grad_norm": 12.170273780822754, + "learning_rate": 1.4404049700874367e-05, + "loss": 3.3146, "step": 6689 }, { - "epoch": 2.01, - "grad_norm": 46.7620735168457, - "learning_rate": 6.59115966723464e-06, - "loss": 1.4132, + "epoch": 0.84, + "grad_norm": 15.326367378234863, + "learning_rate": 1.440321298581768e-05, + "loss": 1.28, "step": 6690 }, { - "epoch": 2.01, - "grad_norm": 10.559418678283691, - "learning_rate": 6.58915505663025e-06, - "loss": 0.9851, + "epoch": 0.84, + "grad_norm": 23.169282913208008, + "learning_rate": 1.4402376270760994e-05, + "loss": 2.1622, "step": 6691 }, { - "epoch": 2.01, - "grad_norm": 8.289081573486328, - "learning_rate": 6.587150446025859e-06, - "loss": 1.1853, + "epoch": 0.84, + "grad_norm": 7.151150703430176, + "learning_rate": 1.4401539555704308e-05, + "loss": 1.7352, "step": 6692 }, { - "epoch": 2.01, - "grad_norm": 8.112937927246094, - "learning_rate": 6.58514583542147e-06, - "loss": 1.0523, + "epoch": 0.84, + "grad_norm": 12.488361358642578, + "learning_rate": 1.4400702840647618e-05, + "loss": 1.8024, "step": 6693 }, { - "epoch": 2.01, - "grad_norm": 19.774045944213867, - "learning_rate": 6.58314122481708e-06, - "loss": 1.6196, + "epoch": 0.84, + "grad_norm": 16.26409149169922, + "learning_rate": 1.4399866125590931e-05, + "loss": 1.0197, "step": 6694 }, { - "epoch": 2.01, - "grad_norm": 52.585914611816406, - "learning_rate": 6.5811366142126896e-06, - "loss": 1.0791, + "epoch": 0.84, + "grad_norm": 10.992413520812988, + "learning_rate": 1.4399029410534245e-05, + "loss": 1.0422, "step": 6695 }, { - "epoch": 2.01, - "grad_norm": 10.37691879272461, - "learning_rate": 6.5791320036083e-06, - "loss": 0.525, + "epoch": 0.84, + "grad_norm": 24.888111114501953, + "learning_rate": 1.4398192695477555e-05, + "loss": 1.2818, "step": 6696 }, { - "epoch": 2.01, - "grad_norm": 13.222021102905273, - "learning_rate": 6.577127393003909e-06, - "loss": 1.0604, + "epoch": 0.84, + "grad_norm": 63.796077728271484, + "learning_rate": 1.4397355980420869e-05, + "loss": 2.3386, "step": 6697 }, { - "epoch": 2.01, - "grad_norm": 12.805461883544922, - "learning_rate": 6.575122782399519e-06, - "loss": 1.4706, + "epoch": 0.84, + "grad_norm": 10.447388648986816, + "learning_rate": 1.4396519265364181e-05, + "loss": 1.4342, "step": 6698 }, { - "epoch": 2.01, - "grad_norm": 43.80728530883789, - "learning_rate": 6.57311817179513e-06, - "loss": 2.084, + "epoch": 0.84, + "grad_norm": 19.698040008544922, + "learning_rate": 1.4395682550307495e-05, + "loss": 1.805, "step": 6699 }, { - "epoch": 2.01, - "grad_norm": 15.121885299682617, - "learning_rate": 6.571113561190739e-06, - "loss": 0.8853, + "epoch": 0.84, + "grad_norm": 9.697070121765137, + "learning_rate": 1.4394845835250807e-05, + "loss": 1.255, "step": 6700 }, { - "epoch": 2.01, - "grad_norm": 32.659427642822266, - "learning_rate": 6.569108950586349e-06, - "loss": 1.8597, + "epoch": 0.84, + "grad_norm": 19.996601104736328, + "learning_rate": 1.4394009120194119e-05, + "loss": 1.9659, "step": 6701 }, { - "epoch": 2.02, - "grad_norm": 26.797901153564453, - "learning_rate": 6.567104339981959e-06, - "loss": 2.3284, + "epoch": 0.84, + "grad_norm": 9.914801597595215, + "learning_rate": 1.4393172405137432e-05, + "loss": 1.7741, "step": 6702 }, { - "epoch": 2.02, - "grad_norm": 3.8142130374908447, - "learning_rate": 6.5650997293775685e-06, - "loss": 0.2054, + "epoch": 0.84, + "grad_norm": 5.036116123199463, + "learning_rate": 1.4392335690080742e-05, + "loss": 1.1576, "step": 6703 }, { - "epoch": 2.02, - "grad_norm": 15.68226432800293, - "learning_rate": 6.563095118773179e-06, - "loss": 2.487, + "epoch": 0.84, + "grad_norm": 118.02140045166016, + "learning_rate": 1.4391498975024056e-05, + "loss": 0.8999, "step": 6704 }, { - "epoch": 2.02, - "grad_norm": 42.038726806640625, - "learning_rate": 6.5610905081687896e-06, - "loss": 2.1623, + "epoch": 0.84, + "grad_norm": 9.128668785095215, + "learning_rate": 1.439066225996737e-05, + "loss": 1.5079, "step": 6705 }, { - "epoch": 2.02, - "grad_norm": 19.80816650390625, - "learning_rate": 6.559085897564399e-06, - "loss": 1.0708, + "epoch": 0.84, + "grad_norm": 9.483262062072754, + "learning_rate": 1.4389825544910683e-05, + "loss": 0.8413, "step": 6706 }, { - "epoch": 2.02, - "grad_norm": 9.321516990661621, - "learning_rate": 6.557081286960009e-06, - "loss": 1.605, + "epoch": 0.84, + "grad_norm": 32.772186279296875, + "learning_rate": 1.4388988829853994e-05, + "loss": 2.5021, "step": 6707 }, { - "epoch": 2.02, - "grad_norm": 159.25819396972656, - "learning_rate": 6.555076676355618e-06, - "loss": 2.8398, + "epoch": 0.84, + "grad_norm": 9.163297653198242, + "learning_rate": 1.4388152114797307e-05, + "loss": 0.6177, "step": 6708 }, { - "epoch": 2.02, - "grad_norm": 17.203962326049805, - "learning_rate": 6.553072065751228e-06, - "loss": 1.4595, + "epoch": 0.84, + "grad_norm": 19.01010513305664, + "learning_rate": 1.438731539974062e-05, + "loss": 1.5838, "step": 6709 }, { - "epoch": 2.02, - "grad_norm": 17.149423599243164, - "learning_rate": 6.5510674551468374e-06, - "loss": 1.7006, + "epoch": 0.84, + "grad_norm": 219.34219360351562, + "learning_rate": 1.4386478684683931e-05, + "loss": 1.209, "step": 6710 }, { - "epoch": 2.02, - "grad_norm": 12.44357967376709, - "learning_rate": 6.549062844542448e-06, - "loss": 0.8718, + "epoch": 0.84, + "grad_norm": 8.837409019470215, + "learning_rate": 1.4385641969627245e-05, + "loss": 0.5343, "step": 6711 }, { - "epoch": 2.02, - "grad_norm": 9.552597999572754, - "learning_rate": 6.5470582339380585e-06, - "loss": 1.4398, + "epoch": 0.84, + "grad_norm": 10.831384658813477, + "learning_rate": 1.4384805254570557e-05, + "loss": 1.4185, "step": 6712 }, { - "epoch": 2.02, - "grad_norm": 12.471324920654297, - "learning_rate": 6.545053623333668e-06, - "loss": 2.0504, + "epoch": 0.84, + "grad_norm": 6.488797664642334, + "learning_rate": 1.438396853951387e-05, + "loss": 0.1876, "step": 6713 }, { - "epoch": 2.02, - "grad_norm": 13.623446464538574, - "learning_rate": 6.543049012729278e-06, - "loss": 1.7889, + "epoch": 0.84, + "grad_norm": 18.163673400878906, + "learning_rate": 1.438313182445718e-05, + "loss": 2.2669, "step": 6714 }, { - "epoch": 2.02, - "grad_norm": 8.370439529418945, - "learning_rate": 6.541044402124888e-06, - "loss": 1.1749, + "epoch": 0.84, + "grad_norm": 10.061616897583008, + "learning_rate": 1.4382295109400494e-05, + "loss": 1.0209, "step": 6715 }, { - "epoch": 2.02, - "grad_norm": 16.073829650878906, - "learning_rate": 6.539039791520497e-06, - "loss": 1.3742, + "epoch": 0.84, + "grad_norm": 10.187827110290527, + "learning_rate": 1.4381458394343808e-05, + "loss": 0.3215, "step": 6716 }, { - "epoch": 2.02, - "grad_norm": 9.64084529876709, - "learning_rate": 6.537035180916108e-06, - "loss": 1.142, + "epoch": 0.84, + "grad_norm": 30.014314651489258, + "learning_rate": 1.4380621679287118e-05, + "loss": 1.3229, "step": 6717 }, { - "epoch": 2.02, - "grad_norm": 25.97657585144043, - "learning_rate": 6.535030570311718e-06, - "loss": 2.3333, + "epoch": 0.84, + "grad_norm": 11.296913146972656, + "learning_rate": 1.4379784964230432e-05, + "loss": 1.4596, "step": 6718 }, { - "epoch": 2.02, - "grad_norm": 118.77561950683594, - "learning_rate": 6.533025959707327e-06, - "loss": 1.5912, + "epoch": 0.84, + "grad_norm": 7.084781646728516, + "learning_rate": 1.4378948249173746e-05, + "loss": 0.3353, "step": 6719 }, { - "epoch": 2.02, - "grad_norm": 14.903034210205078, - "learning_rate": 6.5310213491029374e-06, - "loss": 1.897, - "step": 6720 - }, - { - "epoch": 2.02, - "eval_loss": 0.16902218759059906, - "eval_runtime": 43.4322, - "eval_samples_per_second": 34.053, - "eval_steps_per_second": 34.053, + "epoch": 0.84, + "grad_norm": 29.677143096923828, + "learning_rate": 1.437811153411706e-05, + "loss": 0.8754, "step": 6720 }, { - "epoch": 2.02, - "grad_norm": 39.25941848754883, - "learning_rate": 6.529016738498547e-06, - "loss": 1.8256, + "epoch": 0.84, + "grad_norm": 44.01093673706055, + "learning_rate": 1.437727481906037e-05, + "loss": 1.5755, "step": 6721 }, { - "epoch": 2.02, - "grad_norm": 9.439811706542969, - "learning_rate": 6.527012127894157e-06, - "loss": 0.9625, + "epoch": 0.84, + "grad_norm": 7.352614402770996, + "learning_rate": 1.4376438104003683e-05, + "loss": 0.72, "step": 6722 }, { - "epoch": 2.02, - "grad_norm": 12.62756061553955, - "learning_rate": 6.525007517289767e-06, - "loss": 0.8624, + "epoch": 0.84, + "grad_norm": 11.98291301727295, + "learning_rate": 1.4375601388946995e-05, + "loss": 0.597, "step": 6723 }, { - "epoch": 2.02, - "grad_norm": 13.67402458190918, - "learning_rate": 6.523002906685377e-06, - "loss": 0.785, + "epoch": 0.84, + "grad_norm": 13.343174934387207, + "learning_rate": 1.4374764673890307e-05, + "loss": 1.7283, "step": 6724 }, { - "epoch": 2.02, - "grad_norm": 29.54031753540039, - "learning_rate": 6.520998296080987e-06, - "loss": 2.0936, + "epoch": 0.84, + "grad_norm": 12.499581336975098, + "learning_rate": 1.437392795883362e-05, + "loss": 0.9371, "step": 6725 }, { - "epoch": 2.02, - "grad_norm": 13.225194931030273, - "learning_rate": 6.518993685476597e-06, - "loss": 1.2107, + "epoch": 0.84, + "grad_norm": 19.109291076660156, + "learning_rate": 1.4373091243776933e-05, + "loss": 2.2072, "step": 6726 }, { - "epoch": 2.02, - "grad_norm": 8.916555404663086, - "learning_rate": 6.516989074872206e-06, - "loss": 1.0452, + "epoch": 0.84, + "grad_norm": 8.464346885681152, + "learning_rate": 1.4372254528720246e-05, + "loss": 1.1365, "step": 6727 }, { - "epoch": 2.02, - "grad_norm": 24.967422485351562, - "learning_rate": 6.514984464267816e-06, - "loss": 2.1105, + "epoch": 0.84, + "grad_norm": 52.96107482910156, + "learning_rate": 1.4371417813663557e-05, + "loss": 3.4795, "step": 6728 }, { - "epoch": 2.02, - "grad_norm": 14.154083251953125, - "learning_rate": 6.512979853663426e-06, - "loss": 0.7428, + "epoch": 0.84, + "grad_norm": 32.23268508911133, + "learning_rate": 1.437058109860687e-05, + "loss": 2.4121, "step": 6729 }, { - "epoch": 2.02, - "grad_norm": 17.682830810546875, - "learning_rate": 6.510975243059037e-06, - "loss": 1.3675, + "epoch": 0.84, + "grad_norm": 11.593282699584961, + "learning_rate": 1.4369744383550184e-05, + "loss": 0.9362, "step": 6730 }, { - "epoch": 2.02, - "grad_norm": 12.129364013671875, - "learning_rate": 6.508970632454647e-06, - "loss": 1.7799, + "epoch": 0.84, + "grad_norm": 10.491436958312988, + "learning_rate": 1.4368907668493494e-05, + "loss": 0.9679, "step": 6731 }, { - "epoch": 2.02, - "grad_norm": 37.760650634765625, - "learning_rate": 6.506966021850256e-06, - "loss": 1.1513, + "epoch": 0.84, + "grad_norm": 14.975268363952637, + "learning_rate": 1.4368070953436808e-05, + "loss": 1.1917, "step": 6732 }, { - "epoch": 2.02, - "grad_norm": 87.66400909423828, - "learning_rate": 6.504961411245866e-06, - "loss": 1.9592, + "epoch": 0.84, + "grad_norm": 9.948664665222168, + "learning_rate": 1.4367234238380121e-05, + "loss": 1.3456, "step": 6733 }, { - "epoch": 2.02, - "grad_norm": 12.75828742980957, - "learning_rate": 6.502956800641475e-06, - "loss": 1.1079, + "epoch": 0.85, + "grad_norm": 12.157992362976074, + "learning_rate": 1.4366397523323435e-05, + "loss": 1.9081, "step": 6734 }, { - "epoch": 2.02, - "grad_norm": 12.436376571655273, - "learning_rate": 6.500952190037085e-06, - "loss": 0.8629, + "epoch": 0.85, + "grad_norm": 20.210771560668945, + "learning_rate": 1.4365560808266745e-05, + "loss": 2.5774, "step": 6735 }, { - "epoch": 2.03, - "grad_norm": 8.984572410583496, - "learning_rate": 6.498947579432696e-06, - "loss": 0.9553, + "epoch": 0.85, + "grad_norm": 5.872859477996826, + "learning_rate": 1.4364724093210059e-05, + "loss": 1.6095, "step": 6736 }, { - "epoch": 2.03, - "grad_norm": 22.55635643005371, - "learning_rate": 6.4969429688283055e-06, - "loss": 1.6062, + "epoch": 0.85, + "grad_norm": 10.525941848754883, + "learning_rate": 1.4363887378153371e-05, + "loss": 1.6281, "step": 6737 }, { - "epoch": 2.03, - "grad_norm": 13.196118354797363, - "learning_rate": 6.4949383582239156e-06, - "loss": 0.5885, + "epoch": 0.85, + "grad_norm": 36.14300537109375, + "learning_rate": 1.4363050663096683e-05, + "loss": 3.0778, "step": 6738 }, { - "epoch": 2.03, - "grad_norm": 46.98003387451172, - "learning_rate": 6.492933747619526e-06, - "loss": 1.3093, + "epoch": 0.85, + "grad_norm": 26.05812644958496, + "learning_rate": 1.4362213948039997e-05, + "loss": 2.2379, "step": 6739 }, { - "epoch": 2.03, - "grad_norm": 23.900150299072266, - "learning_rate": 6.490929137015135e-06, - "loss": 1.4142, + "epoch": 0.85, + "grad_norm": 18.90000343322754, + "learning_rate": 1.4361377232983309e-05, + "loss": 1.5087, "step": 6740 }, { - "epoch": 2.03, - "grad_norm": 17.741270065307617, - "learning_rate": 6.488924526410745e-06, - "loss": 1.4268, + "epoch": 0.85, + "grad_norm": 30.030092239379883, + "learning_rate": 1.4360540517926622e-05, + "loss": 2.7298, "step": 6741 }, { - "epoch": 2.03, - "grad_norm": 26.587739944458008, - "learning_rate": 6.486919915806356e-06, - "loss": 2.1686, + "epoch": 0.85, + "grad_norm": 19.3814697265625, + "learning_rate": 1.4359703802869932e-05, + "loss": 1.3061, "step": 6742 }, { - "epoch": 2.03, - "grad_norm": 35.67372131347656, - "learning_rate": 6.484915305201965e-06, - "loss": 1.6213, + "epoch": 0.85, + "grad_norm": 82.06686401367188, + "learning_rate": 1.4358867087813246e-05, + "loss": 0.9033, "step": 6743 }, { - "epoch": 2.03, - "grad_norm": 26.185956954956055, - "learning_rate": 6.482910694597575e-06, - "loss": 1.1309, + "epoch": 0.85, + "grad_norm": 18.357946395874023, + "learning_rate": 1.435803037275656e-05, + "loss": 2.0855, "step": 6744 }, { - "epoch": 2.03, - "grad_norm": 17.040918350219727, - "learning_rate": 6.4809060839931845e-06, - "loss": 1.0778, + "epoch": 0.85, + "grad_norm": 13.921502113342285, + "learning_rate": 1.435719365769987e-05, + "loss": 1.5051, "step": 6745 }, { - "epoch": 2.03, - "grad_norm": 23.627857208251953, - "learning_rate": 6.4789014733887946e-06, - "loss": 1.1968, + "epoch": 0.85, + "grad_norm": 6.822644233703613, + "learning_rate": 1.4356356942643184e-05, + "loss": 1.0913, "step": 6746 }, { - "epoch": 2.03, - "grad_norm": 29.324899673461914, - "learning_rate": 6.476896862784405e-06, - "loss": 1.9221, + "epoch": 0.85, + "grad_norm": 13.126599311828613, + "learning_rate": 1.4355520227586497e-05, + "loss": 2.861, "step": 6747 }, { - "epoch": 2.03, - "grad_norm": 24.806861877441406, - "learning_rate": 6.474892252180015e-06, - "loss": 1.0605, + "epoch": 0.85, + "grad_norm": 11.187861442565918, + "learning_rate": 1.435468351252981e-05, + "loss": 1.2133, "step": 6748 }, { - "epoch": 2.03, - "grad_norm": 32.63029098510742, - "learning_rate": 6.472887641575625e-06, - "loss": 1.4815, + "epoch": 0.85, + "grad_norm": 12.776350021362305, + "learning_rate": 1.4353846797473121e-05, + "loss": 2.4266, "step": 6749 }, { - "epoch": 2.03, - "grad_norm": 17.42801284790039, - "learning_rate": 6.470883030971235e-06, - "loss": 1.6286, + "epoch": 0.85, + "grad_norm": 135.60122680664062, + "learning_rate": 1.4353010082416435e-05, + "loss": 4.2787, "step": 6750 }, { - "epoch": 2.03, - "grad_norm": 40.37263488769531, - "learning_rate": 6.468878420366844e-06, - "loss": 1.6338, + "epoch": 0.85, + "grad_norm": 6.1601433753967285, + "learning_rate": 1.4352173367359747e-05, + "loss": 1.3039, "step": 6751 }, { - "epoch": 2.03, - "grad_norm": 17.74640464782715, - "learning_rate": 6.466873809762454e-06, - "loss": 1.2096, + "epoch": 0.85, + "grad_norm": 10.253626823425293, + "learning_rate": 1.4351336652303059e-05, + "loss": 0.742, "step": 6752 }, { - "epoch": 2.03, - "grad_norm": 26.730159759521484, - "learning_rate": 6.4648691991580634e-06, - "loss": 2.1374, + "epoch": 0.85, + "grad_norm": 16.65516471862793, + "learning_rate": 1.4350499937246372e-05, + "loss": 1.8368, "step": 6753 }, { - "epoch": 2.03, - "grad_norm": 13.699724197387695, - "learning_rate": 6.462864588553674e-06, - "loss": 1.7276, + "epoch": 0.85, + "grad_norm": 7.756155490875244, + "learning_rate": 1.4349663222189684e-05, + "loss": 0.527, "step": 6754 }, { - "epoch": 2.03, - "grad_norm": 26.75263023376465, - "learning_rate": 6.4608599779492845e-06, - "loss": 1.0525, + "epoch": 0.85, + "grad_norm": 13.48439884185791, + "learning_rate": 1.4348826507132998e-05, + "loss": 1.2254, "step": 6755 }, { - "epoch": 2.03, - "grad_norm": 24.192272186279297, - "learning_rate": 6.458855367344894e-06, - "loss": 1.4798, + "epoch": 0.85, + "grad_norm": 30.818511962890625, + "learning_rate": 1.4347989792076308e-05, + "loss": 2.1113, "step": 6756 }, { - "epoch": 2.03, - "grad_norm": 62.38142013549805, - "learning_rate": 6.456850756740504e-06, - "loss": 1.8068, + "epoch": 0.85, + "grad_norm": 7.61986780166626, + "learning_rate": 1.4347153077019622e-05, + "loss": 1.1408, "step": 6757 }, { - "epoch": 2.03, - "grad_norm": 29.921567916870117, - "learning_rate": 6.454846146136113e-06, - "loss": 1.619, + "epoch": 0.85, + "grad_norm": 6.420275688171387, + "learning_rate": 1.4346316361962936e-05, + "loss": 1.0442, "step": 6758 }, { - "epoch": 2.03, - "grad_norm": 6.582921981811523, - "learning_rate": 6.452841535531723e-06, - "loss": 0.648, + "epoch": 0.85, + "grad_norm": 15.52956485748291, + "learning_rate": 1.4345479646906246e-05, + "loss": 0.8404, "step": 6759 }, { - "epoch": 2.03, - "grad_norm": 13.459547996520996, - "learning_rate": 6.450836924927334e-06, - "loss": 1.4701, + "epoch": 0.85, + "grad_norm": 17.56768798828125, + "learning_rate": 1.434464293184956e-05, + "loss": 1.8449, "step": 6760 }, { - "epoch": 2.03, - "grad_norm": 19.61090660095215, - "learning_rate": 6.448832314322943e-06, - "loss": 1.2505, + "epoch": 0.85, + "grad_norm": 13.578958511352539, + "learning_rate": 1.4343806216792873e-05, + "loss": 1.668, "step": 6761 }, { - "epoch": 2.03, - "grad_norm": 32.77788543701172, - "learning_rate": 6.446827703718553e-06, - "loss": 1.9251, + "epoch": 0.85, + "grad_norm": 21.581754684448242, + "learning_rate": 1.4342969501736185e-05, + "loss": 1.9299, "step": 6762 }, { - "epoch": 2.03, - "grad_norm": 7.951191425323486, - "learning_rate": 6.4448230931141634e-06, - "loss": 0.7957, + "epoch": 0.85, + "grad_norm": 14.992815971374512, + "learning_rate": 1.4342132786679497e-05, + "loss": 2.406, "step": 6763 }, { - "epoch": 2.03, - "grad_norm": 13.6834135055542, - "learning_rate": 6.442818482509773e-06, - "loss": 1.488, + "epoch": 0.85, + "grad_norm": 24.969329833984375, + "learning_rate": 1.434129607162281e-05, + "loss": 0.9601, "step": 6764 }, { - "epoch": 2.03, - "grad_norm": 26.919858932495117, - "learning_rate": 6.440813871905383e-06, - "loss": 1.6017, + "epoch": 0.85, + "grad_norm": 28.413381576538086, + "learning_rate": 1.4340459356566123e-05, + "loss": 0.5697, "step": 6765 }, { - "epoch": 2.03, - "grad_norm": 18.014230728149414, - "learning_rate": 6.438809261300992e-06, - "loss": 0.8233, + "epoch": 0.85, + "grad_norm": 10.309455871582031, + "learning_rate": 1.4339622641509435e-05, + "loss": 0.6994, "step": 6766 }, { - "epoch": 2.03, - "grad_norm": 18.699932098388672, - "learning_rate": 6.436804650696603e-06, - "loss": 1.1873, + "epoch": 0.85, + "grad_norm": 6.308617115020752, + "learning_rate": 1.4338785926452747e-05, + "loss": 2.0733, "step": 6767 }, { - "epoch": 2.03, - "grad_norm": 24.57878303527832, - "learning_rate": 6.434800040092213e-06, - "loss": 1.7365, + "epoch": 0.85, + "grad_norm": 36.197715759277344, + "learning_rate": 1.433794921139606e-05, + "loss": 0.8404, "step": 6768 }, { - "epoch": 2.04, - "grad_norm": 30.225631713867188, - "learning_rate": 6.432795429487822e-06, - "loss": 2.7789, + "epoch": 0.85, + "grad_norm": 14.402820587158203, + "learning_rate": 1.4337112496339374e-05, + "loss": 1.3064, "step": 6769 }, { - "epoch": 2.04, - "grad_norm": 22.14498519897461, - "learning_rate": 6.430790818883432e-06, - "loss": 1.6425, + "epoch": 0.85, + "grad_norm": 3.842750310897827, + "learning_rate": 1.4336275781282684e-05, + "loss": 0.3571, "step": 6770 }, { - "epoch": 2.04, - "grad_norm": 12.216958045959473, - "learning_rate": 6.428786208279042e-06, - "loss": 1.7486, + "epoch": 0.85, + "grad_norm": 12.000794410705566, + "learning_rate": 1.4335439066225998e-05, + "loss": 1.7991, "step": 6771 }, { - "epoch": 2.04, - "grad_norm": 26.501882553100586, - "learning_rate": 6.426781597674652e-06, - "loss": 1.3613, + "epoch": 0.85, + "grad_norm": 15.14625358581543, + "learning_rate": 1.4334602351169311e-05, + "loss": 0.6939, "step": 6772 }, { - "epoch": 2.04, - "grad_norm": 30.44036293029785, - "learning_rate": 6.424776987070263e-06, - "loss": 1.2315, + "epoch": 0.85, + "grad_norm": 9.255779266357422, + "learning_rate": 1.4333765636112622e-05, + "loss": 0.6928, "step": 6773 }, { - "epoch": 2.04, - "grad_norm": 19.72119140625, - "learning_rate": 6.422772376465873e-06, - "loss": 1.8577, + "epoch": 0.85, + "grad_norm": 6.127439022064209, + "learning_rate": 1.4332928921055935e-05, + "loss": 0.4097, "step": 6774 }, { - "epoch": 2.04, - "grad_norm": 11.062511444091797, - "learning_rate": 6.420767765861482e-06, - "loss": 1.4272, + "epoch": 0.85, + "grad_norm": 17.041950225830078, + "learning_rate": 1.4332092205999249e-05, + "loss": 1.0672, "step": 6775 }, { - "epoch": 2.04, - "grad_norm": 16.308460235595703, - "learning_rate": 6.418763155257092e-06, - "loss": 1.3599, + "epoch": 0.85, + "grad_norm": 8.400264739990234, + "learning_rate": 1.4331255490942561e-05, + "loss": 1.4374, "step": 6776 }, { - "epoch": 2.04, - "grad_norm": 35.35619354248047, - "learning_rate": 6.416758544652701e-06, - "loss": 1.477, + "epoch": 0.85, + "grad_norm": 50.31614303588867, + "learning_rate": 1.4330418775885873e-05, + "loss": 3.6345, "step": 6777 }, { - "epoch": 2.04, - "grad_norm": 13.431120872497559, - "learning_rate": 6.414753934048311e-06, - "loss": 0.9098, + "epoch": 0.85, + "grad_norm": 43.917266845703125, + "learning_rate": 1.4329582060829187e-05, + "loss": 2.2039, "step": 6778 }, { - "epoch": 2.04, - "grad_norm": 22.69354820251465, - "learning_rate": 6.412749323443922e-06, - "loss": 1.3123, + "epoch": 0.85, + "grad_norm": 17.45854949951172, + "learning_rate": 1.4328745345772498e-05, + "loss": 1.9558, "step": 6779 }, { - "epoch": 2.04, - "grad_norm": 41.717098236083984, - "learning_rate": 6.4107447128395315e-06, - "loss": 2.688, + "epoch": 0.85, + "grad_norm": 18.78993034362793, + "learning_rate": 1.432790863071581e-05, + "loss": 1.9115, "step": 6780 }, { - "epoch": 2.04, - "grad_norm": 9.498014450073242, - "learning_rate": 6.4087401022351416e-06, - "loss": 0.8678, + "epoch": 0.85, + "grad_norm": 9.703263282775879, + "learning_rate": 1.4327071915659122e-05, + "loss": 0.6795, "step": 6781 }, { - "epoch": 2.04, - "grad_norm": 10.49351692199707, - "learning_rate": 6.406735491630751e-06, - "loss": 1.8695, + "epoch": 0.85, + "grad_norm": 8.62363052368164, + "learning_rate": 1.4326235200602436e-05, + "loss": 1.3623, "step": 6782 }, { - "epoch": 2.04, - "grad_norm": 31.190580368041992, - "learning_rate": 6.404730881026361e-06, - "loss": 1.78, + "epoch": 0.85, + "grad_norm": 103.20475769042969, + "learning_rate": 1.432539848554575e-05, + "loss": 1.1805, "step": 6783 }, { - "epoch": 2.04, - "grad_norm": 15.785634994506836, - "learning_rate": 6.402726270421971e-06, - "loss": 1.0891, + "epoch": 0.85, + "grad_norm": 23.955724716186523, + "learning_rate": 1.432456177048906e-05, + "loss": 1.0631, "step": 6784 }, { - "epoch": 2.04, - "grad_norm": 14.310815811157227, - "learning_rate": 6.400721659817581e-06, - "loss": 0.9196, + "epoch": 0.85, + "grad_norm": 37.74839782714844, + "learning_rate": 1.4323725055432374e-05, + "loss": 1.071, "step": 6785 }, { - "epoch": 2.04, - "grad_norm": 38.41286087036133, - "learning_rate": 6.398717049213191e-06, - "loss": 2.4826, + "epoch": 0.85, + "grad_norm": 46.22028732299805, + "learning_rate": 1.4322888340375687e-05, + "loss": 2.6372, "step": 6786 }, { - "epoch": 2.04, - "grad_norm": 10.610367774963379, - "learning_rate": 6.396712438608801e-06, - "loss": 1.0001, + "epoch": 0.85, + "grad_norm": 45.86208724975586, + "learning_rate": 1.4322051625318998e-05, + "loss": 1.0468, "step": 6787 }, { - "epoch": 2.04, - "grad_norm": 9.231528282165527, - "learning_rate": 6.3947078280044105e-06, - "loss": 1.0759, + "epoch": 0.85, + "grad_norm": 18.5753116607666, + "learning_rate": 1.4321214910262311e-05, + "loss": 2.4317, "step": 6788 }, { - "epoch": 2.04, - "grad_norm": 7.4241862297058105, - "learning_rate": 6.3927032174000206e-06, - "loss": 1.9806, + "epoch": 0.85, + "grad_norm": 6.768474578857422, + "learning_rate": 1.4320378195205625e-05, + "loss": 0.7415, "step": 6789 }, { - "epoch": 2.04, - "grad_norm": 14.655599594116211, - "learning_rate": 6.39069860679563e-06, - "loss": 1.3411, + "epoch": 0.85, + "grad_norm": 14.827455520629883, + "learning_rate": 1.4319541480148937e-05, + "loss": 1.5113, "step": 6790 }, { - "epoch": 2.04, - "grad_norm": 10.586833000183105, - "learning_rate": 6.388693996191241e-06, - "loss": 0.9932, + "epoch": 0.85, + "grad_norm": 7.561862945556641, + "learning_rate": 1.4318704765092249e-05, + "loss": 2.9066, "step": 6791 }, { - "epoch": 2.04, - "grad_norm": 19.10291290283203, - "learning_rate": 6.386689385586851e-06, - "loss": 1.5976, + "epoch": 0.85, + "grad_norm": 46.11296844482422, + "learning_rate": 1.4317868050035562e-05, + "loss": 2.2379, "step": 6792 }, { - "epoch": 2.04, - "grad_norm": 15.874801635742188, - "learning_rate": 6.38468477498246e-06, - "loss": 0.897, + "epoch": 0.85, + "grad_norm": 9.779031753540039, + "learning_rate": 1.4317031334978874e-05, + "loss": 1.3426, "step": 6793 }, { - "epoch": 2.04, - "grad_norm": 13.638111114501953, - "learning_rate": 6.38268016437807e-06, - "loss": 0.7186, + "epoch": 0.85, + "grad_norm": 28.658103942871094, + "learning_rate": 1.4316194619922186e-05, + "loss": 2.5177, "step": 6794 }, { - "epoch": 2.04, - "grad_norm": 9.389366149902344, - "learning_rate": 6.380675553773679e-06, - "loss": 0.9544, + "epoch": 0.85, + "grad_norm": 66.79032897949219, + "learning_rate": 1.4315357904865498e-05, + "loss": 1.5142, "step": 6795 }, { - "epoch": 2.04, - "grad_norm": 26.145200729370117, - "learning_rate": 6.3786709431692894e-06, - "loss": 1.3817, + "epoch": 0.85, + "grad_norm": 8.954390525817871, + "learning_rate": 1.4314521189808812e-05, + "loss": 0.8566, "step": 6796 }, { - "epoch": 2.04, - "grad_norm": 12.559453010559082, - "learning_rate": 6.3766663325649e-06, - "loss": 1.5977, + "epoch": 0.85, + "grad_norm": 10.869063377380371, + "learning_rate": 1.4313684474752126e-05, + "loss": 0.6127, "step": 6797 }, { - "epoch": 2.04, - "grad_norm": 9.843866348266602, - "learning_rate": 6.37466172196051e-06, - "loss": 0.9535, + "epoch": 0.85, + "grad_norm": 25.037322998046875, + "learning_rate": 1.4312847759695436e-05, + "loss": 2.1371, "step": 6798 }, { - "epoch": 2.04, - "grad_norm": 15.275118827819824, - "learning_rate": 6.37265711135612e-06, - "loss": 1.3151, + "epoch": 0.85, + "grad_norm": 4.422179698944092, + "learning_rate": 1.431201104463875e-05, + "loss": 0.2207, "step": 6799 }, { - "epoch": 2.04, - "grad_norm": 18.97870635986328, - "learning_rate": 6.37065250075173e-06, - "loss": 1.6633, + "epoch": 0.85, + "grad_norm": 16.942899703979492, + "learning_rate": 1.4311174329582063e-05, + "loss": 2.3062, "step": 6800 }, { - "epoch": 2.04, - "grad_norm": 11.981043815612793, - "learning_rate": 6.368647890147339e-06, - "loss": 1.0551, + "epoch": 0.85, + "eval_loss": 0.09935930371284485, + "eval_runtime": 111.9226, + "eval_samples_per_second": 31.647, + "eval_steps_per_second": 31.647, + "step": 6800 + }, + { + "epoch": 0.85, + "grad_norm": 17.58791732788086, + "learning_rate": 1.4310337614525373e-05, + "loss": 2.8723, "step": 6801 }, { - "epoch": 2.05, - "grad_norm": 25.228471755981445, - "learning_rate": 6.366643279542949e-06, - "loss": 1.4122, + "epoch": 0.85, + "grad_norm": 10.952619552612305, + "learning_rate": 1.4309500899468687e-05, + "loss": 0.6372, "step": 6802 }, { - "epoch": 2.05, - "grad_norm": 46.05463790893555, - "learning_rate": 6.36463866893856e-06, - "loss": 2.192, + "epoch": 0.85, + "grad_norm": 30.083515167236328, + "learning_rate": 1.4308664184412e-05, + "loss": 2.126, "step": 6803 }, { - "epoch": 2.05, - "grad_norm": 28.93854522705078, - "learning_rate": 6.362634058334169e-06, - "loss": 1.235, + "epoch": 0.85, + "grad_norm": 29.942514419555664, + "learning_rate": 1.4307827469355313e-05, + "loss": 0.8289, "step": 6804 }, { - "epoch": 2.05, - "grad_norm": 22.576128005981445, - "learning_rate": 6.360629447729779e-06, - "loss": 1.61, + "epoch": 0.85, + "grad_norm": 26.02439308166504, + "learning_rate": 1.4306990754298625e-05, + "loss": 1.4562, "step": 6805 }, { - "epoch": 2.05, - "grad_norm": 8.201756477355957, - "learning_rate": 6.358624837125389e-06, - "loss": 1.3716, + "epoch": 0.85, + "grad_norm": 13.239129066467285, + "learning_rate": 1.4306154039241938e-05, + "loss": 2.6482, "step": 6806 }, { - "epoch": 2.05, - "grad_norm": 9.097603797912598, - "learning_rate": 6.356620226520999e-06, - "loss": 0.69, + "epoch": 0.85, + "grad_norm": 14.090408325195312, + "learning_rate": 1.430531732418525e-05, + "loss": 0.621, "step": 6807 }, { - "epoch": 2.05, - "grad_norm": 26.398914337158203, - "learning_rate": 6.354615615916609e-06, - "loss": 2.6075, + "epoch": 0.85, + "grad_norm": 4.832688331604004, + "learning_rate": 1.4304480609128562e-05, + "loss": 0.4012, "step": 6808 }, { - "epoch": 2.05, - "grad_norm": 15.047880172729492, - "learning_rate": 6.352611005312218e-06, - "loss": 1.2919, + "epoch": 0.85, + "grad_norm": 21.516300201416016, + "learning_rate": 1.4303643894071874e-05, + "loss": 4.5126, "step": 6809 }, { - "epoch": 2.05, - "grad_norm": 42.7448616027832, - "learning_rate": 6.350606394707829e-06, - "loss": 1.1902, + "epoch": 0.85, + "grad_norm": 6.897380828857422, + "learning_rate": 1.4302807179015188e-05, + "loss": 0.1666, "step": 6810 }, { - "epoch": 2.05, - "grad_norm": 29.7187557220459, - "learning_rate": 6.348601784103439e-06, - "loss": 1.9144, + "epoch": 0.85, + "grad_norm": 14.608335494995117, + "learning_rate": 1.4301970463958501e-05, + "loss": 1.9012, "step": 6811 }, { - "epoch": 2.05, - "grad_norm": 13.130889892578125, - "learning_rate": 6.346597173499048e-06, - "loss": 0.9165, + "epoch": 0.85, + "grad_norm": 9.011201858520508, + "learning_rate": 1.4301133748901812e-05, + "loss": 1.2681, "step": 6812 }, { - "epoch": 2.05, - "grad_norm": 14.863086700439453, - "learning_rate": 6.344592562894658e-06, - "loss": 2.3026, + "epoch": 0.86, + "grad_norm": 10.01158618927002, + "learning_rate": 1.4300297033845125e-05, + "loss": 0.1179, "step": 6813 }, { - "epoch": 2.05, - "grad_norm": 31.419349670410156, - "learning_rate": 6.342587952290268e-06, - "loss": 1.929, + "epoch": 0.86, + "grad_norm": 11.291976928710938, + "learning_rate": 1.4299460318788439e-05, + "loss": 1.6185, "step": 6814 }, { - "epoch": 2.05, - "grad_norm": 16.03536605834961, - "learning_rate": 6.340583341685878e-06, - "loss": 1.0579, + "epoch": 0.86, + "grad_norm": 19.130647659301758, + "learning_rate": 1.429862360373175e-05, + "loss": 1.3798, "step": 6815 }, { - "epoch": 2.05, - "grad_norm": 9.93125057220459, - "learning_rate": 6.338578731081489e-06, - "loss": 1.0248, + "epoch": 0.86, + "grad_norm": 8.56436538696289, + "learning_rate": 1.4297786888675063e-05, + "loss": 0.6916, "step": 6816 }, { - "epoch": 2.05, - "grad_norm": 20.598365783691406, - "learning_rate": 6.336574120477098e-06, - "loss": 1.0575, + "epoch": 0.86, + "grad_norm": 14.949737548828125, + "learning_rate": 1.4296950173618376e-05, + "loss": 2.0076, "step": 6817 }, { - "epoch": 2.05, - "grad_norm": 24.11957359313965, - "learning_rate": 6.334569509872708e-06, - "loss": 2.0264, + "epoch": 0.86, + "grad_norm": 37.16676330566406, + "learning_rate": 1.4296113458561688e-05, + "loss": 2.9857, "step": 6818 }, { - "epoch": 2.05, - "grad_norm": 12.065218925476074, - "learning_rate": 6.332564899268317e-06, - "loss": 0.6793, + "epoch": 0.86, + "grad_norm": 68.93708038330078, + "learning_rate": 1.4295276743505e-05, + "loss": 1.1956, "step": 6819 }, { - "epoch": 2.05, - "grad_norm": 43.54728698730469, - "learning_rate": 6.330560288663927e-06, - "loss": 2.746, + "epoch": 0.86, + "grad_norm": 67.95276641845703, + "learning_rate": 1.4294440028448312e-05, + "loss": 2.1929, "step": 6820 }, { - "epoch": 2.05, - "grad_norm": 12.573387145996094, - "learning_rate": 6.328555678059537e-06, - "loss": 1.0912, + "epoch": 0.86, + "grad_norm": 47.011775970458984, + "learning_rate": 1.4293603313391626e-05, + "loss": 1.3765, "step": 6821 }, { - "epoch": 2.05, - "grad_norm": 53.12962341308594, - "learning_rate": 6.326551067455147e-06, - "loss": 2.5651, + "epoch": 0.86, + "grad_norm": 6.818073749542236, + "learning_rate": 1.4292766598334938e-05, + "loss": 0.2283, "step": 6822 }, { - "epoch": 2.05, - "grad_norm": 27.892974853515625, - "learning_rate": 6.3245464568507575e-06, - "loss": 2.1102, + "epoch": 0.86, + "grad_norm": 4.801296710968018, + "learning_rate": 1.429192988327825e-05, + "loss": 0.4125, "step": 6823 }, { - "epoch": 2.05, - "grad_norm": 11.939574241638184, - "learning_rate": 6.3225418462463676e-06, - "loss": 1.0358, + "epoch": 0.86, + "grad_norm": 17.18333625793457, + "learning_rate": 1.4291093168221564e-05, + "loss": 2.2435, "step": 6824 }, { - "epoch": 2.05, - "grad_norm": 15.528552055358887, - "learning_rate": 6.320537235641977e-06, - "loss": 1.43, + "epoch": 0.86, + "grad_norm": 11.65798568725586, + "learning_rate": 1.4290256453164877e-05, + "loss": 0.8295, "step": 6825 }, { - "epoch": 2.05, - "grad_norm": 19.028608322143555, - "learning_rate": 6.318532625037587e-06, - "loss": 1.3059, + "epoch": 0.86, + "grad_norm": 31.394832611083984, + "learning_rate": 1.4289419738108187e-05, + "loss": 2.3899, "step": 6826 }, { - "epoch": 2.05, - "grad_norm": 29.945940017700195, - "learning_rate": 6.316528014433196e-06, - "loss": 1.4528, + "epoch": 0.86, + "grad_norm": 6.690638065338135, + "learning_rate": 1.4288583023051501e-05, + "loss": 0.5558, "step": 6827 }, { - "epoch": 2.05, - "grad_norm": 8.454259872436523, - "learning_rate": 6.314523403828807e-06, - "loss": 0.9366, + "epoch": 0.86, + "grad_norm": 27.214235305786133, + "learning_rate": 1.4287746307994815e-05, + "loss": 3.1397, "step": 6828 }, { - "epoch": 2.05, - "grad_norm": 19.378828048706055, - "learning_rate": 6.312518793224417e-06, - "loss": 0.8679, + "epoch": 0.86, + "grad_norm": 9.921982765197754, + "learning_rate": 1.4286909592938125e-05, + "loss": 0.5252, "step": 6829 }, { - "epoch": 2.05, - "grad_norm": 20.16804313659668, - "learning_rate": 6.310514182620026e-06, - "loss": 1.6893, + "epoch": 0.86, + "grad_norm": 30.677799224853516, + "learning_rate": 1.4286072877881439e-05, + "loss": 1.8776, "step": 6830 }, { - "epoch": 2.05, - "grad_norm": 9.753872871398926, - "learning_rate": 6.3085095720156365e-06, - "loss": 1.1253, + "epoch": 0.86, + "grad_norm": 10.252274513244629, + "learning_rate": 1.4285236162824752e-05, + "loss": 0.8843, "step": 6831 }, { - "epoch": 2.05, - "grad_norm": 14.549400329589844, - "learning_rate": 6.306504961411246e-06, - "loss": 1.3079, + "epoch": 0.86, + "grad_norm": 8.96366024017334, + "learning_rate": 1.4284399447768064e-05, + "loss": 1.1557, "step": 6832 }, { - "epoch": 2.05, - "grad_norm": 15.794536590576172, - "learning_rate": 6.304500350806856e-06, - "loss": 1.394, + "epoch": 0.86, + "grad_norm": 14.817914962768555, + "learning_rate": 1.4283562732711376e-05, + "loss": 1.5293, "step": 6833 }, { - "epoch": 2.05, - "grad_norm": 32.083927154541016, - "learning_rate": 6.302495740202467e-06, - "loss": 1.716, + "epoch": 0.86, + "grad_norm": 112.65931701660156, + "learning_rate": 1.4282726017654688e-05, + "loss": 1.856, "step": 6834 }, { - "epoch": 2.06, - "grad_norm": 12.402176856994629, - "learning_rate": 6.300491129598077e-06, - "loss": 1.0654, + "epoch": 0.86, + "grad_norm": 3.1464476585388184, + "learning_rate": 1.4281889302598002e-05, + "loss": 0.2797, "step": 6835 }, { - "epoch": 2.06, - "grad_norm": 16.35983657836914, - "learning_rate": 6.298486518993686e-06, - "loss": 1.1802, + "epoch": 0.86, + "grad_norm": 5.00979471206665, + "learning_rate": 1.4281052587541314e-05, + "loss": 0.4282, "step": 6836 }, { - "epoch": 2.06, - "grad_norm": 27.28036117553711, - "learning_rate": 6.296481908389296e-06, - "loss": 1.8658, + "epoch": 0.86, + "grad_norm": 12.247176170349121, + "learning_rate": 1.4280215872484626e-05, + "loss": 0.8076, "step": 6837 }, { - "epoch": 2.06, - "grad_norm": 10.746267318725586, - "learning_rate": 6.294477297784905e-06, - "loss": 1.2586, + "epoch": 0.86, + "grad_norm": 10.779017448425293, + "learning_rate": 1.427937915742794e-05, + "loss": 1.221, "step": 6838 }, { - "epoch": 2.06, - "grad_norm": 27.72111701965332, - "learning_rate": 6.2924726871805155e-06, - "loss": 2.0948, + "epoch": 0.86, + "grad_norm": 9.00314998626709, + "learning_rate": 1.4278542442371253e-05, + "loss": 0.4914, "step": 6839 }, { - "epoch": 2.06, - "grad_norm": 13.532320976257324, - "learning_rate": 6.290468076576126e-06, - "loss": 1.2246, - "step": 6840 - }, - { - "epoch": 2.06, - "eval_loss": 0.18061862885951996, - "eval_runtime": 43.8871, - "eval_samples_per_second": 33.7, - "eval_steps_per_second": 33.7, + "epoch": 0.86, + "grad_norm": 4.5170817375183105, + "learning_rate": 1.4277705727314563e-05, + "loss": 0.5171, "step": 6840 }, { - "epoch": 2.06, - "grad_norm": 23.459665298461914, - "learning_rate": 6.288463465971736e-06, - "loss": 1.4494, + "epoch": 0.86, + "grad_norm": 21.79302215576172, + "learning_rate": 1.4276869012257877e-05, + "loss": 2.1364, "step": 6841 }, { - "epoch": 2.06, - "grad_norm": 13.611770629882812, - "learning_rate": 6.286458855367346e-06, - "loss": 1.1819, + "epoch": 0.86, + "grad_norm": 7.483713150024414, + "learning_rate": 1.427603229720119e-05, + "loss": 1.3484, "step": 6842 }, { - "epoch": 2.06, - "grad_norm": 25.924118041992188, - "learning_rate": 6.284454244762955e-06, - "loss": 1.0744, + "epoch": 0.86, + "grad_norm": 23.12000846862793, + "learning_rate": 1.4275195582144501e-05, + "loss": 1.1458, "step": 6843 }, { - "epoch": 2.06, - "grad_norm": 19.138996124267578, - "learning_rate": 6.282449634158565e-06, - "loss": 1.2561, + "epoch": 0.86, + "grad_norm": 11.738572120666504, + "learning_rate": 1.4274358867087814e-05, + "loss": 2.4691, "step": 6844 }, { - "epoch": 2.06, - "grad_norm": 46.55351638793945, - "learning_rate": 6.280445023554175e-06, - "loss": 1.4002, + "epoch": 0.86, + "grad_norm": 10.765612602233887, + "learning_rate": 1.4273522152031128e-05, + "loss": 1.4455, "step": 6845 }, { - "epoch": 2.06, - "grad_norm": 16.169979095458984, - "learning_rate": 6.278440412949785e-06, - "loss": 1.2913, + "epoch": 0.86, + "grad_norm": 12.555647850036621, + "learning_rate": 1.427268543697444e-05, + "loss": 1.2412, "step": 6846 }, { - "epoch": 2.06, - "grad_norm": 37.923133850097656, - "learning_rate": 6.276435802345395e-06, - "loss": 1.5886, + "epoch": 0.86, + "grad_norm": 13.866482734680176, + "learning_rate": 1.4271848721917752e-05, + "loss": 2.5328, "step": 6847 }, { - "epoch": 2.06, - "grad_norm": 57.48485565185547, - "learning_rate": 6.274431191741005e-06, - "loss": 2.0952, + "epoch": 0.86, + "grad_norm": 26.960792541503906, + "learning_rate": 1.4271012006861064e-05, + "loss": 2.052, "step": 6848 }, { - "epoch": 2.06, - "grad_norm": 13.789143562316895, - "learning_rate": 6.272426581136615e-06, - "loss": 1.712, + "epoch": 0.86, + "grad_norm": 23.56977081298828, + "learning_rate": 1.4270175291804378e-05, + "loss": 1.7812, "step": 6849 }, { - "epoch": 2.06, - "grad_norm": 21.859037399291992, - "learning_rate": 6.270421970532225e-06, - "loss": 2.0063, + "epoch": 0.86, + "grad_norm": 18.891651153564453, + "learning_rate": 1.426933857674769e-05, + "loss": 2.1884, "step": 6850 }, { - "epoch": 2.06, - "grad_norm": 13.258837699890137, - "learning_rate": 6.268417359927834e-06, - "loss": 1.2955, + "epoch": 0.86, + "grad_norm": 5.035281181335449, + "learning_rate": 1.4268501861691002e-05, + "loss": 1.1694, "step": 6851 }, { - "epoch": 2.06, - "grad_norm": 8.517374038696289, - "learning_rate": 6.266412749323444e-06, - "loss": 1.0369, + "epoch": 0.86, + "grad_norm": 15.985159873962402, + "learning_rate": 1.4267665146634315e-05, + "loss": 1.0514, "step": 6852 }, { - "epoch": 2.06, - "grad_norm": 10.359166145324707, - "learning_rate": 6.264408138719055e-06, - "loss": 1.0215, + "epoch": 0.86, + "grad_norm": 26.73896598815918, + "learning_rate": 1.4266828431577629e-05, + "loss": 1.9858, "step": 6853 }, { - "epoch": 2.06, - "grad_norm": 28.84781837463379, - "learning_rate": 6.262403528114664e-06, - "loss": 1.9747, + "epoch": 0.86, + "grad_norm": 42.217872619628906, + "learning_rate": 1.4265991716520939e-05, + "loss": 3.1336, "step": 6854 }, { - "epoch": 2.06, - "grad_norm": 11.506865501403809, - "learning_rate": 6.260398917510274e-06, - "loss": 1.1076, + "epoch": 0.86, + "grad_norm": 6.821484565734863, + "learning_rate": 1.4265155001464253e-05, + "loss": 0.6554, "step": 6855 }, { - "epoch": 2.06, - "grad_norm": 15.466108322143555, - "learning_rate": 6.2583943069058835e-06, - "loss": 1.2779, + "epoch": 0.86, + "grad_norm": 45.5909538269043, + "learning_rate": 1.4264318286407566e-05, + "loss": 1.9567, "step": 6856 }, { - "epoch": 2.06, - "grad_norm": 31.154294967651367, - "learning_rate": 6.256389696301494e-06, - "loss": 1.9285, + "epoch": 0.86, + "grad_norm": 21.700092315673828, + "learning_rate": 1.4263481571350877e-05, + "loss": 1.7945, "step": 6857 }, { - "epoch": 2.06, - "grad_norm": 15.335009574890137, - "learning_rate": 6.254385085697104e-06, - "loss": 1.6327, + "epoch": 0.86, + "grad_norm": 7.3325676918029785, + "learning_rate": 1.426264485629419e-05, + "loss": 0.6593, "step": 6858 }, { - "epoch": 2.06, - "grad_norm": 14.675117492675781, - "learning_rate": 6.252380475092715e-06, - "loss": 1.644, + "epoch": 0.86, + "grad_norm": 11.090742111206055, + "learning_rate": 1.4261808141237502e-05, + "loss": 1.1509, "step": 6859 }, { - "epoch": 2.06, - "grad_norm": 133.6639862060547, - "learning_rate": 6.250375864488324e-06, - "loss": 1.1432, + "epoch": 0.86, + "grad_norm": 35.66590118408203, + "learning_rate": 1.4260971426180816e-05, + "loss": 2.1461, "step": 6860 }, { - "epoch": 2.06, - "grad_norm": 39.901222229003906, - "learning_rate": 6.248371253883934e-06, - "loss": 2.097, + "epoch": 0.86, + "grad_norm": 18.339378356933594, + "learning_rate": 1.4260134711124128e-05, + "loss": 0.7108, "step": 6861 }, { - "epoch": 2.06, - "grad_norm": 13.691509246826172, - "learning_rate": 6.246366643279543e-06, - "loss": 1.6336, + "epoch": 0.86, + "grad_norm": 16.616304397583008, + "learning_rate": 1.425929799606744e-05, + "loss": 0.9501, "step": 6862 }, { - "epoch": 2.06, - "grad_norm": 14.485090255737305, - "learning_rate": 6.244362032675153e-06, - "loss": 1.3259, + "epoch": 0.86, + "grad_norm": 18.746173858642578, + "learning_rate": 1.4258461281010753e-05, + "loss": 2.2851, "step": 6863 }, { - "epoch": 2.06, - "grad_norm": 72.5770492553711, - "learning_rate": 6.2423574220707625e-06, - "loss": 1.6907, + "epoch": 0.86, + "grad_norm": 12.186095237731934, + "learning_rate": 1.4257624565954065e-05, + "loss": 2.7731, "step": 6864 }, { - "epoch": 2.06, - "grad_norm": 10.634366989135742, - "learning_rate": 6.240352811466373e-06, - "loss": 0.5027, + "epoch": 0.86, + "grad_norm": 9.447904586791992, + "learning_rate": 1.4256787850897377e-05, + "loss": 1.1061, "step": 6865 }, { - "epoch": 2.06, - "grad_norm": 33.018367767333984, - "learning_rate": 6.2383482008619835e-06, - "loss": 1.7409, + "epoch": 0.86, + "grad_norm": 26.08285903930664, + "learning_rate": 1.4255951135840691e-05, + "loss": 1.4773, "step": 6866 }, { - "epoch": 2.06, - "grad_norm": 9.623235702514648, - "learning_rate": 6.236343590257593e-06, - "loss": 1.715, + "epoch": 0.86, + "grad_norm": 8.051258087158203, + "learning_rate": 1.4255114420784005e-05, + "loss": 0.8141, "step": 6867 }, { - "epoch": 2.06, - "grad_norm": 47.73927307128906, - "learning_rate": 6.234338979653203e-06, - "loss": 1.4383, + "epoch": 0.86, + "grad_norm": 16.438636779785156, + "learning_rate": 1.4254277705727315e-05, + "loss": 1.0134, "step": 6868 }, { - "epoch": 2.07, - "grad_norm": 9.162986755371094, - "learning_rate": 6.232334369048813e-06, - "loss": 1.1415, + "epoch": 0.86, + "grad_norm": 4.74124002456665, + "learning_rate": 1.4253440990670629e-05, + "loss": 0.3712, "step": 6869 }, { - "epoch": 2.07, - "grad_norm": 41.81623077392578, - "learning_rate": 6.230329758444422e-06, - "loss": 1.6832, + "epoch": 0.86, + "grad_norm": 18.44993782043457, + "learning_rate": 1.4252604275613942e-05, + "loss": 0.8501, "step": 6870 }, { - "epoch": 2.07, - "grad_norm": 21.047195434570312, - "learning_rate": 6.228325147840033e-06, - "loss": 1.202, + "epoch": 0.86, + "grad_norm": 12.221519470214844, + "learning_rate": 1.4251767560557253e-05, + "loss": 0.8011, "step": 6871 }, { - "epoch": 2.07, - "grad_norm": 19.954036712646484, - "learning_rate": 6.226320537235643e-06, - "loss": 1.4934, + "epoch": 0.86, + "grad_norm": 23.90431785583496, + "learning_rate": 1.4250930845500566e-05, + "loss": 1.0791, "step": 6872 }, { - "epoch": 2.07, - "grad_norm": 24.60007095336914, - "learning_rate": 6.224315926631252e-06, - "loss": 0.9536, + "epoch": 0.86, + "grad_norm": 13.108061790466309, + "learning_rate": 1.4250094130443878e-05, + "loss": 3.2827, "step": 6873 }, { - "epoch": 2.07, - "grad_norm": 23.833621978759766, - "learning_rate": 6.2223113160268625e-06, - "loss": 2.561, + "epoch": 0.86, + "grad_norm": 31.300273895263672, + "learning_rate": 1.4249257415387192e-05, + "loss": 2.3229, "step": 6874 }, { - "epoch": 2.07, - "grad_norm": 51.6407585144043, - "learning_rate": 6.220306705422472e-06, - "loss": 1.8729, + "epoch": 0.86, + "grad_norm": 107.48026275634766, + "learning_rate": 1.4248420700330504e-05, + "loss": 2.0077, "step": 6875 }, { - "epoch": 2.07, - "grad_norm": 9.72765827178955, - "learning_rate": 6.218302094818082e-06, - "loss": 1.6889, + "epoch": 0.86, + "grad_norm": 13.970173835754395, + "learning_rate": 1.4247583985273816e-05, + "loss": 1.0402, "step": 6876 }, { - "epoch": 2.07, - "grad_norm": 86.259765625, - "learning_rate": 6.216297484213693e-06, - "loss": 1.6229, + "epoch": 0.86, + "grad_norm": 3.0541088581085205, + "learning_rate": 1.424674727021713e-05, + "loss": 0.4837, "step": 6877 }, { - "epoch": 2.07, - "grad_norm": 12.195944786071777, - "learning_rate": 6.214292873609302e-06, - "loss": 1.7568, + "epoch": 0.86, + "grad_norm": 7.691309928894043, + "learning_rate": 1.424591055516044e-05, + "loss": 0.3931, "step": 6878 }, { - "epoch": 2.07, - "grad_norm": 41.19993591308594, - "learning_rate": 6.212288263004912e-06, - "loss": 1.2259, + "epoch": 0.86, + "grad_norm": 15.071650505065918, + "learning_rate": 1.4245073840103753e-05, + "loss": 3.4964, "step": 6879 }, { - "epoch": 2.07, - "grad_norm": 10.510581970214844, - "learning_rate": 6.210283652400521e-06, - "loss": 1.1598, + "epoch": 0.86, + "grad_norm": 7.250248908996582, + "learning_rate": 1.4244237125047067e-05, + "loss": 0.6171, "step": 6880 }, { - "epoch": 2.07, - "grad_norm": 14.523313522338867, - "learning_rate": 6.208279041796131e-06, - "loss": 1.3958, + "epoch": 0.86, + "grad_norm": 60.21806716918945, + "learning_rate": 1.424340040999038e-05, + "loss": 0.9972, "step": 6881 }, { - "epoch": 2.07, - "grad_norm": 11.1996431350708, - "learning_rate": 6.2062744311917415e-06, - "loss": 1.6634, + "epoch": 0.86, + "grad_norm": 9.885921478271484, + "learning_rate": 1.424256369493369e-05, + "loss": 1.5922, "step": 6882 }, { - "epoch": 2.07, - "grad_norm": 17.171239852905273, - "learning_rate": 6.2042698205873515e-06, - "loss": 1.8742, + "epoch": 0.86, + "grad_norm": 23.861791610717773, + "learning_rate": 1.4241726979877004e-05, + "loss": 1.9084, "step": 6883 }, { - "epoch": 2.07, - "grad_norm": 83.33916473388672, - "learning_rate": 6.202265209982962e-06, - "loss": 2.0191, + "epoch": 0.86, + "grad_norm": 35.16427993774414, + "learning_rate": 1.4240890264820318e-05, + "loss": 1.6943, "step": 6884 }, { - "epoch": 2.07, - "grad_norm": 13.033934593200684, - "learning_rate": 6.200260599378572e-06, - "loss": 0.9672, + "epoch": 0.86, + "grad_norm": 23.21710777282715, + "learning_rate": 1.4240053549763628e-05, + "loss": 3.1607, "step": 6885 }, { - "epoch": 2.07, - "grad_norm": 25.13782501220703, - "learning_rate": 6.198255988774181e-06, - "loss": 1.5467, + "epoch": 0.86, + "grad_norm": 17.348012924194336, + "learning_rate": 1.4239216834706942e-05, + "loss": 1.9241, "step": 6886 }, { - "epoch": 2.07, - "grad_norm": 129.21408081054688, - "learning_rate": 6.196251378169791e-06, - "loss": 1.8225, + "epoch": 0.86, + "grad_norm": 25.596208572387695, + "learning_rate": 1.4238380119650254e-05, + "loss": 1.6988, "step": 6887 }, { - "epoch": 2.07, - "grad_norm": 15.809123992919922, - "learning_rate": 6.1942467675654e-06, - "loss": 1.1815, + "epoch": 0.86, + "grad_norm": 15.840264320373535, + "learning_rate": 1.4237543404593568e-05, + "loss": 1.1699, "step": 6888 }, { - "epoch": 2.07, - "grad_norm": 10.88241195678711, - "learning_rate": 6.19224215696101e-06, - "loss": 0.5932, + "epoch": 0.86, + "grad_norm": 37.30739974975586, + "learning_rate": 1.423670668953688e-05, + "loss": 1.8944, "step": 6889 }, { - "epoch": 2.07, - "grad_norm": 17.281024932861328, - "learning_rate": 6.190237546356621e-06, - "loss": 0.6785, + "epoch": 0.86, + "grad_norm": 17.949352264404297, + "learning_rate": 1.4235869974480192e-05, + "loss": 1.7828, "step": 6890 }, { - "epoch": 2.07, - "grad_norm": 78.56420135498047, - "learning_rate": 6.1882329357522305e-06, - "loss": 2.5001, + "epoch": 0.86, + "grad_norm": 28.773000717163086, + "learning_rate": 1.4235033259423505e-05, + "loss": 2.6329, "step": 6891 }, { - "epoch": 2.07, - "grad_norm": 7.025608539581299, - "learning_rate": 6.186228325147841e-06, - "loss": 0.9645, + "epoch": 0.86, + "grad_norm": 14.005388259887695, + "learning_rate": 1.4234196544366815e-05, + "loss": 2.0188, "step": 6892 }, { - "epoch": 2.07, - "grad_norm": 17.67293930053711, - "learning_rate": 6.184223714543451e-06, - "loss": 1.5476, + "epoch": 0.87, + "grad_norm": 38.49642562866211, + "learning_rate": 1.4233359829310129e-05, + "loss": 1.6906, "step": 6893 }, { - "epoch": 2.07, - "grad_norm": 54.84214782714844, - "learning_rate": 6.18221910393906e-06, - "loss": 2.5763, + "epoch": 0.87, + "grad_norm": 11.821557998657227, + "learning_rate": 1.4232523114253443e-05, + "loss": 3.0551, "step": 6894 }, { - "epoch": 2.07, - "grad_norm": 16.025243759155273, - "learning_rate": 6.18021449333467e-06, - "loss": 1.0716, + "epoch": 0.87, + "grad_norm": 8.1752290725708, + "learning_rate": 1.4231686399196756e-05, + "loss": 1.6771, "step": 6895 }, { - "epoch": 2.07, - "grad_norm": 19.868698120117188, - "learning_rate": 6.178209882730281e-06, - "loss": 1.6816, + "epoch": 0.87, + "grad_norm": 8.746302604675293, + "learning_rate": 1.4230849684140067e-05, + "loss": 1.7391, "step": 6896 }, { - "epoch": 2.07, - "grad_norm": 23.872350692749023, - "learning_rate": 6.17620527212589e-06, - "loss": 1.1166, + "epoch": 0.87, + "grad_norm": 17.55376625061035, + "learning_rate": 1.423001296908338e-05, + "loss": 0.5621, "step": 6897 }, { - "epoch": 2.07, - "grad_norm": 17.220922470092773, - "learning_rate": 6.1742006615215e-06, - "loss": 1.601, + "epoch": 0.87, + "grad_norm": 17.566560745239258, + "learning_rate": 1.4229176254026694e-05, + "loss": 2.2356, "step": 6898 }, { - "epoch": 2.07, - "grad_norm": 37.89704895019531, - "learning_rate": 6.1721960509171095e-06, - "loss": 1.2833, + "epoch": 0.87, + "grad_norm": 7.742341995239258, + "learning_rate": 1.4228339538970004e-05, + "loss": 0.8694, "step": 6899 }, { - "epoch": 2.07, - "grad_norm": 10.233612060546875, - "learning_rate": 6.17019144031272e-06, - "loss": 1.137, + "epoch": 0.87, + "grad_norm": 7.490888595581055, + "learning_rate": 1.4227502823913318e-05, + "loss": 0.4476, "step": 6900 }, { - "epoch": 2.07, - "grad_norm": 14.788456916809082, - "learning_rate": 6.168186829708329e-06, - "loss": 0.7968, + "epoch": 0.87, + "grad_norm": 26.35576057434082, + "learning_rate": 1.422666610885663e-05, + "loss": 2.3157, "step": 6901 }, { - "epoch": 2.08, - "grad_norm": 47.135398864746094, - "learning_rate": 6.16618221910394e-06, - "loss": 1.8272, + "epoch": 0.87, + "grad_norm": 21.355043411254883, + "learning_rate": 1.4225829393799943e-05, + "loss": 1.2947, "step": 6902 }, { - "epoch": 2.08, - "grad_norm": 48.95859146118164, - "learning_rate": 6.16417760849955e-06, - "loss": 1.8223, + "epoch": 0.87, + "grad_norm": 21.20218849182129, + "learning_rate": 1.4224992678743255e-05, + "loss": 0.7256, "step": 6903 }, { - "epoch": 2.08, - "grad_norm": 9.25454044342041, - "learning_rate": 6.162172997895159e-06, - "loss": 0.9862, + "epoch": 0.87, + "grad_norm": 9.560347557067871, + "learning_rate": 1.4224155963686567e-05, + "loss": 1.951, "step": 6904 }, { - "epoch": 2.08, - "grad_norm": 34.636192321777344, - "learning_rate": 6.160168387290769e-06, - "loss": 1.5629, + "epoch": 0.87, + "grad_norm": 13.457576751708984, + "learning_rate": 1.4223319248629881e-05, + "loss": 2.0152, "step": 6905 }, { - "epoch": 2.08, - "grad_norm": 51.81803512573242, - "learning_rate": 6.158163776686379e-06, - "loss": 1.5237, + "epoch": 0.87, + "grad_norm": 9.960187911987305, + "learning_rate": 1.4222482533573191e-05, + "loss": 0.695, "step": 6906 }, { - "epoch": 2.08, - "grad_norm": 39.07495880126953, - "learning_rate": 6.1561591660819885e-06, - "loss": 1.1899, + "epoch": 0.87, + "grad_norm": 14.246359825134277, + "learning_rate": 1.4221645818516505e-05, + "loss": 2.6998, "step": 6907 }, { - "epoch": 2.08, - "grad_norm": 22.247758865356445, - "learning_rate": 6.154154555477599e-06, - "loss": 0.9568, + "epoch": 0.87, + "grad_norm": 140.53585815429688, + "learning_rate": 1.4220809103459819e-05, + "loss": 4.2545, "step": 6908 }, { - "epoch": 2.08, - "grad_norm": 15.606700897216797, - "learning_rate": 6.1521499448732095e-06, - "loss": 1.5251, + "epoch": 0.87, + "grad_norm": 16.900049209594727, + "learning_rate": 1.4219972388403132e-05, + "loss": 0.7686, "step": 6909 }, { - "epoch": 2.08, - "grad_norm": 17.546934127807617, - "learning_rate": 6.150145334268819e-06, - "loss": 1.1845, + "epoch": 0.87, + "grad_norm": 6.372493267059326, + "learning_rate": 1.4219135673346442e-05, + "loss": 0.5945, "step": 6910 }, { - "epoch": 2.08, - "grad_norm": 7.191950798034668, - "learning_rate": 6.148140723664429e-06, - "loss": 0.7725, + "epoch": 0.87, + "grad_norm": 11.708452224731445, + "learning_rate": 1.4218298958289756e-05, + "loss": 1.1837, "step": 6911 }, { - "epoch": 2.08, - "grad_norm": 10.01009464263916, - "learning_rate": 6.146136113060038e-06, - "loss": 1.0729, + "epoch": 0.87, + "grad_norm": 17.008995056152344, + "learning_rate": 1.4217462243233068e-05, + "loss": 2.3779, "step": 6912 }, { - "epoch": 2.08, - "grad_norm": 9.743487358093262, - "learning_rate": 6.144131502455648e-06, - "loss": 1.2371, + "epoch": 0.87, + "grad_norm": 5.525969982147217, + "learning_rate": 1.421662552817638e-05, + "loss": 1.4673, "step": 6913 }, { - "epoch": 2.08, - "grad_norm": 21.309419631958008, - "learning_rate": 6.142126891851259e-06, - "loss": 2.0328, + "epoch": 0.87, + "grad_norm": 23.5950984954834, + "learning_rate": 1.4215788813119694e-05, + "loss": 1.7786, "step": 6914 }, { - "epoch": 2.08, - "grad_norm": 14.136860847473145, - "learning_rate": 6.140122281246868e-06, - "loss": 1.4086, + "epoch": 0.87, + "grad_norm": 71.16661834716797, + "learning_rate": 1.4214952098063006e-05, + "loss": 2.6548, "step": 6915 }, { - "epoch": 2.08, - "grad_norm": 12.133606910705566, - "learning_rate": 6.138117670642478e-06, - "loss": 1.3393, + "epoch": 0.87, + "grad_norm": 6.5101823806762695, + "learning_rate": 1.421411538300632e-05, + "loss": 0.6794, "step": 6916 }, { - "epoch": 2.08, - "grad_norm": 8.155012130737305, - "learning_rate": 6.136113060038088e-06, - "loss": 0.8114, + "epoch": 0.87, + "grad_norm": 22.508495330810547, + "learning_rate": 1.4213278667949631e-05, + "loss": 2.1745, "step": 6917 }, { - "epoch": 2.08, - "grad_norm": 14.2897367477417, - "learning_rate": 6.134108449433698e-06, - "loss": 1.0815, + "epoch": 0.87, + "grad_norm": 9.747279167175293, + "learning_rate": 1.4212441952892943e-05, + "loss": 1.7265, "step": 6918 }, { - "epoch": 2.08, - "grad_norm": 37.01668930053711, - "learning_rate": 6.132103838829308e-06, - "loss": 2.4772, + "epoch": 0.87, + "grad_norm": 8.583632469177246, + "learning_rate": 1.4211605237836257e-05, + "loss": 0.9745, "step": 6919 }, { - "epoch": 2.08, - "grad_norm": 9.246944427490234, - "learning_rate": 6.130099228224919e-06, - "loss": 1.0533, + "epoch": 0.87, + "grad_norm": 10.896392822265625, + "learning_rate": 1.4210768522779567e-05, + "loss": 1.7416, "step": 6920 }, { - "epoch": 2.08, - "grad_norm": 14.934267044067383, - "learning_rate": 6.128094617620528e-06, - "loss": 0.8619, + "epoch": 0.87, + "grad_norm": 11.444214820861816, + "learning_rate": 1.420993180772288e-05, + "loss": 1.5348, "step": 6921 }, { - "epoch": 2.08, - "grad_norm": 25.984840393066406, - "learning_rate": 6.126090007016138e-06, - "loss": 1.3991, + "epoch": 0.87, + "grad_norm": 14.957980155944824, + "learning_rate": 1.4209095092666194e-05, + "loss": 2.9994, "step": 6922 }, { - "epoch": 2.08, - "grad_norm": 23.823701858520508, - "learning_rate": 6.124085396411747e-06, - "loss": 2.2563, + "epoch": 0.87, + "grad_norm": 13.002206802368164, + "learning_rate": 1.4208258377609508e-05, + "loss": 1.1721, "step": 6923 }, { - "epoch": 2.08, - "grad_norm": 25.616357803344727, - "learning_rate": 6.122080785807357e-06, - "loss": 1.559, + "epoch": 0.87, + "grad_norm": 29.35884666442871, + "learning_rate": 1.4207421662552818e-05, + "loss": 2.1527, "step": 6924 }, { - "epoch": 2.08, - "grad_norm": 25.483346939086914, - "learning_rate": 6.120076175202967e-06, - "loss": 1.8079, + "epoch": 0.87, + "grad_norm": 38.53581237792969, + "learning_rate": 1.4206584947496132e-05, + "loss": 3.7084, "step": 6925 }, { - "epoch": 2.08, - "grad_norm": 26.09974479675293, - "learning_rate": 6.1180715645985775e-06, - "loss": 1.8173, + "epoch": 0.87, + "grad_norm": 9.266407012939453, + "learning_rate": 1.4205748232439444e-05, + "loss": 1.0747, "step": 6926 }, { - "epoch": 2.08, - "grad_norm": 33.826454162597656, - "learning_rate": 6.116066953994188e-06, - "loss": 1.2576, + "epoch": 0.87, + "grad_norm": 12.204514503479004, + "learning_rate": 1.4204911517382756e-05, + "loss": 1.8189, "step": 6927 }, { - "epoch": 2.08, - "grad_norm": 22.255189895629883, - "learning_rate": 6.114062343389797e-06, - "loss": 1.5849, + "epoch": 0.87, + "grad_norm": 21.560924530029297, + "learning_rate": 1.420407480232607e-05, + "loss": 1.6065, "step": 6928 }, { - "epoch": 2.08, - "grad_norm": 23.203243255615234, - "learning_rate": 6.112057732785407e-06, - "loss": 2.0366, + "epoch": 0.87, + "grad_norm": 10.391530990600586, + "learning_rate": 1.4203238087269381e-05, + "loss": 0.4697, "step": 6929 }, { - "epoch": 2.08, - "grad_norm": 48.67564010620117, - "learning_rate": 6.110053122181017e-06, - "loss": 1.4752, + "epoch": 0.87, + "grad_norm": 8.712636947631836, + "learning_rate": 1.4202401372212695e-05, + "loss": 0.5286, "step": 6930 }, { - "epoch": 2.08, - "grad_norm": 13.649922370910645, - "learning_rate": 6.108048511576626e-06, - "loss": 1.0061, + "epoch": 0.87, + "grad_norm": 5.031757354736328, + "learning_rate": 1.4201564657156005e-05, + "loss": 0.0881, "step": 6931 }, { - "epoch": 2.08, - "grad_norm": 15.346248626708984, - "learning_rate": 6.106043900972236e-06, - "loss": 0.975, + "epoch": 0.87, + "grad_norm": 12.193156242370605, + "learning_rate": 1.4200727942099319e-05, + "loss": 1.2649, "step": 6932 }, { - "epoch": 2.08, - "grad_norm": 43.33583450317383, - "learning_rate": 6.104039290367847e-06, - "loss": 1.8835, + "epoch": 0.87, + "grad_norm": 16.03141975402832, + "learning_rate": 1.4199891227042633e-05, + "loss": 1.6772, "step": 6933 }, { - "epoch": 2.08, - "grad_norm": 11.801863670349121, - "learning_rate": 6.1020346797634565e-06, - "loss": 0.9421, + "epoch": 0.87, + "grad_norm": 56.04813003540039, + "learning_rate": 1.4199054511985943e-05, + "loss": 3.0297, "step": 6934 }, { - "epoch": 2.09, - "grad_norm": 13.316075325012207, - "learning_rate": 6.100030069159067e-06, - "loss": 0.9621, + "epoch": 0.87, + "grad_norm": 20.556060791015625, + "learning_rate": 1.4198217796929257e-05, + "loss": 1.6633, "step": 6935 }, { - "epoch": 2.09, - "grad_norm": 7.643853187561035, - "learning_rate": 6.098025458554676e-06, - "loss": 0.7109, + "epoch": 0.87, + "grad_norm": 55.4222412109375, + "learning_rate": 1.419738108187257e-05, + "loss": 2.2368, "step": 6936 }, { - "epoch": 2.09, - "grad_norm": 6.656645774841309, - "learning_rate": 6.096020847950286e-06, - "loss": 0.9677, + "epoch": 0.87, + "grad_norm": 25.066200256347656, + "learning_rate": 1.4196544366815884e-05, + "loss": 0.8717, "step": 6937 }, { - "epoch": 2.09, - "grad_norm": 8.41319751739502, - "learning_rate": 6.094016237345895e-06, - "loss": 1.4541, + "epoch": 0.87, + "grad_norm": 11.717459678649902, + "learning_rate": 1.4195707651759194e-05, + "loss": 3.939, "step": 6938 }, { - "epoch": 2.09, - "grad_norm": 17.172225952148438, - "learning_rate": 6.092011626741506e-06, - "loss": 1.2853, + "epoch": 0.87, + "grad_norm": 18.704364776611328, + "learning_rate": 1.4194870936702508e-05, + "loss": 2.1806, "step": 6939 }, { - "epoch": 2.09, - "grad_norm": 13.03164005279541, - "learning_rate": 6.090007016137116e-06, - "loss": 1.2991, + "epoch": 0.87, + "grad_norm": 18.1362361907959, + "learning_rate": 1.419403422164582e-05, + "loss": 1.377, "step": 6940 }, { - "epoch": 2.09, - "grad_norm": 26.83146095275879, - "learning_rate": 6.088002405532725e-06, - "loss": 1.2144, + "epoch": 0.87, + "grad_norm": 18.96443748474121, + "learning_rate": 1.4193197506589132e-05, + "loss": 1.608, "step": 6941 }, { - "epoch": 2.09, - "grad_norm": 15.566588401794434, - "learning_rate": 6.0859977949283355e-06, - "loss": 1.1742, + "epoch": 0.87, + "grad_norm": 10.305618286132812, + "learning_rate": 1.4192360791532445e-05, + "loss": 0.4654, "step": 6942 }, { - "epoch": 2.09, - "grad_norm": 11.731130599975586, - "learning_rate": 6.083993184323946e-06, - "loss": 1.5685, + "epoch": 0.87, + "grad_norm": 12.450068473815918, + "learning_rate": 1.4191524076475757e-05, + "loss": 1.6787, "step": 6943 }, { - "epoch": 2.09, - "grad_norm": 23.45868492126465, - "learning_rate": 6.081988573719555e-06, - "loss": 1.1652, + "epoch": 0.87, + "grad_norm": 6.444420337677002, + "learning_rate": 1.4190687361419071e-05, + "loss": 0.5719, "step": 6944 }, { - "epoch": 2.09, - "grad_norm": 30.693378448486328, - "learning_rate": 6.079983963115166e-06, - "loss": 1.1192, + "epoch": 0.87, + "grad_norm": 10.12495231628418, + "learning_rate": 1.4189850646362381e-05, + "loss": 0.7943, "step": 6945 }, { - "epoch": 2.09, - "grad_norm": 76.05176544189453, - "learning_rate": 6.077979352510776e-06, - "loss": 1.03, + "epoch": 0.87, + "grad_norm": 12.811422348022461, + "learning_rate": 1.4189013931305695e-05, + "loss": 1.4826, "step": 6946 }, { - "epoch": 2.09, - "grad_norm": 42.937931060791016, - "learning_rate": 6.075974741906385e-06, - "loss": 2.9801, + "epoch": 0.87, + "grad_norm": 27.980676651000977, + "learning_rate": 1.4188177216249009e-05, + "loss": 1.9857, "step": 6947 }, { - "epoch": 2.09, - "grad_norm": 21.51824188232422, - "learning_rate": 6.073970131301995e-06, - "loss": 1.3744, + "epoch": 0.87, + "grad_norm": 23.33842658996582, + "learning_rate": 1.4187340501192319e-05, + "loss": 1.7549, "step": 6948 }, { - "epoch": 2.09, - "grad_norm": 13.035652160644531, - "learning_rate": 6.071965520697604e-06, - "loss": 1.2832, + "epoch": 0.87, + "grad_norm": 8.448101043701172, + "learning_rate": 1.4186503786135632e-05, + "loss": 1.0385, "step": 6949 }, { - "epoch": 2.09, - "grad_norm": 7.819190502166748, - "learning_rate": 6.0699609100932145e-06, - "loss": 1.3225, + "epoch": 0.87, + "grad_norm": 18.32587242126465, + "learning_rate": 1.4185667071078946e-05, + "loss": 2.4694, "step": 6950 }, { - "epoch": 2.09, - "grad_norm": 10.307256698608398, - "learning_rate": 6.067956299488825e-06, - "loss": 1.1998, + "epoch": 0.87, + "grad_norm": 14.72725772857666, + "learning_rate": 1.418483035602226e-05, + "loss": 1.1245, "step": 6951 }, { - "epoch": 2.09, - "grad_norm": 43.879268646240234, - "learning_rate": 6.065951688884435e-06, - "loss": 1.2899, + "epoch": 0.87, + "grad_norm": 10.395641326904297, + "learning_rate": 1.418399364096557e-05, + "loss": 2.1181, "step": 6952 }, { - "epoch": 2.09, - "grad_norm": 14.263348579406738, - "learning_rate": 6.063947078280045e-06, - "loss": 0.8165, + "epoch": 0.87, + "grad_norm": 17.333349227905273, + "learning_rate": 1.4183156925908884e-05, + "loss": 2.2361, "step": 6953 }, { - "epoch": 2.09, - "grad_norm": 19.4408016204834, - "learning_rate": 6.061942467675655e-06, - "loss": 1.2222, + "epoch": 0.87, + "grad_norm": 18.26038932800293, + "learning_rate": 1.4182320210852196e-05, + "loss": 1.1122, "step": 6954 }, { - "epoch": 2.09, - "grad_norm": 21.61421012878418, - "learning_rate": 6.059937857071264e-06, - "loss": 1.6328, + "epoch": 0.87, + "grad_norm": 7.8660478591918945, + "learning_rate": 1.4181483495795508e-05, + "loss": 0.234, "step": 6955 }, { - "epoch": 2.09, - "grad_norm": 59.171634674072266, - "learning_rate": 6.057933246466874e-06, - "loss": 2.2851, + "epoch": 0.87, + "grad_norm": 12.084300994873047, + "learning_rate": 1.4180646780738821e-05, + "loss": 0.5299, "step": 6956 }, { - "epoch": 2.09, - "grad_norm": 17.417770385742188, - "learning_rate": 6.055928635862485e-06, - "loss": 1.5558, + "epoch": 0.87, + "grad_norm": 16.44608497619629, + "learning_rate": 1.4179810065682133e-05, + "loss": 1.3773, "step": 6957 }, { - "epoch": 2.09, - "grad_norm": 21.351945877075195, - "learning_rate": 6.053924025258094e-06, - "loss": 1.9198, + "epoch": 0.87, + "grad_norm": 10.259683609008789, + "learning_rate": 1.4178973350625445e-05, + "loss": 0.7847, "step": 6958 }, { - "epoch": 2.09, - "grad_norm": 21.162813186645508, - "learning_rate": 6.051919414653704e-06, - "loss": 0.8733, + "epoch": 0.87, + "grad_norm": 6.972939968109131, + "learning_rate": 1.4178136635568757e-05, + "loss": 0.7622, "step": 6959 }, { - "epoch": 2.09, - "grad_norm": 28.082653045654297, - "learning_rate": 6.049914804049314e-06, - "loss": 1.3162, - "step": 6960 - }, - { - "epoch": 2.09, - "eval_loss": 0.1762227714061737, - "eval_runtime": 43.6018, - "eval_samples_per_second": 33.921, - "eval_steps_per_second": 33.921, + "epoch": 0.87, + "grad_norm": 10.767699241638184, + "learning_rate": 1.417729992051207e-05, + "loss": 1.5511, "step": 6960 }, { - "epoch": 2.09, - "grad_norm": 17.02800941467285, - "learning_rate": 6.047910193444924e-06, - "loss": 1.4449, + "epoch": 0.87, + "grad_norm": 15.990443229675293, + "learning_rate": 1.4176463205455384e-05, + "loss": 2.1373, "step": 6961 }, { - "epoch": 2.09, - "grad_norm": 48.51543045043945, - "learning_rate": 6.045905582840533e-06, - "loss": 1.0376, + "epoch": 0.87, + "grad_norm": 9.765302658081055, + "learning_rate": 1.4175626490398695e-05, + "loss": 1.1673, "step": 6962 }, { - "epoch": 2.09, - "grad_norm": 30.203258514404297, - "learning_rate": 6.043900972236144e-06, - "loss": 1.8892, + "epoch": 0.87, + "grad_norm": 16.047128677368164, + "learning_rate": 1.4174789775342008e-05, + "loss": 3.8891, "step": 6963 }, { - "epoch": 2.09, - "grad_norm": 22.062679290771484, - "learning_rate": 6.041896361631754e-06, - "loss": 2.034, + "epoch": 0.87, + "grad_norm": 75.53546142578125, + "learning_rate": 1.4173953060285322e-05, + "loss": 2.9341, "step": 6964 }, { - "epoch": 2.09, - "grad_norm": 29.935195922851562, - "learning_rate": 6.039891751027363e-06, - "loss": 2.1552, + "epoch": 0.87, + "grad_norm": 39.25362014770508, + "learning_rate": 1.4173116345228632e-05, + "loss": 0.6573, "step": 6965 }, { - "epoch": 2.09, - "grad_norm": 21.121536254882812, - "learning_rate": 6.037887140422973e-06, - "loss": 1.848, + "epoch": 0.87, + "grad_norm": 28.128896713256836, + "learning_rate": 1.4172279630171946e-05, + "loss": 1.7481, "step": 6966 }, { - "epoch": 2.09, - "grad_norm": 13.469535827636719, - "learning_rate": 6.035882529818583e-06, - "loss": 0.8718, + "epoch": 0.87, + "grad_norm": 33.34860610961914, + "learning_rate": 1.417144291511526e-05, + "loss": 2.1332, "step": 6967 }, { - "epoch": 2.1, - "grad_norm": 9.78769302368164, - "learning_rate": 6.033877919214193e-06, - "loss": 0.5627, + "epoch": 0.87, + "grad_norm": 14.426985740661621, + "learning_rate": 1.4170606200058571e-05, + "loss": 1.993, "step": 6968 }, { - "epoch": 2.1, - "grad_norm": 11.262333869934082, - "learning_rate": 6.0318733086098035e-06, - "loss": 1.4755, + "epoch": 0.87, + "grad_norm": 12.120182037353516, + "learning_rate": 1.4169769485001883e-05, + "loss": 2.3124, "step": 6969 }, { - "epoch": 2.1, - "grad_norm": 21.476552963256836, - "learning_rate": 6.029868698005414e-06, - "loss": 0.9482, + "epoch": 0.87, + "grad_norm": 9.713573455810547, + "learning_rate": 1.4168932769945195e-05, + "loss": 0.6764, "step": 6970 }, { - "epoch": 2.1, - "grad_norm": 8.83089542388916, - "learning_rate": 6.027864087401023e-06, - "loss": 0.9499, + "epoch": 0.87, + "grad_norm": 10.640819549560547, + "learning_rate": 1.4168096054888509e-05, + "loss": 1.2914, "step": 6971 }, { - "epoch": 2.1, - "grad_norm": 16.572052001953125, - "learning_rate": 6.025859476796633e-06, - "loss": 1.4945, + "epoch": 0.87, + "grad_norm": 9.241393089294434, + "learning_rate": 1.4167259339831821e-05, + "loss": 1.2839, "step": 6972 }, { - "epoch": 2.1, - "grad_norm": 94.96258544921875, - "learning_rate": 6.023854866192242e-06, - "loss": 1.655, + "epoch": 0.88, + "grad_norm": 15.63196086883545, + "learning_rate": 1.4166422624775133e-05, + "loss": 0.7556, "step": 6973 }, { - "epoch": 2.1, - "grad_norm": 11.988658905029297, - "learning_rate": 6.021850255587852e-06, - "loss": 1.4834, + "epoch": 0.88, + "grad_norm": 15.385232925415039, + "learning_rate": 1.4165585909718447e-05, + "loss": 1.7352, "step": 6974 }, { - "epoch": 2.1, - "grad_norm": 15.83250904083252, - "learning_rate": 6.0198456449834615e-06, - "loss": 1.7121, + "epoch": 0.88, + "grad_norm": 54.85871505737305, + "learning_rate": 1.416474919466176e-05, + "loss": 2.9882, "step": 6975 }, { - "epoch": 2.1, - "grad_norm": 14.637907981872559, - "learning_rate": 6.0178410343790724e-06, - "loss": 1.849, + "epoch": 0.88, + "grad_norm": 18.45826530456543, + "learning_rate": 1.416391247960507e-05, + "loss": 1.0526, "step": 6976 }, { - "epoch": 2.1, - "grad_norm": 42.469970703125, - "learning_rate": 6.0158364237746825e-06, - "loss": 1.3548, + "epoch": 0.88, + "grad_norm": 33.669708251953125, + "learning_rate": 1.4163075764548384e-05, + "loss": 1.7136, "step": 6977 }, { - "epoch": 2.1, - "grad_norm": 14.263885498046875, - "learning_rate": 6.013831813170293e-06, - "loss": 1.5441, + "epoch": 0.88, + "grad_norm": 16.6716251373291, + "learning_rate": 1.4162239049491698e-05, + "loss": 1.7772, "step": 6978 }, { - "epoch": 2.1, - "grad_norm": 91.35665130615234, - "learning_rate": 6.011827202565902e-06, - "loss": 2.4573, + "epoch": 0.88, + "grad_norm": 10.420305252075195, + "learning_rate": 1.4161402334435008e-05, + "loss": 3.1408, "step": 6979 }, { - "epoch": 2.1, - "grad_norm": 26.59601402282715, - "learning_rate": 6.009822591961512e-06, - "loss": 1.0937, + "epoch": 0.88, + "grad_norm": 30.392881393432617, + "learning_rate": 1.4160565619378322e-05, + "loss": 1.545, "step": 6980 }, { - "epoch": 2.1, - "grad_norm": 21.7692928314209, - "learning_rate": 6.007817981357121e-06, - "loss": 1.4581, + "epoch": 0.88, + "grad_norm": 7.688721656799316, + "learning_rate": 1.4159728904321635e-05, + "loss": 1.6216, "step": 6981 }, { - "epoch": 2.1, - "grad_norm": 55.85102081298828, - "learning_rate": 6.005813370752732e-06, - "loss": 1.5007, + "epoch": 0.88, + "grad_norm": 8.282111167907715, + "learning_rate": 1.4158892189264947e-05, + "loss": 1.3615, "step": 6982 }, { - "epoch": 2.1, - "grad_norm": 17.059696197509766, - "learning_rate": 6.003808760148342e-06, - "loss": 1.2327, + "epoch": 0.88, + "grad_norm": 20.921794891357422, + "learning_rate": 1.415805547420826e-05, + "loss": 2.6633, "step": 6983 }, { - "epoch": 2.1, - "grad_norm": 20.334264755249023, - "learning_rate": 6.001804149543951e-06, - "loss": 1.1726, + "epoch": 0.88, + "grad_norm": 16.57208251953125, + "learning_rate": 1.4157218759151571e-05, + "loss": 2.6769, "step": 6984 }, { - "epoch": 2.1, - "grad_norm": 14.4188871383667, - "learning_rate": 5.9997995389395615e-06, - "loss": 1.4264, + "epoch": 0.88, + "grad_norm": 13.83458137512207, + "learning_rate": 1.4156382044094885e-05, + "loss": 2.5668, "step": 6985 }, { - "epoch": 2.1, - "grad_norm": 10.726988792419434, - "learning_rate": 5.997794928335171e-06, - "loss": 0.912, + "epoch": 0.88, + "grad_norm": 9.996246337890625, + "learning_rate": 1.4155545329038197e-05, + "loss": 1.924, "step": 6986 }, { - "epoch": 2.1, - "grad_norm": 22.09978675842285, - "learning_rate": 5.995790317730781e-06, - "loss": 1.9006, + "epoch": 0.88, + "grad_norm": 14.711870193481445, + "learning_rate": 1.4154708613981509e-05, + "loss": 1.1225, "step": 6987 }, { - "epoch": 2.1, - "grad_norm": 25.827951431274414, - "learning_rate": 5.993785707126392e-06, - "loss": 2.7776, + "epoch": 0.88, + "grad_norm": 13.016510009765625, + "learning_rate": 1.4153871898924822e-05, + "loss": 0.9249, "step": 6988 }, { - "epoch": 2.1, - "grad_norm": 43.5711784362793, - "learning_rate": 5.991781096522001e-06, - "loss": 1.5516, + "epoch": 0.88, + "grad_norm": 11.098211288452148, + "learning_rate": 1.4153035183868136e-05, + "loss": 1.3117, "step": 6989 }, { - "epoch": 2.1, - "grad_norm": 20.165645599365234, - "learning_rate": 5.989776485917611e-06, - "loss": 1.1189, + "epoch": 0.88, + "grad_norm": 22.908828735351562, + "learning_rate": 1.4152198468811446e-05, + "loss": 1.8384, "step": 6990 }, { - "epoch": 2.1, - "grad_norm": 23.63228988647461, - "learning_rate": 5.987771875313221e-06, - "loss": 1.4044, + "epoch": 0.88, + "grad_norm": 18.00226593017578, + "learning_rate": 1.415136175375476e-05, + "loss": 2.6107, "step": 6991 }, { - "epoch": 2.1, - "grad_norm": 18.94355583190918, - "learning_rate": 5.98576726470883e-06, - "loss": 1.3975, + "epoch": 0.88, + "grad_norm": 19.675580978393555, + "learning_rate": 1.4150525038698074e-05, + "loss": 0.7767, "step": 6992 }, { - "epoch": 2.1, - "grad_norm": 43.25507736206055, - "learning_rate": 5.9837626541044405e-06, - "loss": 2.5693, + "epoch": 0.88, + "grad_norm": 5.079214096069336, + "learning_rate": 1.4149688323641384e-05, + "loss": 0.1675, "step": 6993 }, { - "epoch": 2.1, - "grad_norm": 24.190704345703125, - "learning_rate": 5.981758043500051e-06, - "loss": 1.0735, + "epoch": 0.88, + "grad_norm": 18.402210235595703, + "learning_rate": 1.4148851608584697e-05, + "loss": 1.213, "step": 6994 }, { - "epoch": 2.1, - "grad_norm": 16.01865577697754, - "learning_rate": 5.979753432895661e-06, - "loss": 1.2435, + "epoch": 0.88, + "grad_norm": 24.991798400878906, + "learning_rate": 1.4148014893528011e-05, + "loss": 1.6542, "step": 6995 }, { - "epoch": 2.1, - "grad_norm": 16.649044036865234, - "learning_rate": 5.977748822291271e-06, - "loss": 1.3125, + "epoch": 0.88, + "grad_norm": 9.81252384185791, + "learning_rate": 1.4147178178471323e-05, + "loss": 1.3761, "step": 6996 }, { - "epoch": 2.1, - "grad_norm": 17.755905151367188, - "learning_rate": 5.97574421168688e-06, - "loss": 1.6912, + "epoch": 0.88, + "grad_norm": 4.908636569976807, + "learning_rate": 1.4146341463414635e-05, + "loss": 0.2464, "step": 6997 }, { - "epoch": 2.1, - "grad_norm": 18.950183868408203, - "learning_rate": 5.97373960108249e-06, - "loss": 1.9294, + "epoch": 0.88, + "grad_norm": 7.581484317779541, + "learning_rate": 1.4145504748357947e-05, + "loss": 1.0555, "step": 6998 }, { - "epoch": 2.1, - "grad_norm": 9.899605751037598, - "learning_rate": 5.971734990478099e-06, - "loss": 1.6834, + "epoch": 0.88, + "grad_norm": 11.803753852844238, + "learning_rate": 1.414466803330126e-05, + "loss": 1.0861, "step": 6999 }, { - "epoch": 2.1, - "grad_norm": 16.555978775024414, - "learning_rate": 5.96973037987371e-06, - "loss": 1.773, + "epoch": 0.88, + "grad_norm": 15.46438980102539, + "learning_rate": 1.4143831318244573e-05, + "loss": 1.4876, "step": 7000 }, { - "epoch": 2.1, - "grad_norm": 9.918785095214844, - "learning_rate": 5.96772576926932e-06, - "loss": 0.8201, + "epoch": 0.88, + "grad_norm": 18.067228317260742, + "learning_rate": 1.4142994603187885e-05, + "loss": 2.3218, "step": 7001 }, { - "epoch": 2.11, - "grad_norm": 7.417164325714111, - "learning_rate": 5.9657211586649295e-06, - "loss": 0.8608, + "epoch": 0.88, + "grad_norm": 34.01871109008789, + "learning_rate": 1.4142157888131198e-05, + "loss": 1.8282, "step": 7002 }, { - "epoch": 2.11, - "grad_norm": 31.227968215942383, - "learning_rate": 5.96371654806054e-06, - "loss": 1.3737, + "epoch": 0.88, + "grad_norm": 12.05657958984375, + "learning_rate": 1.4141321173074512e-05, + "loss": 1.8803, "step": 7003 }, { - "epoch": 2.11, - "grad_norm": 16.146133422851562, - "learning_rate": 5.96171193745615e-06, - "loss": 1.6191, + "epoch": 0.88, + "grad_norm": 12.193592071533203, + "learning_rate": 1.4140484458017822e-05, + "loss": 2.3793, "step": 7004 }, { - "epoch": 2.11, - "grad_norm": 11.509698867797852, - "learning_rate": 5.959707326851759e-06, - "loss": 1.118, + "epoch": 0.88, + "grad_norm": 29.061677932739258, + "learning_rate": 1.4139647742961136e-05, + "loss": 1.6299, "step": 7005 }, { - "epoch": 2.11, - "grad_norm": 11.928789138793945, - "learning_rate": 5.95770271624737e-06, - "loss": 1.3745, + "epoch": 0.88, + "grad_norm": 7.4007039070129395, + "learning_rate": 1.413881102790445e-05, + "loss": 0.6984, "step": 7006 }, { - "epoch": 2.11, - "grad_norm": 12.056342124938965, - "learning_rate": 5.95569810564298e-06, - "loss": 2.4071, + "epoch": 0.88, + "grad_norm": 9.471341133117676, + "learning_rate": 1.413797431284776e-05, + "loss": 0.9251, "step": 7007 }, { - "epoch": 2.11, - "grad_norm": 15.146832466125488, - "learning_rate": 5.953693495038589e-06, - "loss": 0.6288, + "epoch": 0.88, + "grad_norm": 7.935895919799805, + "learning_rate": 1.4137137597791073e-05, + "loss": 0.7665, "step": 7008 }, { - "epoch": 2.11, - "grad_norm": 15.896178245544434, - "learning_rate": 5.951688884434199e-06, - "loss": 1.3848, + "epoch": 0.88, + "grad_norm": 145.23446655273438, + "learning_rate": 1.4136300882734387e-05, + "loss": 2.0616, "step": 7009 }, { - "epoch": 2.11, - "grad_norm": 20.91379737854004, - "learning_rate": 5.9496842738298085e-06, - "loss": 1.9552, + "epoch": 0.88, + "grad_norm": 11.415495872497559, + "learning_rate": 1.4135464167677699e-05, + "loss": 1.2726, "step": 7010 }, { - "epoch": 2.11, - "grad_norm": 15.576189041137695, - "learning_rate": 5.947679663225419e-06, - "loss": 1.7155, + "epoch": 0.88, + "grad_norm": 31.87625503540039, + "learning_rate": 1.4134627452621011e-05, + "loss": 1.1281, "step": 7011 }, { - "epoch": 2.11, - "grad_norm": 12.269247055053711, - "learning_rate": 5.9456750526210295e-06, - "loss": 1.1991, + "epoch": 0.88, + "grad_norm": 10.819061279296875, + "learning_rate": 1.4133790737564323e-05, + "loss": 1.6783, "step": 7012 }, { - "epoch": 2.11, - "grad_norm": 11.45450210571289, - "learning_rate": 5.943670442016639e-06, - "loss": 0.6404, + "epoch": 0.88, + "grad_norm": 10.764213562011719, + "learning_rate": 1.4132954022507636e-05, + "loss": 1.2949, "step": 7013 }, { - "epoch": 2.11, - "grad_norm": 14.839483261108398, - "learning_rate": 5.941665831412249e-06, - "loss": 1.1984, + "epoch": 0.88, + "grad_norm": 16.699296951293945, + "learning_rate": 1.4132117307450948e-05, + "loss": 0.6814, "step": 7014 }, { - "epoch": 2.11, - "grad_norm": 11.097734451293945, - "learning_rate": 5.939661220807859e-06, - "loss": 1.0033, + "epoch": 0.88, + "grad_norm": 10.548154830932617, + "learning_rate": 1.413128059239426e-05, + "loss": 1.881, "step": 7015 }, { - "epoch": 2.11, - "grad_norm": 23.61093521118164, - "learning_rate": 5.937656610203468e-06, - "loss": 2.0833, + "epoch": 0.88, + "grad_norm": 22.957895278930664, + "learning_rate": 1.4130443877337574e-05, + "loss": 1.9368, "step": 7016 }, { - "epoch": 2.11, - "grad_norm": 16.99240493774414, - "learning_rate": 5.935651999599078e-06, - "loss": 1.4891, + "epoch": 0.88, + "grad_norm": 10.620356559753418, + "learning_rate": 1.4129607162280888e-05, + "loss": 0.5651, "step": 7017 }, { - "epoch": 2.11, - "grad_norm": 11.620327949523926, - "learning_rate": 5.9336473889946875e-06, - "loss": 0.9176, + "epoch": 0.88, + "grad_norm": 12.291309356689453, + "learning_rate": 1.4128770447224198e-05, + "loss": 1.1731, "step": 7018 }, { - "epoch": 2.11, - "grad_norm": 17.769229888916016, - "learning_rate": 5.9316427783902984e-06, - "loss": 2.0986, + "epoch": 0.88, + "grad_norm": 30.018396377563477, + "learning_rate": 1.4127933732167512e-05, + "loss": 1.7424, "step": 7019 }, { - "epoch": 2.11, - "grad_norm": 9.187108039855957, - "learning_rate": 5.9296381677859085e-06, - "loss": 1.0068, + "epoch": 0.88, + "grad_norm": 12.34261703491211, + "learning_rate": 1.4127097017110825e-05, + "loss": 1.4332, "step": 7020 }, { - "epoch": 2.11, - "grad_norm": 47.74100112915039, - "learning_rate": 5.927633557181518e-06, - "loss": 1.3823, + "epoch": 0.88, + "grad_norm": 22.039567947387695, + "learning_rate": 1.4126260302054136e-05, + "loss": 1.5204, "step": 7021 }, { - "epoch": 2.11, - "grad_norm": 20.690021514892578, - "learning_rate": 5.925628946577128e-06, - "loss": 1.5737, + "epoch": 0.88, + "grad_norm": 9.983137130737305, + "learning_rate": 1.412542358699745e-05, + "loss": 1.5573, "step": 7022 }, { - "epoch": 2.11, - "grad_norm": 19.90296745300293, - "learning_rate": 5.923624335972737e-06, - "loss": 1.6142, + "epoch": 0.88, + "grad_norm": 7.264036178588867, + "learning_rate": 1.4124586871940761e-05, + "loss": 0.2537, "step": 7023 }, { - "epoch": 2.11, - "grad_norm": 19.526081085205078, - "learning_rate": 5.921619725368347e-06, - "loss": 0.9043, + "epoch": 0.88, + "grad_norm": 10.880606651306152, + "learning_rate": 1.4123750156884075e-05, + "loss": 2.5847, "step": 7024 }, { - "epoch": 2.11, - "grad_norm": 47.775001525878906, - "learning_rate": 5.919615114763958e-06, - "loss": 3.4549, + "epoch": 0.88, + "grad_norm": 23.634489059448242, + "learning_rate": 1.4122913441827387e-05, + "loss": 2.3404, "step": 7025 }, { - "epoch": 2.11, - "grad_norm": 12.722251892089844, - "learning_rate": 5.917610504159567e-06, - "loss": 1.7378, + "epoch": 0.88, + "grad_norm": 13.010921478271484, + "learning_rate": 1.4122076726770699e-05, + "loss": 1.2087, "step": 7026 }, { - "epoch": 2.11, - "grad_norm": 10.309690475463867, - "learning_rate": 5.915605893555177e-06, - "loss": 0.9647, + "epoch": 0.88, + "grad_norm": 11.797823905944824, + "learning_rate": 1.4121240011714012e-05, + "loss": 2.3684, "step": 7027 }, { - "epoch": 2.11, - "grad_norm": 10.273581504821777, - "learning_rate": 5.9136012829507875e-06, - "loss": 1.6892, + "epoch": 0.88, + "grad_norm": 6.033276557922363, + "learning_rate": 1.4120403296657324e-05, + "loss": 0.6981, "step": 7028 }, { - "epoch": 2.11, - "grad_norm": 16.655803680419922, - "learning_rate": 5.911596672346397e-06, - "loss": 1.6853, + "epoch": 0.88, + "grad_norm": 12.963159561157227, + "learning_rate": 1.4119566581600636e-05, + "loss": 2.9143, "step": 7029 }, { - "epoch": 2.11, - "grad_norm": 23.953218460083008, - "learning_rate": 5.909592061742007e-06, - "loss": 0.5064, + "epoch": 0.88, + "grad_norm": 25.27248764038086, + "learning_rate": 1.411872986654395e-05, + "loss": 4.1087, "step": 7030 }, { - "epoch": 2.11, - "grad_norm": 21.488235473632812, - "learning_rate": 5.907587451137618e-06, - "loss": 1.7174, + "epoch": 0.88, + "grad_norm": 11.717084884643555, + "learning_rate": 1.4117893151487264e-05, + "loss": 1.4052, "step": 7031 }, { - "epoch": 2.11, - "grad_norm": 54.67894744873047, - "learning_rate": 5.905582840533227e-06, - "loss": 1.7804, + "epoch": 0.88, + "grad_norm": 9.638819694519043, + "learning_rate": 1.4117056436430574e-05, + "loss": 2.5262, "step": 7032 }, { - "epoch": 2.11, - "grad_norm": 10.905550956726074, - "learning_rate": 5.903578229928837e-06, - "loss": 0.7826, + "epoch": 0.88, + "grad_norm": 4.7968220710754395, + "learning_rate": 1.4116219721373887e-05, + "loss": 0.3063, "step": 7033 }, { - "epoch": 2.11, - "grad_norm": 6.627830982208252, - "learning_rate": 5.901573619324446e-06, - "loss": 0.9134, + "epoch": 0.88, + "grad_norm": 15.35245132446289, + "learning_rate": 1.4115383006317201e-05, + "loss": 1.6842, "step": 7034 }, { - "epoch": 2.12, - "grad_norm": 12.987330436706543, - "learning_rate": 5.899569008720056e-06, - "loss": 1.4485, + "epoch": 0.88, + "grad_norm": 3.629316806793213, + "learning_rate": 1.4114546291260511e-05, + "loss": 0.0851, "step": 7035 }, { - "epoch": 2.12, - "grad_norm": 14.574166297912598, - "learning_rate": 5.897564398115666e-06, - "loss": 1.1571, + "epoch": 0.88, + "grad_norm": 15.7343168258667, + "learning_rate": 1.4113709576203825e-05, + "loss": 2.3484, "step": 7036 }, { - "epoch": 2.12, - "grad_norm": 12.813142776489258, - "learning_rate": 5.8955597875112766e-06, - "loss": 2.2213, + "epoch": 0.88, + "grad_norm": 9.407702445983887, + "learning_rate": 1.4112872861147137e-05, + "loss": 2.2536, "step": 7037 }, { - "epoch": 2.12, - "grad_norm": 96.96940612792969, - "learning_rate": 5.893555176906887e-06, - "loss": 2.6402, + "epoch": 0.88, + "grad_norm": 15.683849334716797, + "learning_rate": 1.411203614609045e-05, + "loss": 1.682, "step": 7038 }, { - "epoch": 2.12, - "grad_norm": 22.448095321655273, - "learning_rate": 5.891550566302497e-06, - "loss": 1.2935, + "epoch": 0.88, + "grad_norm": 29.45321273803711, + "learning_rate": 1.4111199431033763e-05, + "loss": 1.3847, "step": 7039 }, { - "epoch": 2.12, - "grad_norm": 8.147926330566406, - "learning_rate": 5.889545955698106e-06, - "loss": 0.8031, + "epoch": 0.88, + "grad_norm": 12.613272666931152, + "learning_rate": 1.4110362715977075e-05, + "loss": 1.7653, "step": 7040 }, { - "epoch": 2.12, - "grad_norm": 16.05350685119629, - "learning_rate": 5.887541345093716e-06, - "loss": 1.6077, + "epoch": 0.88, + "grad_norm": 13.921170234680176, + "learning_rate": 1.4109526000920388e-05, + "loss": 2.4739, "step": 7041 }, { - "epoch": 2.12, - "grad_norm": 6.752038955688477, - "learning_rate": 5.885536734489325e-06, - "loss": 0.6485, + "epoch": 0.88, + "grad_norm": 9.824431419372559, + "learning_rate": 1.4108689285863698e-05, + "loss": 1.2665, "step": 7042 }, { - "epoch": 2.12, - "grad_norm": 13.922531127929688, - "learning_rate": 5.883532123884936e-06, - "loss": 1.3103, + "epoch": 0.88, + "grad_norm": 31.29667854309082, + "learning_rate": 1.4107852570807012e-05, + "loss": 1.5647, "step": 7043 }, { - "epoch": 2.12, - "grad_norm": 26.871532440185547, - "learning_rate": 5.881527513280546e-06, - "loss": 1.4686, + "epoch": 0.88, + "grad_norm": 57.366641998291016, + "learning_rate": 1.4107015855750326e-05, + "loss": 0.898, "step": 7044 }, { - "epoch": 2.12, - "grad_norm": 10.8401460647583, - "learning_rate": 5.8795229026761555e-06, - "loss": 1.5021, + "epoch": 0.88, + "grad_norm": 8.376490592956543, + "learning_rate": 1.410617914069364e-05, + "loss": 0.7108, "step": 7045 }, { - "epoch": 2.12, - "grad_norm": 22.18703842163086, - "learning_rate": 5.877518292071766e-06, - "loss": 1.2032, + "epoch": 0.88, + "grad_norm": 52.11063766479492, + "learning_rate": 1.410534242563695e-05, + "loss": 2.0481, "step": 7046 }, { - "epoch": 2.12, - "grad_norm": 14.264338493347168, - "learning_rate": 5.875513681467375e-06, - "loss": 1.3754, + "epoch": 0.88, + "grad_norm": 55.179683685302734, + "learning_rate": 1.4104505710580263e-05, + "loss": 1.5647, "step": 7047 }, { - "epoch": 2.12, - "grad_norm": 14.007697105407715, - "learning_rate": 5.873509070862985e-06, - "loss": 1.549, + "epoch": 0.88, + "grad_norm": 9.608619689941406, + "learning_rate": 1.4103668995523577e-05, + "loss": 0.7384, "step": 7048 }, { - "epoch": 2.12, - "grad_norm": 45.7745361328125, - "learning_rate": 5.871504460258596e-06, - "loss": 1.5497, + "epoch": 0.88, + "grad_norm": 9.80874252319336, + "learning_rate": 1.4102832280466887e-05, + "loss": 1.4948, "step": 7049 }, { - "epoch": 2.12, - "grad_norm": 13.82027530670166, - "learning_rate": 5.869499849654205e-06, - "loss": 0.9325, + "epoch": 0.88, + "grad_norm": 7.465558052062988, + "learning_rate": 1.41019955654102e-05, + "loss": 0.4291, "step": 7050 }, { - "epoch": 2.12, - "grad_norm": 28.13387107849121, - "learning_rate": 5.867495239049815e-06, - "loss": 1.5999, + "epoch": 0.88, + "grad_norm": 5.3900628089904785, + "learning_rate": 1.4101158850353513e-05, + "loss": 0.4701, "step": 7051 }, { - "epoch": 2.12, - "grad_norm": 6.187619209289551, - "learning_rate": 5.865490628445425e-06, - "loss": 0.843, + "epoch": 0.89, + "grad_norm": 14.090367317199707, + "learning_rate": 1.4100322135296826e-05, + "loss": 0.8991, "step": 7052 }, { - "epoch": 2.12, - "grad_norm": 16.584333419799805, - "learning_rate": 5.8634860178410345e-06, - "loss": 1.3149, + "epoch": 0.89, + "grad_norm": 32.191429138183594, + "learning_rate": 1.4099485420240138e-05, + "loss": 1.2037, "step": 7053 }, { - "epoch": 2.12, - "grad_norm": 36.140869140625, - "learning_rate": 5.861481407236645e-06, - "loss": 1.3137, + "epoch": 0.89, + "grad_norm": 5.541016578674316, + "learning_rate": 1.409864870518345e-05, + "loss": 0.098, "step": 7054 }, { - "epoch": 2.12, - "grad_norm": 25.52848243713379, - "learning_rate": 5.8594767966322555e-06, - "loss": 2.0762, + "epoch": 0.89, + "grad_norm": 10.426650047302246, + "learning_rate": 1.4097811990126764e-05, + "loss": 0.9164, "step": 7055 }, { - "epoch": 2.12, - "grad_norm": 36.224937438964844, - "learning_rate": 5.857472186027865e-06, - "loss": 1.4368, + "epoch": 0.89, + "grad_norm": 30.81728172302246, + "learning_rate": 1.4096975275070074e-05, + "loss": 1.9322, "step": 7056 }, { - "epoch": 2.12, - "grad_norm": 37.73203659057617, - "learning_rate": 5.855467575423475e-06, - "loss": 2.0182, + "epoch": 0.89, + "grad_norm": 29.294023513793945, + "learning_rate": 1.4096138560013388e-05, + "loss": 1.4222, "step": 7057 }, { - "epoch": 2.12, - "grad_norm": 10.022035598754883, - "learning_rate": 5.853462964819084e-06, - "loss": 0.9173, + "epoch": 0.89, + "grad_norm": 21.741195678710938, + "learning_rate": 1.4095301844956702e-05, + "loss": 1.363, "step": 7058 }, { - "epoch": 2.12, - "grad_norm": 31.495861053466797, - "learning_rate": 5.851458354214694e-06, - "loss": 1.8068, + "epoch": 0.89, + "grad_norm": 13.830698013305664, + "learning_rate": 1.4094465129900015e-05, + "loss": 1.0887, "step": 7059 }, { - "epoch": 2.12, - "grad_norm": 13.925558090209961, - "learning_rate": 5.849453743610303e-06, - "loss": 0.8992, + "epoch": 0.89, + "grad_norm": 2.4803571701049805, + "learning_rate": 1.4093628414843325e-05, + "loss": 0.0894, "step": 7060 }, { - "epoch": 2.12, - "grad_norm": 45.48153305053711, - "learning_rate": 5.8474491330059135e-06, - "loss": 2.8715, + "epoch": 0.89, + "grad_norm": 18.780452728271484, + "learning_rate": 1.4092791699786639e-05, + "loss": 1.6977, "step": 7061 }, { - "epoch": 2.12, - "grad_norm": 19.373300552368164, - "learning_rate": 5.8454445224015244e-06, - "loss": 1.1757, + "epoch": 0.89, + "grad_norm": 17.498626708984375, + "learning_rate": 1.4091954984729953e-05, + "loss": 2.0894, "step": 7062 }, { - "epoch": 2.12, - "grad_norm": 20.65883445739746, - "learning_rate": 5.8434399117971345e-06, - "loss": 2.2168, + "epoch": 0.89, + "grad_norm": 20.703948974609375, + "learning_rate": 1.4091118269673263e-05, + "loss": 2.0404, "step": 7063 }, { - "epoch": 2.12, - "grad_norm": 18.880720138549805, - "learning_rate": 5.841435301192744e-06, - "loss": 1.2185, + "epoch": 0.89, + "grad_norm": 6.733626842498779, + "learning_rate": 1.4090281554616577e-05, + "loss": 1.989, "step": 7064 }, { - "epoch": 2.12, - "grad_norm": 14.757368087768555, - "learning_rate": 5.839430690588354e-06, - "loss": 1.3524, + "epoch": 0.89, + "grad_norm": 12.881988525390625, + "learning_rate": 1.4089444839559889e-05, + "loss": 0.8604, "step": 7065 }, { - "epoch": 2.12, - "grad_norm": 14.642279624938965, - "learning_rate": 5.837426079983963e-06, - "loss": 2.0154, + "epoch": 0.89, + "grad_norm": 5.169025897979736, + "learning_rate": 1.4088608124503202e-05, + "loss": 0.5154, "step": 7066 }, { - "epoch": 2.12, - "grad_norm": 7.192394733428955, - "learning_rate": 5.835421469379573e-06, - "loss": 0.3944, + "epoch": 0.89, + "grad_norm": 15.80472469329834, + "learning_rate": 1.4087771409446514e-05, + "loss": 1.5132, "step": 7067 }, { - "epoch": 2.13, - "grad_norm": 33.72096252441406, - "learning_rate": 5.833416858775184e-06, - "loss": 1.6128, + "epoch": 0.89, + "grad_norm": 15.307950973510742, + "learning_rate": 1.4086934694389826e-05, + "loss": 1.4989, "step": 7068 }, { - "epoch": 2.13, - "grad_norm": 18.03380584716797, - "learning_rate": 5.831412248170793e-06, - "loss": 1.4521, + "epoch": 0.89, + "grad_norm": 17.048763275146484, + "learning_rate": 1.408609797933314e-05, + "loss": 1.9225, "step": 7069 }, { - "epoch": 2.13, - "grad_norm": 10.494623184204102, - "learning_rate": 5.829407637566403e-06, - "loss": 1.3512, + "epoch": 0.89, + "grad_norm": 14.885255813598633, + "learning_rate": 1.408526126427645e-05, + "loss": 1.6576, "step": 7070 }, { - "epoch": 2.13, - "grad_norm": 12.263495445251465, - "learning_rate": 5.827403026962013e-06, - "loss": 1.0091, + "epoch": 0.89, + "grad_norm": 9.93848991394043, + "learning_rate": 1.4084424549219764e-05, + "loss": 1.1178, "step": 7071 }, { - "epoch": 2.13, - "grad_norm": 10.31989860534668, - "learning_rate": 5.825398416357623e-06, - "loss": 1.4169, + "epoch": 0.89, + "grad_norm": 10.413420677185059, + "learning_rate": 1.4083587834163077e-05, + "loss": 1.8969, "step": 7072 }, { - "epoch": 2.13, - "grad_norm": 24.252532958984375, - "learning_rate": 5.823393805753233e-06, - "loss": 1.5791, + "epoch": 0.89, + "grad_norm": 12.487821578979492, + "learning_rate": 1.4082751119106391e-05, + "loss": 0.8172, "step": 7073 }, { - "epoch": 2.13, - "grad_norm": 9.647172927856445, - "learning_rate": 5.821389195148843e-06, - "loss": 0.7097, + "epoch": 0.89, + "grad_norm": 5.916619300842285, + "learning_rate": 1.4081914404049701e-05, + "loss": 0.4773, "step": 7074 }, { - "epoch": 2.13, - "grad_norm": 39.26737976074219, - "learning_rate": 5.819384584544453e-06, - "loss": 1.2926, + "epoch": 0.89, + "grad_norm": 13.073233604431152, + "learning_rate": 1.4081077688993015e-05, + "loss": 0.8163, "step": 7075 }, { - "epoch": 2.13, - "grad_norm": 18.434120178222656, - "learning_rate": 5.817379973940063e-06, - "loss": 1.684, + "epoch": 0.89, + "grad_norm": 6.288042068481445, + "learning_rate": 1.4080240973936327e-05, + "loss": 0.9977, "step": 7076 }, { - "epoch": 2.13, - "grad_norm": 23.794525146484375, - "learning_rate": 5.815375363335672e-06, - "loss": 1.4986, + "epoch": 0.89, + "grad_norm": 38.35702133178711, + "learning_rate": 1.4079404258879639e-05, + "loss": 2.5409, "step": 7077 }, { - "epoch": 2.13, - "grad_norm": 23.152494430541992, - "learning_rate": 5.813370752731282e-06, - "loss": 1.7962, + "epoch": 0.89, + "grad_norm": 23.470169067382812, + "learning_rate": 1.4078567543822953e-05, + "loss": 1.3125, "step": 7078 }, { - "epoch": 2.13, - "grad_norm": 10.018393516540527, - "learning_rate": 5.811366142126892e-06, - "loss": 0.6564, + "epoch": 0.89, + "grad_norm": 16.05512046813965, + "learning_rate": 1.4077730828766264e-05, + "loss": 2.153, "step": 7079 }, { - "epoch": 2.13, - "grad_norm": 14.856311798095703, - "learning_rate": 5.8093615315225026e-06, - "loss": 1.3591, - "step": 7080 - }, - { - "epoch": 2.13, - "eval_loss": 0.1809202879667282, - "eval_runtime": 43.5269, - "eval_samples_per_second": 33.979, - "eval_steps_per_second": 33.979, + "epoch": 0.89, + "grad_norm": 12.977189064025879, + "learning_rate": 1.4076894113709578e-05, + "loss": 2.1821, "step": 7080 }, { - "epoch": 2.13, - "grad_norm": 14.052577018737793, - "learning_rate": 5.807356920918113e-06, - "loss": 0.7851, + "epoch": 0.89, + "grad_norm": 14.016845703125, + "learning_rate": 1.4076057398652888e-05, + "loss": 3.3767, "step": 7081 }, { - "epoch": 2.13, - "grad_norm": 49.35251998901367, - "learning_rate": 5.805352310313722e-06, - "loss": 2.7815, + "epoch": 0.89, + "grad_norm": 21.714128494262695, + "learning_rate": 1.4075220683596202e-05, + "loss": 1.9269, "step": 7082 }, { - "epoch": 2.13, - "grad_norm": 48.668617248535156, - "learning_rate": 5.803347699709332e-06, - "loss": 1.4805, + "epoch": 0.89, + "grad_norm": 10.526546478271484, + "learning_rate": 1.4074383968539516e-05, + "loss": 2.438, "step": 7083 }, { - "epoch": 2.13, - "grad_norm": 9.275986671447754, - "learning_rate": 5.801343089104941e-06, - "loss": 0.7976, + "epoch": 0.89, + "grad_norm": 12.505184173583984, + "learning_rate": 1.4073547253482826e-05, + "loss": 1.74, "step": 7084 }, { - "epoch": 2.13, - "grad_norm": 50.608848571777344, - "learning_rate": 5.799338478500551e-06, - "loss": 1.9386, + "epoch": 0.89, + "grad_norm": 22.044870376586914, + "learning_rate": 1.407271053842614e-05, + "loss": 1.5427, "step": 7085 }, { - "epoch": 2.13, - "grad_norm": 18.53827476501465, - "learning_rate": 5.797333867896162e-06, - "loss": 1.2232, + "epoch": 0.89, + "grad_norm": 16.35655403137207, + "learning_rate": 1.4071873823369453e-05, + "loss": 3.4506, "step": 7086 }, { - "epoch": 2.13, - "grad_norm": 20.711294174194336, - "learning_rate": 5.7953292572917715e-06, - "loss": 2.057, + "epoch": 0.89, + "grad_norm": 26.269641876220703, + "learning_rate": 1.4071037108312767e-05, + "loss": 1.656, "step": 7087 }, { - "epoch": 2.13, - "grad_norm": 12.552864074707031, - "learning_rate": 5.7933246466873815e-06, - "loss": 0.9687, + "epoch": 0.89, + "grad_norm": 14.767579078674316, + "learning_rate": 1.4070200393256077e-05, + "loss": 2.7777, "step": 7088 }, { - "epoch": 2.13, - "grad_norm": 20.43316078186035, - "learning_rate": 5.791320036082992e-06, - "loss": 1.6212, + "epoch": 0.89, + "grad_norm": 15.188955307006836, + "learning_rate": 1.406936367819939e-05, + "loss": 1.2754, "step": 7089 }, { - "epoch": 2.13, - "grad_norm": 8.86861801147461, - "learning_rate": 5.789315425478601e-06, - "loss": 1.2358, + "epoch": 0.89, + "grad_norm": 13.957823753356934, + "learning_rate": 1.4068526963142703e-05, + "loss": 1.8468, "step": 7090 }, { - "epoch": 2.13, - "grad_norm": 17.865428924560547, - "learning_rate": 5.787310814874211e-06, - "loss": 1.8776, + "epoch": 0.89, + "grad_norm": 6.557285308837891, + "learning_rate": 1.4067690248086015e-05, + "loss": 0.657, "step": 7091 }, { - "epoch": 2.13, - "grad_norm": 10.841514587402344, - "learning_rate": 5.785306204269822e-06, - "loss": 1.0038, + "epoch": 0.89, + "grad_norm": 12.158381462097168, + "learning_rate": 1.4066853533029328e-05, + "loss": 2.1722, "step": 7092 }, { - "epoch": 2.13, - "grad_norm": 10.26215934753418, - "learning_rate": 5.783301593665431e-06, - "loss": 0.7371, + "epoch": 0.89, + "grad_norm": 7.080685615539551, + "learning_rate": 1.406601681797264e-05, + "loss": 0.5531, "step": 7093 }, { - "epoch": 2.13, - "grad_norm": 19.391332626342773, - "learning_rate": 5.781296983061041e-06, - "loss": 1.335, + "epoch": 0.89, + "grad_norm": 16.063068389892578, + "learning_rate": 1.4065180102915954e-05, + "loss": 1.5258, "step": 7094 }, { - "epoch": 2.13, - "grad_norm": 84.89678955078125, - "learning_rate": 5.7792923724566504e-06, - "loss": 3.5264, + "epoch": 0.89, + "grad_norm": 24.38238525390625, + "learning_rate": 1.4064343387859264e-05, + "loss": 0.9849, "step": 7095 }, { - "epoch": 2.13, - "grad_norm": 18.809005737304688, - "learning_rate": 5.7772877618522605e-06, - "loss": 1.2989, + "epoch": 0.89, + "grad_norm": 11.94936466217041, + "learning_rate": 1.4063506672802578e-05, + "loss": 0.9163, "step": 7096 }, { - "epoch": 2.13, - "grad_norm": 17.723434448242188, - "learning_rate": 5.775283151247871e-06, - "loss": 0.9432, + "epoch": 0.89, + "grad_norm": 13.647477149963379, + "learning_rate": 1.4062669957745892e-05, + "loss": 1.0582, "step": 7097 }, { - "epoch": 2.13, - "grad_norm": 11.459822654724121, - "learning_rate": 5.77327854064348e-06, - "loss": 0.796, + "epoch": 0.89, + "grad_norm": 17.50379180908203, + "learning_rate": 1.4061833242689202e-05, + "loss": 1.1992, "step": 7098 }, { - "epoch": 2.13, - "grad_norm": 10.62411880493164, - "learning_rate": 5.771273930039091e-06, - "loss": 2.0095, + "epoch": 0.89, + "grad_norm": 23.42878532409668, + "learning_rate": 1.4060996527632515e-05, + "loss": 1.5985, "step": 7099 }, { - "epoch": 2.13, - "grad_norm": 17.644634246826172, - "learning_rate": 5.769269319434701e-06, - "loss": 1.3346, + "epoch": 0.89, + "grad_norm": 14.225120544433594, + "learning_rate": 1.4060159812575829e-05, + "loss": 1.7414, "step": 7100 }, { - "epoch": 2.13, - "grad_norm": 16.67945671081543, - "learning_rate": 5.76726470883031e-06, - "loss": 1.6617, + "epoch": 0.89, + "grad_norm": 16.80451011657715, + "learning_rate": 1.4059323097519143e-05, + "loss": 1.5058, "step": 7101 }, { - "epoch": 2.14, - "grad_norm": 12.336973190307617, - "learning_rate": 5.76526009822592e-06, - "loss": 1.2088, + "epoch": 0.89, + "grad_norm": 13.066004753112793, + "learning_rate": 1.4058486382462453e-05, + "loss": 2.9879, "step": 7102 }, { - "epoch": 2.14, - "grad_norm": 43.285831451416016, - "learning_rate": 5.763255487621529e-06, - "loss": 1.8144, + "epoch": 0.89, + "grad_norm": 11.759817123413086, + "learning_rate": 1.4057649667405767e-05, + "loss": 2.7269, "step": 7103 }, { - "epoch": 2.14, - "grad_norm": 95.72914123535156, - "learning_rate": 5.7612508770171395e-06, - "loss": 1.4038, + "epoch": 0.89, + "grad_norm": 8.385038375854492, + "learning_rate": 1.4056812952349079e-05, + "loss": 1.5329, "step": 7104 }, { - "epoch": 2.14, - "grad_norm": 37.060298919677734, - "learning_rate": 5.7592462664127504e-06, - "loss": 1.6265, + "epoch": 0.89, + "grad_norm": 5.424639701843262, + "learning_rate": 1.405597623729239e-05, + "loss": 0.2796, "step": 7105 }, { - "epoch": 2.14, - "grad_norm": 45.99290466308594, - "learning_rate": 5.75724165580836e-06, - "loss": 1.7691, + "epoch": 0.89, + "grad_norm": 7.860196113586426, + "learning_rate": 1.4055139522235704e-05, + "loss": 0.5247, "step": 7106 }, { - "epoch": 2.14, - "grad_norm": 15.260518074035645, - "learning_rate": 5.75523704520397e-06, - "loss": 1.7976, + "epoch": 0.89, + "grad_norm": 8.838438987731934, + "learning_rate": 1.4054302807179016e-05, + "loss": 1.2771, "step": 7107 }, { - "epoch": 2.14, - "grad_norm": 18.973817825317383, - "learning_rate": 5.753232434599579e-06, - "loss": 1.355, + "epoch": 0.89, + "grad_norm": 17.758642196655273, + "learning_rate": 1.405346609212233e-05, + "loss": 2.9798, "step": 7108 }, { - "epoch": 2.14, - "grad_norm": 16.729829788208008, - "learning_rate": 5.751227823995189e-06, - "loss": 1.2276, + "epoch": 0.89, + "grad_norm": 8.714234352111816, + "learning_rate": 1.405262937706564e-05, + "loss": 0.5166, "step": 7109 }, { - "epoch": 2.14, - "grad_norm": 15.583281517028809, - "learning_rate": 5.749223213390799e-06, - "loss": 1.3231, + "epoch": 0.89, + "grad_norm": 13.17325210571289, + "learning_rate": 1.4051792662008954e-05, + "loss": 1.4872, "step": 7110 }, { - "epoch": 2.14, - "grad_norm": 9.61965274810791, - "learning_rate": 5.747218602786409e-06, - "loss": 0.6578, + "epoch": 0.89, + "grad_norm": 15.605276107788086, + "learning_rate": 1.4050955946952267e-05, + "loss": 2.4161, "step": 7111 }, { - "epoch": 2.14, - "grad_norm": 14.255253791809082, - "learning_rate": 5.745213992182019e-06, - "loss": 1.0521, + "epoch": 0.89, + "grad_norm": 16.567859649658203, + "learning_rate": 1.4050119231895578e-05, + "loss": 0.7589, "step": 7112 }, { - "epoch": 2.14, - "grad_norm": 28.25699234008789, - "learning_rate": 5.743209381577629e-06, - "loss": 1.7292, + "epoch": 0.89, + "grad_norm": 6.031264781951904, + "learning_rate": 1.4049282516838891e-05, + "loss": 0.5424, "step": 7113 }, { - "epoch": 2.14, - "grad_norm": 24.005878448486328, - "learning_rate": 5.741204770973239e-06, - "loss": 1.3648, + "epoch": 0.89, + "grad_norm": 12.151751518249512, + "learning_rate": 1.4048445801782205e-05, + "loss": 1.4286, "step": 7114 }, { - "epoch": 2.14, - "grad_norm": 55.37303924560547, - "learning_rate": 5.739200160368849e-06, - "loss": 1.104, + "epoch": 0.89, + "grad_norm": 14.095078468322754, + "learning_rate": 1.4047609086725517e-05, + "loss": 1.2149, "step": 7115 }, { - "epoch": 2.14, - "grad_norm": 18.499359130859375, - "learning_rate": 5.737195549764458e-06, - "loss": 1.4972, + "epoch": 0.89, + "grad_norm": 15.963891983032227, + "learning_rate": 1.4046772371668829e-05, + "loss": 2.6274, "step": 7116 }, { - "epoch": 2.14, - "grad_norm": 14.160520553588867, - "learning_rate": 5.735190939160069e-06, - "loss": 1.2106, + "epoch": 0.89, + "grad_norm": 7.442634582519531, + "learning_rate": 1.4045935656612142e-05, + "loss": 0.7399, "step": 7117 }, { - "epoch": 2.14, - "grad_norm": 12.180475234985352, - "learning_rate": 5.733186328555679e-06, - "loss": 1.4281, + "epoch": 0.89, + "grad_norm": 25.207595825195312, + "learning_rate": 1.4045098941555454e-05, + "loss": 2.1868, "step": 7118 }, { - "epoch": 2.14, - "grad_norm": 10.075895309448242, - "learning_rate": 5.731181717951288e-06, - "loss": 0.811, + "epoch": 0.89, + "grad_norm": 26.329158782958984, + "learning_rate": 1.4044262226498766e-05, + "loss": 1.7922, "step": 7119 }, { - "epoch": 2.14, - "grad_norm": 62.43848419189453, - "learning_rate": 5.729177107346898e-06, - "loss": 2.8725, + "epoch": 0.89, + "grad_norm": 8.771462440490723, + "learning_rate": 1.404342551144208e-05, + "loss": 1.5612, "step": 7120 }, { - "epoch": 2.14, - "grad_norm": 23.31486701965332, - "learning_rate": 5.7271724967425076e-06, - "loss": 2.8676, + "epoch": 0.89, + "grad_norm": 2.6278157234191895, + "learning_rate": 1.4042588796385392e-05, + "loss": 0.0416, "step": 7121 }, { - "epoch": 2.14, - "grad_norm": 13.410110473632812, - "learning_rate": 5.725167886138118e-06, - "loss": 1.0656, + "epoch": 0.89, + "grad_norm": 6.591731071472168, + "learning_rate": 1.4041752081328706e-05, + "loss": 0.9422, "step": 7122 }, { - "epoch": 2.14, - "grad_norm": 25.776451110839844, - "learning_rate": 5.7231632755337286e-06, - "loss": 1.9623, + "epoch": 0.89, + "grad_norm": 7.92766809463501, + "learning_rate": 1.4040915366272016e-05, + "loss": 1.6853, "step": 7123 }, { - "epoch": 2.14, - "grad_norm": 19.853778839111328, - "learning_rate": 5.721158664929339e-06, - "loss": 1.2182, + "epoch": 0.89, + "grad_norm": 6.853774547576904, + "learning_rate": 1.404007865121533e-05, + "loss": 0.7023, "step": 7124 }, { - "epoch": 2.14, - "grad_norm": 30.225332260131836, - "learning_rate": 5.719154054324948e-06, - "loss": 1.4418, + "epoch": 0.89, + "grad_norm": 8.716609954833984, + "learning_rate": 1.4039241936158643e-05, + "loss": 0.6673, "step": 7125 }, { - "epoch": 2.14, - "grad_norm": 9.551996231079102, - "learning_rate": 5.717149443720558e-06, - "loss": 1.1343, + "epoch": 0.89, + "grad_norm": 21.26909637451172, + "learning_rate": 1.4038405221101953e-05, + "loss": 1.9829, "step": 7126 }, { - "epoch": 2.14, - "grad_norm": 33.873294830322266, - "learning_rate": 5.715144833116167e-06, - "loss": 1.9743, + "epoch": 0.89, + "grad_norm": 8.954400062561035, + "learning_rate": 1.4037568506045267e-05, + "loss": 1.3099, "step": 7127 }, { - "epoch": 2.14, - "grad_norm": 11.439074516296387, - "learning_rate": 5.713140222511777e-06, - "loss": 1.1761, + "epoch": 0.89, + "grad_norm": 17.29623794555664, + "learning_rate": 1.403673179098858e-05, + "loss": 2.3307, "step": 7128 }, { - "epoch": 2.14, - "grad_norm": 11.673768043518066, - "learning_rate": 5.711135611907388e-06, - "loss": 1.4744, + "epoch": 0.89, + "grad_norm": 17.295703887939453, + "learning_rate": 1.4035895075931893e-05, + "loss": 2.1599, "step": 7129 }, { - "epoch": 2.14, - "grad_norm": 22.201738357543945, - "learning_rate": 5.7091310013029975e-06, - "loss": 2.8592, + "epoch": 0.89, + "grad_norm": 24.35731315612793, + "learning_rate": 1.4035058360875205e-05, + "loss": 2.1767, "step": 7130 }, { - "epoch": 2.14, - "grad_norm": 11.81099796295166, - "learning_rate": 5.7071263906986075e-06, - "loss": 1.2218, + "epoch": 0.89, + "grad_norm": 20.482467651367188, + "learning_rate": 1.4034221645818518e-05, + "loss": 1.7421, "step": 7131 }, { - "epoch": 2.14, - "grad_norm": 26.6683406829834, - "learning_rate": 5.705121780094217e-06, - "loss": 1.2941, + "epoch": 0.9, + "grad_norm": 27.635377883911133, + "learning_rate": 1.403338493076183e-05, + "loss": 1.744, "step": 7132 }, { - "epoch": 2.14, - "grad_norm": 8.380386352539062, - "learning_rate": 5.703117169489827e-06, - "loss": 0.6369, + "epoch": 0.9, + "grad_norm": 10.110763549804688, + "learning_rate": 1.4032548215705142e-05, + "loss": 0.9675, "step": 7133 }, { - "epoch": 2.14, - "grad_norm": 11.767404556274414, - "learning_rate": 5.701112558885437e-06, - "loss": 1.0829, + "epoch": 0.9, + "grad_norm": 13.34918212890625, + "learning_rate": 1.4031711500648454e-05, + "loss": 2.1284, "step": 7134 }, { - "epoch": 2.15, - "grad_norm": 25.861289978027344, - "learning_rate": 5.699107948281047e-06, - "loss": 1.9822, + "epoch": 0.9, + "grad_norm": 5.477113246917725, + "learning_rate": 1.4030874785591768e-05, + "loss": 0.4138, "step": 7135 }, { - "epoch": 2.15, - "grad_norm": 27.93609619140625, - "learning_rate": 5.697103337676657e-06, - "loss": 1.3616, + "epoch": 0.9, + "grad_norm": 6.456885814666748, + "learning_rate": 1.4030038070535081e-05, + "loss": 0.3358, "step": 7136 }, { - "epoch": 2.15, - "grad_norm": 87.64994812011719, - "learning_rate": 5.695098727072267e-06, - "loss": 2.3206, + "epoch": 0.9, + "grad_norm": 10.445297241210938, + "learning_rate": 1.4029201355478392e-05, + "loss": 1.6127, "step": 7137 }, { - "epoch": 2.15, - "grad_norm": 11.817543029785156, - "learning_rate": 5.6930941164678764e-06, - "loss": 1.2847, + "epoch": 0.9, + "grad_norm": 13.392292976379395, + "learning_rate": 1.4028364640421705e-05, + "loss": 1.4171, "step": 7138 }, { - "epoch": 2.15, - "grad_norm": 21.869726181030273, - "learning_rate": 5.6910895058634865e-06, - "loss": 1.2728, + "epoch": 0.9, + "grad_norm": 13.771772384643555, + "learning_rate": 1.4027527925365019e-05, + "loss": 1.5265, "step": 7139 }, { - "epoch": 2.15, - "grad_norm": 12.761932373046875, - "learning_rate": 5.689084895259096e-06, - "loss": 1.5109, + "epoch": 0.9, + "grad_norm": 20.047332763671875, + "learning_rate": 1.402669121030833e-05, + "loss": 2.2711, "step": 7140 }, { - "epoch": 2.15, - "grad_norm": 10.641894340515137, - "learning_rate": 5.687080284654706e-06, - "loss": 0.7452, + "epoch": 0.9, + "grad_norm": 21.45895004272461, + "learning_rate": 1.4025854495251643e-05, + "loss": 3.7054, "step": 7141 }, { - "epoch": 2.15, - "grad_norm": 17.009889602661133, - "learning_rate": 5.685075674050317e-06, - "loss": 1.8117, + "epoch": 0.9, + "grad_norm": 11.379473686218262, + "learning_rate": 1.4025017780194957e-05, + "loss": 2.3151, "step": 7142 }, { - "epoch": 2.15, - "grad_norm": 24.577238082885742, - "learning_rate": 5.683071063445926e-06, - "loss": 1.1146, + "epoch": 0.9, + "grad_norm": 75.81961822509766, + "learning_rate": 1.4024181065138269e-05, + "loss": 1.9507, "step": 7143 }, { - "epoch": 2.15, - "grad_norm": 35.608741760253906, - "learning_rate": 5.681066452841536e-06, - "loss": 2.1795, + "epoch": 0.9, + "grad_norm": 17.682466506958008, + "learning_rate": 1.402334435008158e-05, + "loss": 0.8294, "step": 7144 }, { - "epoch": 2.15, - "grad_norm": 11.501810073852539, - "learning_rate": 5.679061842237145e-06, - "loss": 0.9263, + "epoch": 0.9, + "grad_norm": 8.107851028442383, + "learning_rate": 1.4022507635024894e-05, + "loss": 0.8469, "step": 7145 }, { - "epoch": 2.15, - "grad_norm": 17.368040084838867, - "learning_rate": 5.6770572316327554e-06, - "loss": 1.4102, + "epoch": 0.9, + "grad_norm": 28.90208625793457, + "learning_rate": 1.4021670919968206e-05, + "loss": 1.9015, "step": 7146 }, { - "epoch": 2.15, - "grad_norm": 16.95510482788086, - "learning_rate": 5.6750526210283655e-06, - "loss": 1.601, + "epoch": 0.9, + "grad_norm": 14.991819381713867, + "learning_rate": 1.4020834204911518e-05, + "loss": 2.5976, "step": 7147 }, { - "epoch": 2.15, - "grad_norm": 8.293194770812988, - "learning_rate": 5.673048010423976e-06, - "loss": 0.8742, + "epoch": 0.9, + "grad_norm": 13.504834175109863, + "learning_rate": 1.401999748985483e-05, + "loss": 1.9669, "step": 7148 }, { - "epoch": 2.15, - "grad_norm": 6.959547519683838, - "learning_rate": 5.671043399819586e-06, - "loss": 0.4787, + "epoch": 0.9, + "grad_norm": 14.985677719116211, + "learning_rate": 1.4019160774798144e-05, + "loss": 1.2278, "step": 7149 }, { - "epoch": 2.15, - "grad_norm": 15.539986610412598, - "learning_rate": 5.669038789215196e-06, - "loss": 0.8455, + "epoch": 0.9, + "grad_norm": 31.02518081665039, + "learning_rate": 1.4018324059741457e-05, + "loss": 1.9601, "step": 7150 }, { - "epoch": 2.15, - "grad_norm": 11.356565475463867, - "learning_rate": 5.667034178610805e-06, - "loss": 1.1054, + "epoch": 0.9, + "grad_norm": 13.735119819641113, + "learning_rate": 1.4017487344684768e-05, + "loss": 1.2967, "step": 7151 }, { - "epoch": 2.15, - "grad_norm": 25.823801040649414, - "learning_rate": 5.665029568006415e-06, - "loss": 1.7274, + "epoch": 0.9, + "grad_norm": 7.824489116668701, + "learning_rate": 1.4016650629628081e-05, + "loss": 0.6478, "step": 7152 }, { - "epoch": 2.15, - "grad_norm": 43.120635986328125, - "learning_rate": 5.663024957402024e-06, - "loss": 1.6869, + "epoch": 0.9, + "grad_norm": 11.406153678894043, + "learning_rate": 1.4015813914571395e-05, + "loss": 1.9932, "step": 7153 }, { - "epoch": 2.15, - "grad_norm": 8.684465408325195, - "learning_rate": 5.661020346797635e-06, - "loss": 1.2637, + "epoch": 0.9, + "grad_norm": 10.797396659851074, + "learning_rate": 1.4014977199514705e-05, + "loss": 1.707, "step": 7154 }, { - "epoch": 2.15, - "grad_norm": 23.894262313842773, - "learning_rate": 5.659015736193245e-06, - "loss": 1.4771, + "epoch": 0.9, + "grad_norm": 34.46938705444336, + "learning_rate": 1.4014140484458019e-05, + "loss": 2.7764, "step": 7155 }, { - "epoch": 2.15, - "grad_norm": 26.122549057006836, - "learning_rate": 5.6570111255888546e-06, - "loss": 1.3112, + "epoch": 0.9, + "grad_norm": 59.19340896606445, + "learning_rate": 1.4013303769401332e-05, + "loss": 1.8641, "step": 7156 }, { - "epoch": 2.15, - "grad_norm": 20.096174240112305, - "learning_rate": 5.655006514984465e-06, - "loss": 1.0569, + "epoch": 0.9, + "grad_norm": 12.268207550048828, + "learning_rate": 1.4012467054344644e-05, + "loss": 0.8285, "step": 7157 }, { - "epoch": 2.15, - "grad_norm": 36.696006774902344, - "learning_rate": 5.653001904380075e-06, - "loss": 1.7493, + "epoch": 0.9, + "grad_norm": 4.307581424713135, + "learning_rate": 1.4011630339287956e-05, + "loss": 0.4835, "step": 7158 }, { - "epoch": 2.15, - "grad_norm": 28.245203018188477, - "learning_rate": 5.650997293775684e-06, - "loss": 1.5218, + "epoch": 0.9, + "grad_norm": 18.47773551940918, + "learning_rate": 1.401079362423127e-05, + "loss": 1.4313, "step": 7159 }, { - "epoch": 2.15, - "grad_norm": 27.659273147583008, - "learning_rate": 5.648992683171295e-06, - "loss": 2.6867, + "epoch": 0.9, + "grad_norm": 4.660049915313721, + "learning_rate": 1.4009956909174582e-05, + "loss": 1.4403, "step": 7160 }, { - "epoch": 2.15, - "grad_norm": 47.04728317260742, - "learning_rate": 5.646988072566905e-06, - "loss": 1.1954, + "epoch": 0.9, + "grad_norm": 13.854349136352539, + "learning_rate": 1.4009120194117894e-05, + "loss": 1.8415, "step": 7161 }, { - "epoch": 2.15, - "grad_norm": 17.212331771850586, - "learning_rate": 5.644983461962514e-06, - "loss": 2.1451, + "epoch": 0.9, + "grad_norm": 13.846010208129883, + "learning_rate": 1.4008283479061206e-05, + "loss": 1.4329, "step": 7162 }, { - "epoch": 2.15, - "grad_norm": 12.564343452453613, - "learning_rate": 5.642978851358124e-06, - "loss": 1.1914, + "epoch": 0.9, + "grad_norm": 35.755775451660156, + "learning_rate": 1.400744676400452e-05, + "loss": 1.4303, "step": 7163 }, { - "epoch": 2.15, - "grad_norm": 38.6114501953125, - "learning_rate": 5.6409742407537336e-06, - "loss": 0.9675, + "epoch": 0.9, + "grad_norm": 77.38518524169922, + "learning_rate": 1.4006610048947833e-05, + "loss": 1.8845, "step": 7164 }, { - "epoch": 2.15, - "grad_norm": 49.109031677246094, - "learning_rate": 5.638969630149344e-06, - "loss": 1.4241, + "epoch": 0.9, + "grad_norm": 8.316797256469727, + "learning_rate": 1.4005773333891143e-05, + "loss": 0.9716, "step": 7165 }, { - "epoch": 2.15, - "grad_norm": 17.366100311279297, - "learning_rate": 5.6369650195449546e-06, - "loss": 1.9756, + "epoch": 0.9, + "grad_norm": 17.104446411132812, + "learning_rate": 1.4004936618834457e-05, + "loss": 0.42, "step": 7166 }, { - "epoch": 2.15, - "grad_norm": 79.63975524902344, - "learning_rate": 5.634960408940564e-06, - "loss": 1.9999, + "epoch": 0.9, + "grad_norm": 24.48944664001465, + "learning_rate": 1.400409990377777e-05, + "loss": 1.9745, "step": 7167 }, { - "epoch": 2.16, - "grad_norm": 28.03964614868164, - "learning_rate": 5.632955798336174e-06, - "loss": 1.4169, + "epoch": 0.9, + "grad_norm": 10.361266136169434, + "learning_rate": 1.4003263188721081e-05, + "loss": 2.0676, "step": 7168 }, { - "epoch": 2.16, - "grad_norm": 23.203266143798828, - "learning_rate": 5.630951187731783e-06, - "loss": 1.3344, + "epoch": 0.9, + "grad_norm": 13.4504976272583, + "learning_rate": 1.4002426473664395e-05, + "loss": 1.0771, "step": 7169 }, { - "epoch": 2.16, - "grad_norm": 12.224784851074219, - "learning_rate": 5.628946577127393e-06, - "loss": 1.4286, + "epoch": 0.9, + "grad_norm": 18.236623764038086, + "learning_rate": 1.4001589758607708e-05, + "loss": 0.8684, "step": 7170 }, { - "epoch": 2.16, - "grad_norm": 9.047876358032227, - "learning_rate": 5.626941966523003e-06, - "loss": 1.1898, + "epoch": 0.9, + "grad_norm": 32.698753356933594, + "learning_rate": 1.400075304355102e-05, + "loss": 1.0423, "step": 7171 }, { - "epoch": 2.16, - "grad_norm": 22.1667423248291, - "learning_rate": 5.624937355918613e-06, - "loss": 1.945, + "epoch": 0.9, + "grad_norm": 7.705559253692627, + "learning_rate": 1.3999916328494332e-05, + "loss": 0.4881, "step": 7172 }, { - "epoch": 2.16, - "grad_norm": 13.911121368408203, - "learning_rate": 5.6229327453142235e-06, - "loss": 1.3742, + "epoch": 0.9, + "grad_norm": 8.77570629119873, + "learning_rate": 1.3999079613437646e-05, + "loss": 0.7506, "step": 7173 }, { - "epoch": 2.16, - "grad_norm": 9.241662979125977, - "learning_rate": 5.6209281347098335e-06, - "loss": 1.0261, + "epoch": 0.9, + "grad_norm": 11.598416328430176, + "learning_rate": 1.3998242898380958e-05, + "loss": 0.2903, "step": 7174 }, { - "epoch": 2.16, - "grad_norm": 13.640268325805664, - "learning_rate": 5.618923524105443e-06, - "loss": 1.1987, + "epoch": 0.9, + "grad_norm": 18.047786712646484, + "learning_rate": 1.399740618332427e-05, + "loss": 1.9002, "step": 7175 }, { - "epoch": 2.16, - "grad_norm": 21.423112869262695, - "learning_rate": 5.616918913501053e-06, - "loss": 1.722, + "epoch": 0.9, + "grad_norm": 15.605576515197754, + "learning_rate": 1.3996569468267582e-05, + "loss": 2.7821, "step": 7176 }, { - "epoch": 2.16, - "grad_norm": 33.87396240234375, - "learning_rate": 5.614914302896662e-06, - "loss": 0.9787, + "epoch": 0.9, + "grad_norm": 8.118613243103027, + "learning_rate": 1.3995732753210895e-05, + "loss": 1.4705, "step": 7177 }, { - "epoch": 2.16, - "grad_norm": 29.940101623535156, - "learning_rate": 5.612909692292273e-06, - "loss": 1.792, + "epoch": 0.9, + "grad_norm": 32.597618103027344, + "learning_rate": 1.3994896038154209e-05, + "loss": 2.3296, "step": 7178 }, { - "epoch": 2.16, - "grad_norm": 22.763090133666992, - "learning_rate": 5.610905081687883e-06, - "loss": 1.3948, + "epoch": 0.9, + "grad_norm": 11.029437065124512, + "learning_rate": 1.399405932309752e-05, + "loss": 2.8688, "step": 7179 }, { - "epoch": 2.16, - "grad_norm": 33.94498062133789, - "learning_rate": 5.608900471083492e-06, - "loss": 2.2237, + "epoch": 0.9, + "grad_norm": 4.349131107330322, + "learning_rate": 1.3993222608040833e-05, + "loss": 0.579, "step": 7180 }, { - "epoch": 2.16, - "grad_norm": 15.50265884399414, - "learning_rate": 5.6068958604791024e-06, - "loss": 1.5329, + "epoch": 0.9, + "grad_norm": 18.68450164794922, + "learning_rate": 1.3992385892984147e-05, + "loss": 2.8652, "step": 7181 }, { - "epoch": 2.16, - "grad_norm": 20.567745208740234, - "learning_rate": 5.6048912498747125e-06, - "loss": 1.7991, + "epoch": 0.9, + "grad_norm": 14.697657585144043, + "learning_rate": 1.3991549177927457e-05, + "loss": 0.5772, "step": 7182 }, { - "epoch": 2.16, - "grad_norm": 38.924346923828125, - "learning_rate": 5.602886639270322e-06, - "loss": 1.8058, + "epoch": 0.9, + "grad_norm": 3.4043476581573486, + "learning_rate": 1.399071246287077e-05, + "loss": 0.1265, "step": 7183 }, { - "epoch": 2.16, - "grad_norm": 23.95490264892578, - "learning_rate": 5.600882028665932e-06, - "loss": 0.9036, + "epoch": 0.9, + "grad_norm": 18.97923469543457, + "learning_rate": 1.3989875747814084e-05, + "loss": 0.9864, "step": 7184 }, { - "epoch": 2.16, - "grad_norm": 8.675360679626465, - "learning_rate": 5.598877418061543e-06, - "loss": 1.1445, + "epoch": 0.9, + "grad_norm": 13.734219551086426, + "learning_rate": 1.3989039032757396e-05, + "loss": 1.0469, "step": 7185 }, { - "epoch": 2.16, - "grad_norm": 52.8515510559082, - "learning_rate": 5.596872807457152e-06, - "loss": 0.9069, + "epoch": 0.9, + "grad_norm": 9.58630084991455, + "learning_rate": 1.3988202317700708e-05, + "loss": 0.1549, "step": 7186 }, { - "epoch": 2.16, - "grad_norm": 14.209603309631348, - "learning_rate": 5.594868196852762e-06, - "loss": 1.1668, + "epoch": 0.9, + "grad_norm": 13.013339042663574, + "learning_rate": 1.398736560264402e-05, + "loss": 1.3083, "step": 7187 }, { - "epoch": 2.16, - "grad_norm": 19.132238388061523, - "learning_rate": 5.592863586248371e-06, - "loss": 1.6934, + "epoch": 0.9, + "grad_norm": 20.65224266052246, + "learning_rate": 1.3986528887587334e-05, + "loss": 1.9961, "step": 7188 }, { - "epoch": 2.16, - "grad_norm": 16.72909927368164, - "learning_rate": 5.5908589756439814e-06, - "loss": 1.127, + "epoch": 0.9, + "grad_norm": 58.49673080444336, + "learning_rate": 1.3985692172530646e-05, + "loss": 2.1291, "step": 7189 }, { - "epoch": 2.16, - "grad_norm": 11.741925239562988, - "learning_rate": 5.588854365039591e-06, - "loss": 1.9914, + "epoch": 0.9, + "grad_norm": 21.04233741760254, + "learning_rate": 1.3984855457473958e-05, + "loss": 3.0153, "step": 7190 }, { - "epoch": 2.16, - "grad_norm": 34.14512634277344, - "learning_rate": 5.586849754435202e-06, - "loss": 2.0267, + "epoch": 0.9, + "grad_norm": 56.784305572509766, + "learning_rate": 1.3984018742417271e-05, + "loss": 1.2907, "step": 7191 }, { - "epoch": 2.16, - "grad_norm": 15.906356811523438, - "learning_rate": 5.584845143830812e-06, - "loss": 1.2969, + "epoch": 0.9, + "grad_norm": 18.404029846191406, + "learning_rate": 1.3983182027360585e-05, + "loss": 1.9775, "step": 7192 }, { - "epoch": 2.16, - "grad_norm": 14.884824752807617, - "learning_rate": 5.582840533226421e-06, - "loss": 1.6604, + "epoch": 0.9, + "grad_norm": 16.816679000854492, + "learning_rate": 1.3982345312303895e-05, + "loss": 1.1258, "step": 7193 }, { - "epoch": 2.16, - "grad_norm": 14.613624572753906, - "learning_rate": 5.580835922622031e-06, - "loss": 1.3751, + "epoch": 0.9, + "grad_norm": 4.080275535583496, + "learning_rate": 1.3981508597247209e-05, + "loss": 0.1642, "step": 7194 }, { - "epoch": 2.16, - "grad_norm": 14.830942153930664, - "learning_rate": 5.578831312017641e-06, - "loss": 1.5889, + "epoch": 0.9, + "grad_norm": 10.518272399902344, + "learning_rate": 1.3980671882190522e-05, + "loss": 2.8347, "step": 7195 }, { - "epoch": 2.16, - "grad_norm": 17.539030075073242, - "learning_rate": 5.57682670141325e-06, - "loss": 1.0911, + "epoch": 0.9, + "grad_norm": 14.956340789794922, + "learning_rate": 1.3979835167133833e-05, + "loss": 1.6647, "step": 7196 }, { - "epoch": 2.16, - "grad_norm": 28.30144500732422, - "learning_rate": 5.574822090808861e-06, - "loss": 1.2038, + "epoch": 0.9, + "grad_norm": 9.525736808776855, + "learning_rate": 1.3978998452077146e-05, + "loss": 1.8221, "step": 7197 }, { - "epoch": 2.16, - "grad_norm": 11.865046501159668, - "learning_rate": 5.572817480204471e-06, - "loss": 1.2387, + "epoch": 0.9, + "grad_norm": 47.34003829956055, + "learning_rate": 1.397816173702046e-05, + "loss": 2.235, "step": 7198 }, { - "epoch": 2.16, - "grad_norm": 29.797569274902344, - "learning_rate": 5.5708128696000806e-06, - "loss": 2.5909, + "epoch": 0.9, + "grad_norm": 12.209076881408691, + "learning_rate": 1.3977325021963772e-05, + "loss": 1.2154, "step": 7199 }, { - "epoch": 2.16, - "grad_norm": 14.200777053833008, - "learning_rate": 5.568808258995691e-06, - "loss": 1.5443, + "epoch": 0.9, + "grad_norm": 15.106338500976562, + "learning_rate": 1.3976488306907084e-05, + "loss": 2.5594, "step": 7200 }, { - "epoch": 2.16, - "eval_loss": 0.1803688108921051, - "eval_runtime": 43.5403, - "eval_samples_per_second": 33.969, - "eval_steps_per_second": 33.969, + "epoch": 0.9, + "eval_loss": 0.09446447342634201, + "eval_runtime": 156.8041, + "eval_samples_per_second": 22.589, + "eval_steps_per_second": 22.589, "step": 7200 }, { - "epoch": 2.17, - "grad_norm": 8.243300437927246, - "learning_rate": 5.5668036483913e-06, - "loss": 1.2393, + "epoch": 0.9, + "grad_norm": 25.125381469726562, + "learning_rate": 1.3975651591850396e-05, + "loss": 3.3564, "step": 7201 }, { - "epoch": 2.17, - "grad_norm": 29.610637664794922, - "learning_rate": 5.56479903778691e-06, - "loss": 0.9743, + "epoch": 0.9, + "grad_norm": 9.072014808654785, + "learning_rate": 1.397481487679371e-05, + "loss": 1.421, "step": 7202 }, { - "epoch": 2.17, - "grad_norm": 30.562345504760742, - "learning_rate": 5.562794427182521e-06, - "loss": 1.5775, + "epoch": 0.9, + "grad_norm": 4.650312900543213, + "learning_rate": 1.3973978161737021e-05, + "loss": 0.3649, "step": 7203 }, { - "epoch": 2.17, - "grad_norm": 13.141053199768066, - "learning_rate": 5.56078981657813e-06, - "loss": 1.026, + "epoch": 0.9, + "grad_norm": 9.013452529907227, + "learning_rate": 1.3973141446680333e-05, + "loss": 0.7415, "step": 7204 }, { - "epoch": 2.17, - "grad_norm": 15.844094276428223, - "learning_rate": 5.55878520597374e-06, - "loss": 1.8327, + "epoch": 0.9, + "grad_norm": 57.821529388427734, + "learning_rate": 1.3972304731623647e-05, + "loss": 2.823, "step": 7205 }, { - "epoch": 2.17, - "grad_norm": 7.878782272338867, - "learning_rate": 5.5567805953693495e-06, - "loss": 1.0848, + "epoch": 0.9, + "grad_norm": 9.371427536010742, + "learning_rate": 1.397146801656696e-05, + "loss": 1.2171, "step": 7206 }, { - "epoch": 2.17, - "grad_norm": 12.858576774597168, - "learning_rate": 5.5547759847649596e-06, - "loss": 1.5242, + "epoch": 0.9, + "grad_norm": 18.87268829345703, + "learning_rate": 1.3970631301510271e-05, + "loss": 1.9732, "step": 7207 }, { - "epoch": 2.17, - "grad_norm": 11.919968605041504, - "learning_rate": 5.55277137416057e-06, - "loss": 1.6723, + "epoch": 0.9, + "grad_norm": 27.41904067993164, + "learning_rate": 1.3969794586453585e-05, + "loss": 2.1656, "step": 7208 }, { - "epoch": 2.17, - "grad_norm": 11.811442375183105, - "learning_rate": 5.5507667635561806e-06, - "loss": 1.1354, + "epoch": 0.9, + "grad_norm": 17.43402862548828, + "learning_rate": 1.3968957871396898e-05, + "loss": 1.4941, "step": 7209 }, { - "epoch": 2.17, - "grad_norm": 32.790809631347656, - "learning_rate": 5.54876215295179e-06, - "loss": 1.136, + "epoch": 0.9, + "grad_norm": 11.491718292236328, + "learning_rate": 1.3968121156340208e-05, + "loss": 1.061, "step": 7210 }, { - "epoch": 2.17, - "grad_norm": 29.143482208251953, - "learning_rate": 5.5467575423474e-06, - "loss": 1.3966, + "epoch": 0.9, + "grad_norm": 23.266481399536133, + "learning_rate": 1.3967284441283522e-05, + "loss": 1.5969, "step": 7211 }, { - "epoch": 2.17, - "grad_norm": 24.455890655517578, - "learning_rate": 5.544752931743009e-06, - "loss": 2.0205, + "epoch": 0.91, + "grad_norm": 14.703265190124512, + "learning_rate": 1.3966447726226836e-05, + "loss": 2.4698, "step": 7212 }, { - "epoch": 2.17, - "grad_norm": 23.328998565673828, - "learning_rate": 5.542748321138619e-06, - "loss": 1.6471, + "epoch": 0.91, + "grad_norm": 133.14865112304688, + "learning_rate": 1.3965611011170148e-05, + "loss": 2.1416, "step": 7213 }, { - "epoch": 2.17, - "grad_norm": 26.35028076171875, - "learning_rate": 5.5407437105342285e-06, - "loss": 1.5245, + "epoch": 0.91, + "grad_norm": 4.905088424682617, + "learning_rate": 1.396477429611346e-05, + "loss": 0.1839, "step": 7214 }, { - "epoch": 2.17, - "grad_norm": 23.13437843322754, - "learning_rate": 5.538739099929839e-06, - "loss": 1.7795, + "epoch": 0.91, + "grad_norm": 8.868611335754395, + "learning_rate": 1.3963937581056772e-05, + "loss": 3.0066, "step": 7215 }, { - "epoch": 2.17, - "grad_norm": 16.176660537719727, - "learning_rate": 5.5367344893254495e-06, - "loss": 1.5189, + "epoch": 0.91, + "grad_norm": 36.57992935180664, + "learning_rate": 1.3963100866000085e-05, + "loss": 2.9071, "step": 7216 }, { - "epoch": 2.17, - "grad_norm": 16.43119239807129, - "learning_rate": 5.534729878721059e-06, - "loss": 1.0649, + "epoch": 0.91, + "grad_norm": 27.707067489624023, + "learning_rate": 1.3962264150943397e-05, + "loss": 1.2687, "step": 7217 }, { - "epoch": 2.17, - "grad_norm": 16.297130584716797, - "learning_rate": 5.532725268116669e-06, - "loss": 1.0049, + "epoch": 0.91, + "grad_norm": 25.992782592773438, + "learning_rate": 1.396142743588671e-05, + "loss": 2.2674, "step": 7218 }, { - "epoch": 2.17, - "grad_norm": 11.09915542602539, - "learning_rate": 5.530720657512279e-06, - "loss": 1.3301, + "epoch": 0.91, + "grad_norm": 34.82918167114258, + "learning_rate": 1.3960590720830023e-05, + "loss": 2.9348, "step": 7219 }, { - "epoch": 2.17, - "grad_norm": 10.667206764221191, - "learning_rate": 5.528716046907888e-06, - "loss": 1.0879, + "epoch": 0.91, + "grad_norm": 26.297290802001953, + "learning_rate": 1.3959754005773336e-05, + "loss": 1.9831, "step": 7220 }, { - "epoch": 2.17, - "grad_norm": 22.059289932250977, - "learning_rate": 5.526711436303499e-06, - "loss": 1.7633, + "epoch": 0.91, + "grad_norm": 11.515657424926758, + "learning_rate": 1.3958917290716647e-05, + "loss": 1.78, "step": 7221 }, { - "epoch": 2.17, - "grad_norm": 45.51456069946289, - "learning_rate": 5.524706825699109e-06, - "loss": 1.451, + "epoch": 0.91, + "grad_norm": 11.170564651489258, + "learning_rate": 1.395808057565996e-05, + "loss": 2.2037, "step": 7222 }, { - "epoch": 2.17, - "grad_norm": 13.462637901306152, - "learning_rate": 5.522702215094718e-06, - "loss": 1.4225, + "epoch": 0.91, + "grad_norm": 84.04158020019531, + "learning_rate": 1.3957243860603274e-05, + "loss": 3.4347, "step": 7223 }, { - "epoch": 2.17, - "grad_norm": 11.555397033691406, - "learning_rate": 5.5206976044903284e-06, - "loss": 1.2796, + "epoch": 0.91, + "grad_norm": 38.162410736083984, + "learning_rate": 1.3956407145546584e-05, + "loss": 2.0202, "step": 7224 }, { - "epoch": 2.17, - "grad_norm": 25.498672485351562, - "learning_rate": 5.518692993885938e-06, - "loss": 1.1889, + "epoch": 0.91, + "grad_norm": 20.224584579467773, + "learning_rate": 1.3955570430489898e-05, + "loss": 1.7124, "step": 7225 }, { - "epoch": 2.17, - "grad_norm": 18.293188095092773, - "learning_rate": 5.516688383281548e-06, - "loss": 2.2397, + "epoch": 0.91, + "grad_norm": 17.18352508544922, + "learning_rate": 1.395473371543321e-05, + "loss": 3.1194, "step": 7226 }, { - "epoch": 2.17, - "grad_norm": 17.86231231689453, - "learning_rate": 5.514683772677157e-06, - "loss": 1.5173, + "epoch": 0.91, + "grad_norm": 10.034878730773926, + "learning_rate": 1.3953897000376524e-05, + "loss": 0.6442, "step": 7227 }, { - "epoch": 2.17, - "grad_norm": 18.436603546142578, - "learning_rate": 5.512679162072768e-06, - "loss": 1.2884, + "epoch": 0.91, + "grad_norm": 24.862689971923828, + "learning_rate": 1.3953060285319836e-05, + "loss": 1.5621, "step": 7228 }, { - "epoch": 2.17, - "grad_norm": 10.068883895874023, - "learning_rate": 5.510674551468378e-06, - "loss": 1.314, + "epoch": 0.91, + "grad_norm": 26.202434539794922, + "learning_rate": 1.3952223570263147e-05, + "loss": 0.9027, "step": 7229 }, { - "epoch": 2.17, - "grad_norm": 9.923480987548828, - "learning_rate": 5.508669940863987e-06, - "loss": 0.8318, + "epoch": 0.91, + "grad_norm": 19.527454376220703, + "learning_rate": 1.3951386855206461e-05, + "loss": 2.2706, "step": 7230 }, { - "epoch": 2.17, - "grad_norm": 30.2728214263916, - "learning_rate": 5.506665330259597e-06, - "loss": 1.7006, + "epoch": 0.91, + "grad_norm": 29.45587158203125, + "learning_rate": 1.3950550140149773e-05, + "loss": 1.4363, "step": 7231 }, { - "epoch": 2.17, - "grad_norm": 24.514514923095703, - "learning_rate": 5.5046607196552074e-06, - "loss": 1.2372, + "epoch": 0.91, + "grad_norm": 17.59807777404785, + "learning_rate": 1.3949713425093085e-05, + "loss": 2.2094, "step": 7232 }, { - "epoch": 2.17, - "grad_norm": 8.947182655334473, - "learning_rate": 5.502656109050817e-06, - "loss": 0.6531, + "epoch": 0.91, + "grad_norm": 6.8975934982299805, + "learning_rate": 1.3948876710036399e-05, + "loss": 0.938, "step": 7233 }, { - "epoch": 2.17, - "grad_norm": 32.466487884521484, - "learning_rate": 5.500651498446428e-06, - "loss": 0.9527, + "epoch": 0.91, + "grad_norm": 75.04706573486328, + "learning_rate": 1.3948039994979712e-05, + "loss": 2.0924, "step": 7234 }, { - "epoch": 2.18, - "grad_norm": 19.44408416748047, - "learning_rate": 5.498646887842038e-06, - "loss": 1.8595, + "epoch": 0.91, + "grad_norm": 8.471443176269531, + "learning_rate": 1.3947203279923023e-05, + "loss": 1.807, "step": 7235 }, { - "epoch": 2.18, - "grad_norm": 15.075602531433105, - "learning_rate": 5.496642277237647e-06, - "loss": 1.2715, + "epoch": 0.91, + "grad_norm": 23.299184799194336, + "learning_rate": 1.3946366564866336e-05, + "loss": 3.5616, "step": 7236 }, { - "epoch": 2.18, - "grad_norm": 40.67919921875, - "learning_rate": 5.494637666633257e-06, - "loss": 1.1642, + "epoch": 0.91, + "grad_norm": 10.661001205444336, + "learning_rate": 1.394552984980965e-05, + "loss": 0.9546, "step": 7237 }, { - "epoch": 2.18, - "grad_norm": 11.373068809509277, - "learning_rate": 5.492633056028866e-06, - "loss": 0.8346, + "epoch": 0.91, + "grad_norm": 28.18711280822754, + "learning_rate": 1.394469313475296e-05, + "loss": 1.6205, "step": 7238 }, { - "epoch": 2.18, - "grad_norm": 19.530393600463867, - "learning_rate": 5.490628445424476e-06, - "loss": 1.1305, + "epoch": 0.91, + "grad_norm": 177.2775115966797, + "learning_rate": 1.3943856419696274e-05, + "loss": 3.1622, "step": 7239 }, { - "epoch": 2.18, - "grad_norm": 93.64016723632812, - "learning_rate": 5.488623834820087e-06, - "loss": 1.6901, + "epoch": 0.91, + "grad_norm": 10.07783031463623, + "learning_rate": 1.3943019704639586e-05, + "loss": 1.1788, "step": 7240 }, { - "epoch": 2.18, - "grad_norm": 26.258949279785156, - "learning_rate": 5.4866192242156965e-06, - "loss": 2.7283, + "epoch": 0.91, + "grad_norm": 24.80470848083496, + "learning_rate": 1.39421829895829e-05, + "loss": 2.455, "step": 7241 }, { - "epoch": 2.18, - "grad_norm": 23.18985366821289, - "learning_rate": 5.4846146136113066e-06, - "loss": 1.3425, + "epoch": 0.91, + "grad_norm": 12.35865306854248, + "learning_rate": 1.3941346274526211e-05, + "loss": 2.0769, "step": 7242 }, { - "epoch": 2.18, - "grad_norm": 18.908994674682617, - "learning_rate": 5.482610003006917e-06, - "loss": 1.075, + "epoch": 0.91, + "grad_norm": 22.02276039123535, + "learning_rate": 1.3940509559469523e-05, + "loss": 1.3337, "step": 7243 }, { - "epoch": 2.18, - "grad_norm": 16.2775936126709, - "learning_rate": 5.480605392402526e-06, - "loss": 1.6765, + "epoch": 0.91, + "grad_norm": 16.321331024169922, + "learning_rate": 1.3939672844412837e-05, + "loss": 1.0, "step": 7244 }, { - "epoch": 2.18, - "grad_norm": 22.71320915222168, - "learning_rate": 5.478600781798136e-06, - "loss": 0.7902, + "epoch": 0.91, + "grad_norm": 34.85578918457031, + "learning_rate": 1.3938836129356147e-05, + "loss": 1.4231, "step": 7245 }, { - "epoch": 2.18, - "grad_norm": 27.827980041503906, - "learning_rate": 5.476596171193747e-06, - "loss": 1.4328, + "epoch": 0.91, + "grad_norm": 19.227888107299805, + "learning_rate": 1.3937999414299461e-05, + "loss": 1.4348, "step": 7246 }, { - "epoch": 2.18, - "grad_norm": 11.23440170288086, - "learning_rate": 5.474591560589356e-06, - "loss": 1.3235, + "epoch": 0.91, + "grad_norm": 22.612197875976562, + "learning_rate": 1.3937162699242775e-05, + "loss": 1.4956, "step": 7247 }, { - "epoch": 2.18, - "grad_norm": 32.618648529052734, - "learning_rate": 5.472586949984966e-06, - "loss": 1.3066, + "epoch": 0.91, + "grad_norm": 27.75803565979004, + "learning_rate": 1.3936325984186088e-05, + "loss": 2.9314, "step": 7248 }, { - "epoch": 2.18, - "grad_norm": 41.39394760131836, - "learning_rate": 5.4705823393805755e-06, - "loss": 1.2173, + "epoch": 0.91, + "grad_norm": 13.859046936035156, + "learning_rate": 1.3935489269129398e-05, + "loss": 3.4463, "step": 7249 }, { - "epoch": 2.18, - "grad_norm": 17.75341796875, - "learning_rate": 5.4685777287761856e-06, - "loss": 0.9759, + "epoch": 0.91, + "grad_norm": 31.465036392211914, + "learning_rate": 1.3934652554072712e-05, + "loss": 3.0091, "step": 7250 }, { - "epoch": 2.18, - "grad_norm": 24.715322494506836, - "learning_rate": 5.466573118171795e-06, - "loss": 2.2409, + "epoch": 0.91, + "grad_norm": 12.641586303710938, + "learning_rate": 1.3933815839016026e-05, + "loss": 1.5988, "step": 7251 }, { - "epoch": 2.18, - "grad_norm": 16.27878761291504, - "learning_rate": 5.464568507567406e-06, - "loss": 1.0532, + "epoch": 0.91, + "grad_norm": 13.063556671142578, + "learning_rate": 1.3932979123959336e-05, + "loss": 1.3257, "step": 7252 }, { - "epoch": 2.18, - "grad_norm": 31.52488899230957, - "learning_rate": 5.462563896963016e-06, - "loss": 2.1667, + "epoch": 0.91, + "grad_norm": 8.62407112121582, + "learning_rate": 1.393214240890265e-05, + "loss": 0.7926, "step": 7253 }, { - "epoch": 2.18, - "grad_norm": 24.314443588256836, - "learning_rate": 5.460559286358625e-06, - "loss": 1.6379, + "epoch": 0.91, + "grad_norm": 28.4606990814209, + "learning_rate": 1.3931305693845962e-05, + "loss": 2.8975, "step": 7254 }, { - "epoch": 2.18, - "grad_norm": 9.153544425964355, - "learning_rate": 5.458554675754235e-06, - "loss": 0.9792, + "epoch": 0.91, + "grad_norm": 6.050575256347656, + "learning_rate": 1.3930468978789275e-05, + "loss": 0.6473, "step": 7255 }, { - "epoch": 2.18, - "grad_norm": 15.48164176940918, - "learning_rate": 5.456550065149845e-06, - "loss": 1.3478, + "epoch": 0.91, + "grad_norm": 15.265398979187012, + "learning_rate": 1.3929632263732587e-05, + "loss": 2.0768, "step": 7256 }, { - "epoch": 2.18, - "grad_norm": 13.745574951171875, - "learning_rate": 5.4545454545454545e-06, - "loss": 1.0092, + "epoch": 0.91, + "grad_norm": 19.988567352294922, + "learning_rate": 1.3928795548675899e-05, + "loss": 2.2212, "step": 7257 }, { - "epoch": 2.18, - "grad_norm": 42.99429702758789, - "learning_rate": 5.452540843941065e-06, - "loss": 2.0053, + "epoch": 0.91, + "grad_norm": 9.324673652648926, + "learning_rate": 1.3927958833619213e-05, + "loss": 0.4111, "step": 7258 }, { - "epoch": 2.18, - "grad_norm": 15.831052780151367, - "learning_rate": 5.4505362333366755e-06, - "loss": 1.2616, + "epoch": 0.91, + "grad_norm": 8.651508331298828, + "learning_rate": 1.3927122118562523e-05, + "loss": 1.4244, "step": 7259 }, { - "epoch": 2.18, - "grad_norm": 18.406320571899414, - "learning_rate": 5.448531622732285e-06, - "loss": 1.7931, + "epoch": 0.91, + "grad_norm": 14.281431198120117, + "learning_rate": 1.3926285403505837e-05, + "loss": 1.139, "step": 7260 }, { - "epoch": 2.18, - "grad_norm": 23.25520133972168, - "learning_rate": 5.446527012127895e-06, - "loss": 1.7849, + "epoch": 0.91, + "grad_norm": 13.728886604309082, + "learning_rate": 1.392544868844915e-05, + "loss": 1.0722, "step": 7261 }, { - "epoch": 2.18, - "grad_norm": 11.698332786560059, - "learning_rate": 5.444522401523504e-06, - "loss": 1.0173, + "epoch": 0.91, + "grad_norm": 10.251960754394531, + "learning_rate": 1.3924611973392464e-05, + "loss": 0.8638, "step": 7262 }, { - "epoch": 2.18, - "grad_norm": 18.50562286376953, - "learning_rate": 5.442517790919114e-06, - "loss": 1.0716, - "step": 7263 + "epoch": 0.91, + "grad_norm": 20.055835723876953, + "learning_rate": 1.3923775258335774e-05, + "loss": 1.4993, + "step": 7263 }, { - "epoch": 2.18, - "grad_norm": 18.679840087890625, - "learning_rate": 5.440513180314725e-06, - "loss": 1.331, + "epoch": 0.91, + "grad_norm": 11.874016761779785, + "learning_rate": 1.3922938543279088e-05, + "loss": 2.348, "step": 7264 }, { - "epoch": 2.18, - "grad_norm": 11.41555118560791, - "learning_rate": 5.438508569710334e-06, - "loss": 1.2174, + "epoch": 0.91, + "grad_norm": 14.539461135864258, + "learning_rate": 1.3922101828222402e-05, + "loss": 1.5973, "step": 7265 }, { - "epoch": 2.18, - "grad_norm": 12.062613487243652, - "learning_rate": 5.436503959105944e-06, - "loss": 1.7347, + "epoch": 0.91, + "grad_norm": 7.348511219024658, + "learning_rate": 1.3921265113165712e-05, + "loss": 0.9858, "step": 7266 }, { - "epoch": 2.18, - "grad_norm": 12.057971000671387, - "learning_rate": 5.4344993485015544e-06, - "loss": 1.2392, + "epoch": 0.91, + "grad_norm": 9.626068115234375, + "learning_rate": 1.3920428398109025e-05, + "loss": 1.9385, "step": 7267 }, { - "epoch": 2.19, - "grad_norm": 14.793939590454102, - "learning_rate": 5.432494737897164e-06, - "loss": 1.7844, + "epoch": 0.91, + "grad_norm": 14.306622505187988, + "learning_rate": 1.3919591683052337e-05, + "loss": 0.9388, "step": 7268 }, { - "epoch": 2.19, - "grad_norm": 20.210912704467773, - "learning_rate": 5.430490127292774e-06, - "loss": 2.0136, + "epoch": 0.91, + "grad_norm": 35.246219635009766, + "learning_rate": 1.3918754967995651e-05, + "loss": 1.5638, "step": 7269 }, { - "epoch": 2.19, - "grad_norm": 6.897406578063965, - "learning_rate": 5.428485516688383e-06, - "loss": 1.0915, + "epoch": 0.91, + "grad_norm": 13.283602714538574, + "learning_rate": 1.3917918252938963e-05, + "loss": 1.0215, "step": 7270 }, { - "epoch": 2.19, - "grad_norm": 12.593406677246094, - "learning_rate": 5.426480906083994e-06, - "loss": 0.7125, + "epoch": 0.91, + "grad_norm": 16.750701904296875, + "learning_rate": 1.3917081537882275e-05, + "loss": 3.2107, "step": 7271 }, { - "epoch": 2.19, - "grad_norm": 12.755428314208984, - "learning_rate": 5.424476295479604e-06, - "loss": 1.9477, + "epoch": 0.91, + "grad_norm": 8.930619239807129, + "learning_rate": 1.3916244822825589e-05, + "loss": 1.3279, "step": 7272 }, { - "epoch": 2.19, - "grad_norm": 7.096426486968994, - "learning_rate": 5.422471684875213e-06, - "loss": 0.718, + "epoch": 0.91, + "grad_norm": 9.9955472946167, + "learning_rate": 1.3915408107768899e-05, + "loss": 0.43, "step": 7273 }, { - "epoch": 2.19, - "grad_norm": 8.153369903564453, - "learning_rate": 5.420467074270823e-06, - "loss": 0.7129, + "epoch": 0.91, + "grad_norm": 21.53797149658203, + "learning_rate": 1.3914571392712213e-05, + "loss": 1.2248, "step": 7274 }, { - "epoch": 2.19, - "grad_norm": 14.67286491394043, - "learning_rate": 5.418462463666433e-06, - "loss": 1.6797, + "epoch": 0.91, + "grad_norm": 14.853739738464355, + "learning_rate": 1.3913734677655526e-05, + "loss": 1.7979, "step": 7275 }, { - "epoch": 2.19, - "grad_norm": 18.533056259155273, - "learning_rate": 5.416457853062043e-06, - "loss": 1.1951, + "epoch": 0.91, + "grad_norm": 27.4403018951416, + "learning_rate": 1.391289796259884e-05, + "loss": 4.5861, "step": 7276 }, { - "epoch": 2.19, - "grad_norm": 8.047659873962402, - "learning_rate": 5.414453242457654e-06, - "loss": 0.6167, + "epoch": 0.91, + "grad_norm": 15.481956481933594, + "learning_rate": 1.391206124754215e-05, + "loss": 1.9291, "step": 7277 }, { - "epoch": 2.19, - "grad_norm": 11.328049659729004, - "learning_rate": 5.412448631853263e-06, - "loss": 0.9758, + "epoch": 0.91, + "grad_norm": 15.197737693786621, + "learning_rate": 1.3911224532485464e-05, + "loss": 2.6118, "step": 7278 }, { - "epoch": 2.19, - "grad_norm": 23.584749221801758, - "learning_rate": 5.410444021248873e-06, - "loss": 1.7252, + "epoch": 0.91, + "grad_norm": 16.19619369506836, + "learning_rate": 1.3910387817428776e-05, + "loss": 0.5395, "step": 7279 }, { - "epoch": 2.19, - "grad_norm": 22.34720230102539, - "learning_rate": 5.408439410644483e-06, - "loss": 2.0207, + "epoch": 0.91, + "grad_norm": 13.692214012145996, + "learning_rate": 1.3909551102372088e-05, + "loss": 0.9755, "step": 7280 }, { - "epoch": 2.19, - "grad_norm": 19.402429580688477, - "learning_rate": 5.406434800040092e-06, - "loss": 1.2057, + "epoch": 0.91, + "grad_norm": 9.371413230895996, + "learning_rate": 1.3908714387315401e-05, + "loss": 1.5356, "step": 7281 }, { - "epoch": 2.19, - "grad_norm": 16.989200592041016, - "learning_rate": 5.404430189435702e-06, - "loss": 1.9234, + "epoch": 0.91, + "grad_norm": 24.087139129638672, + "learning_rate": 1.3907877672258713e-05, + "loss": 1.9896, "step": 7282 }, { - "epoch": 2.19, - "grad_norm": 26.748077392578125, - "learning_rate": 5.402425578831313e-06, - "loss": 1.8995, + "epoch": 0.91, + "grad_norm": 22.373689651489258, + "learning_rate": 1.3907040957202027e-05, + "loss": 2.4051, "step": 7283 }, { - "epoch": 2.19, - "grad_norm": 97.96756744384766, - "learning_rate": 5.4004209682269225e-06, - "loss": 2.0797, + "epoch": 0.91, + "grad_norm": 15.469829559326172, + "learning_rate": 1.3906204242145339e-05, + "loss": 1.7109, "step": 7284 }, { - "epoch": 2.19, - "grad_norm": 20.710670471191406, - "learning_rate": 5.398416357622533e-06, - "loss": 1.0238, + "epoch": 0.91, + "grad_norm": 28.993919372558594, + "learning_rate": 1.390536752708865e-05, + "loss": 1.6556, "step": 7285 }, { - "epoch": 2.19, - "grad_norm": 13.581433296203613, - "learning_rate": 5.396411747018142e-06, - "loss": 1.6866, + "epoch": 0.91, + "grad_norm": 5.3975348472595215, + "learning_rate": 1.3904530812031964e-05, + "loss": 1.0122, "step": 7286 }, { - "epoch": 2.19, - "grad_norm": 12.058156967163086, - "learning_rate": 5.394407136413752e-06, - "loss": 1.0288, + "epoch": 0.91, + "grad_norm": 15.712648391723633, + "learning_rate": 1.3903694096975275e-05, + "loss": 3.0246, "step": 7287 }, { - "epoch": 2.19, - "grad_norm": 21.06386375427246, - "learning_rate": 5.392402525809361e-06, - "loss": 1.271, + "epoch": 0.91, + "grad_norm": 9.079512596130371, + "learning_rate": 1.3902857381918588e-05, + "loss": 0.5973, "step": 7288 }, { - "epoch": 2.19, - "grad_norm": 51.95856857299805, - "learning_rate": 5.390397915204972e-06, - "loss": 1.5959, + "epoch": 0.91, + "grad_norm": 15.587929725646973, + "learning_rate": 1.3902020666861902e-05, + "loss": 1.6151, "step": 7289 }, { - "epoch": 2.19, - "grad_norm": 39.71308898925781, - "learning_rate": 5.388393304600582e-06, - "loss": 1.7519, + "epoch": 0.91, + "grad_norm": 10.63734245300293, + "learning_rate": 1.3901183951805216e-05, + "loss": 1.9477, "step": 7290 }, { - "epoch": 2.19, - "grad_norm": 22.637428283691406, - "learning_rate": 5.386388693996191e-06, - "loss": 2.0241, + "epoch": 0.92, + "grad_norm": 5.688929557800293, + "learning_rate": 1.3900347236748526e-05, + "loss": 0.3181, "step": 7291 }, { - "epoch": 2.19, - "grad_norm": 13.722464561462402, - "learning_rate": 5.3843840833918015e-06, - "loss": 1.0318, + "epoch": 0.92, + "grad_norm": 10.356765747070312, + "learning_rate": 1.389951052169184e-05, + "loss": 0.6252, "step": 7292 }, { - "epoch": 2.19, - "grad_norm": 35.6817512512207, - "learning_rate": 5.3823794727874116e-06, - "loss": 1.7191, + "epoch": 0.92, + "grad_norm": 7.640063762664795, + "learning_rate": 1.3898673806635152e-05, + "loss": 1.0457, "step": 7293 }, { - "epoch": 2.19, - "grad_norm": 13.845057487487793, - "learning_rate": 5.380374862183021e-06, - "loss": 1.3774, + "epoch": 0.92, + "grad_norm": 3.4225265979766846, + "learning_rate": 1.3897837091578463e-05, + "loss": 0.0955, "step": 7294 }, { - "epoch": 2.19, - "grad_norm": 12.214203834533691, - "learning_rate": 5.378370251578632e-06, - "loss": 1.5962, + "epoch": 0.92, + "grad_norm": 8.284340858459473, + "learning_rate": 1.3897000376521777e-05, + "loss": 2.0904, "step": 7295 }, { - "epoch": 2.19, - "grad_norm": 66.19721984863281, - "learning_rate": 5.376365640974242e-06, - "loss": 1.2639, + "epoch": 0.92, + "grad_norm": 2.8512842655181885, + "learning_rate": 1.3896163661465089e-05, + "loss": 0.1162, "step": 7296 }, { - "epoch": 2.19, - "grad_norm": 14.872382164001465, - "learning_rate": 5.374361030369851e-06, - "loss": 1.6655, + "epoch": 0.92, + "grad_norm": 10.442544937133789, + "learning_rate": 1.3895326946408403e-05, + "loss": 0.6149, "step": 7297 }, { - "epoch": 2.19, - "grad_norm": 19.328413009643555, - "learning_rate": 5.372356419765461e-06, - "loss": 1.3506, + "epoch": 0.92, + "grad_norm": 9.65011978149414, + "learning_rate": 1.3894490231351713e-05, + "loss": 1.4331, "step": 7298 }, { - "epoch": 2.19, - "grad_norm": 12.651726722717285, - "learning_rate": 5.37035180916107e-06, - "loss": 1.2916, + "epoch": 0.92, + "grad_norm": 14.805377006530762, + "learning_rate": 1.3893653516295027e-05, + "loss": 1.3727, "step": 7299 }, { - "epoch": 2.19, - "grad_norm": 13.568867683410645, - "learning_rate": 5.3683471985566805e-06, - "loss": 1.116, + "epoch": 0.92, + "grad_norm": 32.2829704284668, + "learning_rate": 1.389281680123834e-05, + "loss": 2.195, "step": 7300 }, { - "epoch": 2.2, - "grad_norm": 17.879594802856445, - "learning_rate": 5.366342587952291e-06, - "loss": 1.9263, + "epoch": 0.92, + "grad_norm": 34.021888732910156, + "learning_rate": 1.389198008618165e-05, + "loss": 1.14, "step": 7301 }, { - "epoch": 2.2, - "grad_norm": 16.369518280029297, - "learning_rate": 5.364337977347901e-06, - "loss": 0.7172, + "epoch": 0.92, + "grad_norm": 7.26022481918335, + "learning_rate": 1.3891143371124964e-05, + "loss": 0.6746, "step": 7302 }, { - "epoch": 2.2, - "grad_norm": 37.3539924621582, - "learning_rate": 5.362333366743511e-06, - "loss": 2.3236, + "epoch": 0.92, + "grad_norm": 4.94225549697876, + "learning_rate": 1.3890306656068278e-05, + "loss": 1.5535, "step": 7303 }, { - "epoch": 2.2, - "grad_norm": 28.66567611694336, - "learning_rate": 5.360328756139121e-06, - "loss": 1.4089, + "epoch": 0.92, + "grad_norm": 38.85111618041992, + "learning_rate": 1.3889469941011592e-05, + "loss": 5.2981, "step": 7304 }, { - "epoch": 2.2, - "grad_norm": 15.510285377502441, - "learning_rate": 5.35832414553473e-06, - "loss": 1.4704, + "epoch": 0.92, + "grad_norm": 18.075780868530273, + "learning_rate": 1.3888633225954902e-05, + "loss": 2.4893, "step": 7305 }, { - "epoch": 2.2, - "grad_norm": 18.096101760864258, - "learning_rate": 5.35631953493034e-06, - "loss": 1.029, + "epoch": 0.92, + "grad_norm": 8.066834449768066, + "learning_rate": 1.3887796510898215e-05, + "loss": 1.8651, "step": 7306 }, { - "epoch": 2.2, - "grad_norm": 34.01200866699219, - "learning_rate": 5.354314924325949e-06, - "loss": 1.732, + "epoch": 0.92, + "grad_norm": 15.736470222473145, + "learning_rate": 1.3886959795841527e-05, + "loss": 1.8323, "step": 7307 }, { - "epoch": 2.2, - "grad_norm": 12.730477333068848, - "learning_rate": 5.35231031372156e-06, - "loss": 2.1386, + "epoch": 0.92, + "grad_norm": 20.7080078125, + "learning_rate": 1.388612308078484e-05, + "loss": 2.635, "step": 7308 }, { - "epoch": 2.2, - "grad_norm": 17.955995559692383, - "learning_rate": 5.35030570311717e-06, - "loss": 0.6812, + "epoch": 0.92, + "grad_norm": 8.999063491821289, + "learning_rate": 1.3885286365728153e-05, + "loss": 1.009, "step": 7309 }, { - "epoch": 2.2, - "grad_norm": 9.278726577758789, - "learning_rate": 5.34830109251278e-06, - "loss": 0.5906, + "epoch": 0.92, + "grad_norm": 96.46437072753906, + "learning_rate": 1.3884449650671465e-05, + "loss": 1.8903, "step": 7310 }, { - "epoch": 2.2, - "grad_norm": 7.9487199783325195, - "learning_rate": 5.34629648190839e-06, - "loss": 0.6598, + "epoch": 0.92, + "grad_norm": 91.24198150634766, + "learning_rate": 1.3883612935614779e-05, + "loss": 1.8324, "step": 7311 }, { - "epoch": 2.2, - "grad_norm": 14.72791576385498, - "learning_rate": 5.344291871303999e-06, - "loss": 0.7821, + "epoch": 0.92, + "grad_norm": 18.01479721069336, + "learning_rate": 1.3882776220558089e-05, + "loss": 0.8534, "step": 7312 }, { - "epoch": 2.2, - "grad_norm": 8.070356369018555, - "learning_rate": 5.342287260699609e-06, - "loss": 0.7893, + "epoch": 0.92, + "grad_norm": 12.081615447998047, + "learning_rate": 1.3881939505501402e-05, + "loss": 0.997, "step": 7313 }, { - "epoch": 2.2, - "grad_norm": 14.87585163116455, - "learning_rate": 5.34028265009522e-06, - "loss": 1.5793, + "epoch": 0.92, + "grad_norm": 16.82686424255371, + "learning_rate": 1.3881102790444716e-05, + "loss": 2.0204, "step": 7314 }, { - "epoch": 2.2, - "grad_norm": 119.86707305908203, - "learning_rate": 5.338278039490829e-06, - "loss": 2.2575, + "epoch": 0.92, + "grad_norm": 4.8888115882873535, + "learning_rate": 1.3880266075388026e-05, + "loss": 0.2367, "step": 7315 }, { - "epoch": 2.2, - "grad_norm": 22.031450271606445, - "learning_rate": 5.336273428886439e-06, - "loss": 0.9648, + "epoch": 0.92, + "grad_norm": 11.908449172973633, + "learning_rate": 1.387942936033134e-05, + "loss": 1.0558, "step": 7316 }, { - "epoch": 2.2, - "grad_norm": 20.54271697998047, - "learning_rate": 5.334268818282049e-06, - "loss": 1.7682, + "epoch": 0.92, + "grad_norm": 10.127660751342773, + "learning_rate": 1.3878592645274654e-05, + "loss": 1.0307, "step": 7317 }, { - "epoch": 2.2, - "grad_norm": 18.402219772338867, - "learning_rate": 5.332264207677659e-06, - "loss": 1.3206, + "epoch": 0.92, + "grad_norm": 15.890748023986816, + "learning_rate": 1.3877755930217964e-05, + "loss": 0.8469, "step": 7318 }, { - "epoch": 2.2, - "grad_norm": 21.85274887084961, - "learning_rate": 5.330259597073269e-06, - "loss": 2.4046, + "epoch": 0.92, + "grad_norm": 11.59317398071289, + "learning_rate": 1.3876919215161278e-05, + "loss": 2.238, "step": 7319 }, { - "epoch": 2.2, - "grad_norm": 12.441575050354004, - "learning_rate": 5.32825498646888e-06, - "loss": 1.8915, - "step": 7320 - }, - { - "epoch": 2.2, - "eval_loss": 0.18911099433898926, - "eval_runtime": 43.726, - "eval_samples_per_second": 33.824, - "eval_steps_per_second": 33.824, + "epoch": 0.92, + "grad_norm": 16.340787887573242, + "learning_rate": 1.3876082500104591e-05, + "loss": 1.6672, "step": 7320 }, { - "epoch": 2.2, - "grad_norm": 18.565292358398438, - "learning_rate": 5.326250375864489e-06, - "loss": 1.4864, + "epoch": 0.92, + "grad_norm": 900.1279907226562, + "learning_rate": 1.3875245785047903e-05, + "loss": 1.7338, "step": 7321 }, { - "epoch": 2.2, - "grad_norm": 21.362342834472656, - "learning_rate": 5.324245765260099e-06, - "loss": 1.0767, + "epoch": 0.92, + "grad_norm": 23.428512573242188, + "learning_rate": 1.3874409069991215e-05, + "loss": 2.7145, "step": 7322 }, { - "epoch": 2.2, - "grad_norm": 15.389693260192871, - "learning_rate": 5.322241154655708e-06, - "loss": 0.9276, + "epoch": 0.92, + "grad_norm": 38.12934494018555, + "learning_rate": 1.3873572354934529e-05, + "loss": 3.1097, "step": 7323 }, { - "epoch": 2.2, - "grad_norm": 83.4190673828125, - "learning_rate": 5.320236544051318e-06, - "loss": 2.4627, + "epoch": 0.92, + "grad_norm": 5.672229290008545, + "learning_rate": 1.387273563987784e-05, + "loss": 0.4034, "step": 7324 }, { - "epoch": 2.2, - "grad_norm": 17.144039154052734, - "learning_rate": 5.3182319334469275e-06, - "loss": 1.6204, + "epoch": 0.92, + "grad_norm": 9.384552001953125, + "learning_rate": 1.3871898924821153e-05, + "loss": 0.4092, "step": 7325 }, { - "epoch": 2.2, - "grad_norm": 12.209066390991211, - "learning_rate": 5.316227322842538e-06, - "loss": 0.9967, + "epoch": 0.92, + "grad_norm": 4.673786640167236, + "learning_rate": 1.3871062209764465e-05, + "loss": 0.23, "step": 7326 }, { - "epoch": 2.2, - "grad_norm": 20.075319290161133, - "learning_rate": 5.3142227122381485e-06, - "loss": 2.2126, + "epoch": 0.92, + "grad_norm": 18.703664779663086, + "learning_rate": 1.3870225494707778e-05, + "loss": 1.3684, "step": 7327 }, { - "epoch": 2.2, - "grad_norm": 22.937063217163086, - "learning_rate": 5.312218101633759e-06, - "loss": 1.583, + "epoch": 0.92, + "grad_norm": 9.275733947753906, + "learning_rate": 1.3869388779651092e-05, + "loss": 1.0947, "step": 7328 }, { - "epoch": 2.2, - "grad_norm": 9.0242338180542, - "learning_rate": 5.310213491029368e-06, - "loss": 0.6662, + "epoch": 0.92, + "grad_norm": 24.407047271728516, + "learning_rate": 1.3868552064594402e-05, + "loss": 1.3104, "step": 7329 }, { - "epoch": 2.2, - "grad_norm": 8.918848037719727, - "learning_rate": 5.308208880424978e-06, - "loss": 0.8654, + "epoch": 0.92, + "grad_norm": 7.646800518035889, + "learning_rate": 1.3867715349537716e-05, + "loss": 1.5059, "step": 7330 }, { - "epoch": 2.2, - "grad_norm": 32.961360931396484, - "learning_rate": 5.306204269820587e-06, - "loss": 1.8518, + "epoch": 0.92, + "grad_norm": 28.678197860717773, + "learning_rate": 1.386687863448103e-05, + "loss": 2.3387, "step": 7331 }, { - "epoch": 2.2, - "grad_norm": 16.80774688720703, - "learning_rate": 5.304199659216198e-06, - "loss": 1.4526, + "epoch": 0.92, + "grad_norm": 9.313803672790527, + "learning_rate": 1.386604191942434e-05, + "loss": 0.3099, "step": 7332 }, { - "epoch": 2.2, - "grad_norm": 86.86822509765625, - "learning_rate": 5.302195048611808e-06, - "loss": 1.354, + "epoch": 0.92, + "grad_norm": 16.865198135375977, + "learning_rate": 1.3865205204367653e-05, + "loss": 2.1941, "step": 7333 }, { - "epoch": 2.21, - "grad_norm": 18.777795791625977, - "learning_rate": 5.300190438007417e-06, - "loss": 1.2628, + "epoch": 0.92, + "grad_norm": 37.41999435424805, + "learning_rate": 1.3864368489310967e-05, + "loss": 1.9056, "step": 7334 }, { - "epoch": 2.21, - "grad_norm": 47.588802337646484, - "learning_rate": 5.2981858274030275e-06, - "loss": 3.0136, + "epoch": 0.92, + "grad_norm": 10.510849952697754, + "learning_rate": 1.3863531774254279e-05, + "loss": 0.8305, "step": 7335 }, { - "epoch": 2.21, - "grad_norm": 41.61444091796875, - "learning_rate": 5.296181216798637e-06, - "loss": 2.3059, + "epoch": 0.92, + "grad_norm": 53.111297607421875, + "learning_rate": 1.3862695059197591e-05, + "loss": 2.8198, "step": 7336 }, { - "epoch": 2.21, - "grad_norm": 16.625478744506836, - "learning_rate": 5.294176606194247e-06, - "loss": 1.0123, + "epoch": 0.92, + "grad_norm": 11.218560218811035, + "learning_rate": 1.3861858344140905e-05, + "loss": 1.5149, "step": 7337 }, { - "epoch": 2.21, - "grad_norm": 50.9505729675293, - "learning_rate": 5.292171995589858e-06, - "loss": 1.1573, + "epoch": 0.92, + "grad_norm": 29.03372573852539, + "learning_rate": 1.3861021629084217e-05, + "loss": 1.3805, "step": 7338 }, { - "epoch": 2.21, - "grad_norm": 3.5535824298858643, - "learning_rate": 5.290167384985467e-06, - "loss": 0.5366, + "epoch": 0.92, + "grad_norm": 43.508018493652344, + "learning_rate": 1.3860184914027529e-05, + "loss": 2.5458, "step": 7339 }, { - "epoch": 2.21, - "grad_norm": 13.707948684692383, - "learning_rate": 5.288162774381077e-06, - "loss": 1.2388, + "epoch": 0.92, + "grad_norm": 9.611120223999023, + "learning_rate": 1.385934819897084e-05, + "loss": 1.8637, "step": 7340 }, { - "epoch": 2.21, - "grad_norm": 117.4101333618164, - "learning_rate": 5.286158163776687e-06, - "loss": 2.3683, + "epoch": 0.92, + "grad_norm": 12.637951850891113, + "learning_rate": 1.3858511483914154e-05, + "loss": 3.2302, "step": 7341 }, { - "epoch": 2.21, - "grad_norm": 13.597126960754395, - "learning_rate": 5.284153553172296e-06, - "loss": 1.0199, + "epoch": 0.92, + "grad_norm": 16.153526306152344, + "learning_rate": 1.3857674768857468e-05, + "loss": 1.8928, "step": 7342 }, { - "epoch": 2.21, - "grad_norm": 8.304295539855957, - "learning_rate": 5.2821489425679065e-06, - "loss": 0.7169, + "epoch": 0.92, + "grad_norm": 21.75115394592285, + "learning_rate": 1.3856838053800778e-05, + "loss": 3.125, "step": 7343 }, { - "epoch": 2.21, - "grad_norm": 14.256132125854492, - "learning_rate": 5.280144331963517e-06, - "loss": 1.2964, + "epoch": 0.92, + "grad_norm": 14.656881332397461, + "learning_rate": 1.3856001338744092e-05, + "loss": 1.3364, "step": 7344 }, { - "epoch": 2.21, - "grad_norm": 86.91548919677734, - "learning_rate": 5.278139721359127e-06, - "loss": 3.997, + "epoch": 0.92, + "grad_norm": 52.3955078125, + "learning_rate": 1.3855164623687405e-05, + "loss": 2.6731, "step": 7345 }, { - "epoch": 2.21, - "grad_norm": 16.045759201049805, - "learning_rate": 5.276135110754737e-06, - "loss": 1.1771, + "epoch": 0.92, + "grad_norm": 9.987492561340332, + "learning_rate": 1.3854327908630716e-05, + "loss": 0.8823, "step": 7346 }, { - "epoch": 2.21, - "grad_norm": 39.05559158325195, - "learning_rate": 5.274130500150346e-06, - "loss": 2.0468, + "epoch": 0.92, + "grad_norm": 6.1971330642700195, + "learning_rate": 1.385349119357403e-05, + "loss": 0.6373, "step": 7347 }, { - "epoch": 2.21, - "grad_norm": 13.659476280212402, - "learning_rate": 5.272125889545956e-06, - "loss": 0.9715, + "epoch": 0.92, + "grad_norm": 24.01615333557129, + "learning_rate": 1.3852654478517343e-05, + "loss": 1.7604, "step": 7348 }, { - "epoch": 2.21, - "grad_norm": 12.945648193359375, - "learning_rate": 5.270121278941565e-06, - "loss": 1.0217, + "epoch": 0.92, + "grad_norm": 5.612430572509766, + "learning_rate": 1.3851817763460655e-05, + "loss": 0.3099, "step": 7349 }, { - "epoch": 2.21, - "grad_norm": 11.518617630004883, - "learning_rate": 5.268116668337175e-06, - "loss": 1.2031, + "epoch": 0.92, + "grad_norm": 13.868998527526855, + "learning_rate": 1.3850981048403967e-05, + "loss": 1.3927, "step": 7350 }, { - "epoch": 2.21, - "grad_norm": 43.542816162109375, - "learning_rate": 5.266112057732786e-06, - "loss": 1.8437, + "epoch": 0.92, + "grad_norm": 9.475567817687988, + "learning_rate": 1.3850144333347279e-05, + "loss": 0.7834, "step": 7351 }, { - "epoch": 2.21, - "grad_norm": 9.149428367614746, - "learning_rate": 5.2641074471283955e-06, - "loss": 1.1645, + "epoch": 0.92, + "grad_norm": 10.77379322052002, + "learning_rate": 1.3849307618290592e-05, + "loss": 1.2543, "step": 7352 }, { - "epoch": 2.21, - "grad_norm": 17.6467227935791, - "learning_rate": 5.262102836524006e-06, - "loss": 1.3694, + "epoch": 0.92, + "grad_norm": 9.363557815551758, + "learning_rate": 1.3848470903233904e-05, + "loss": 2.3459, "step": 7353 }, { - "epoch": 2.21, - "grad_norm": 30.845457077026367, - "learning_rate": 5.260098225919616e-06, - "loss": 1.3288, + "epoch": 0.92, + "grad_norm": 17.62056541442871, + "learning_rate": 1.3847634188177216e-05, + "loss": 3.9656, "step": 7354 }, { - "epoch": 2.21, - "grad_norm": 17.407968521118164, - "learning_rate": 5.258093615315225e-06, - "loss": 1.3238, + "epoch": 0.92, + "grad_norm": 7.625229358673096, + "learning_rate": 1.384679747312053e-05, + "loss": 1.9232, "step": 7355 }, { - "epoch": 2.21, - "grad_norm": 22.961395263671875, - "learning_rate": 5.256089004710835e-06, - "loss": 1.1421, + "epoch": 0.92, + "grad_norm": 10.988778114318848, + "learning_rate": 1.3845960758063844e-05, + "loss": 1.0116, "step": 7356 }, { - "epoch": 2.21, - "grad_norm": 21.641342163085938, - "learning_rate": 5.254084394106446e-06, - "loss": 1.2616, + "epoch": 0.92, + "grad_norm": 15.370214462280273, + "learning_rate": 1.3845124043007154e-05, + "loss": 3.1325, "step": 7357 }, { - "epoch": 2.21, - "grad_norm": 16.428573608398438, - "learning_rate": 5.252079783502055e-06, - "loss": 0.9203, + "epoch": 0.92, + "grad_norm": 11.344792366027832, + "learning_rate": 1.3844287327950468e-05, + "loss": 0.529, "step": 7358 }, { - "epoch": 2.21, - "grad_norm": 17.370075225830078, - "learning_rate": 5.250075172897665e-06, - "loss": 1.0207, + "epoch": 0.92, + "grad_norm": 11.936365127563477, + "learning_rate": 1.3843450612893781e-05, + "loss": 1.2153, "step": 7359 }, { - "epoch": 2.21, - "grad_norm": 17.009870529174805, - "learning_rate": 5.2480705622932745e-06, - "loss": 1.3327, + "epoch": 0.92, + "grad_norm": 12.046215057373047, + "learning_rate": 1.3842613897837091e-05, + "loss": 0.9478, "step": 7360 }, { - "epoch": 2.21, - "grad_norm": 16.25709342956543, - "learning_rate": 5.246065951688885e-06, - "loss": 1.274, + "epoch": 0.92, + "grad_norm": 11.978967666625977, + "learning_rate": 1.3841777182780405e-05, + "loss": 1.6368, "step": 7361 }, { - "epoch": 2.21, - "grad_norm": 11.702857971191406, - "learning_rate": 5.244061341084495e-06, - "loss": 1.3384, + "epoch": 0.92, + "grad_norm": 23.105283737182617, + "learning_rate": 1.3840940467723719e-05, + "loss": 3.2074, "step": 7362 }, { - "epoch": 2.21, - "grad_norm": 39.857627868652344, - "learning_rate": 5.242056730480105e-06, - "loss": 1.6856, + "epoch": 0.92, + "grad_norm": 10.976641654968262, + "learning_rate": 1.384010375266703e-05, + "loss": 1.0288, "step": 7363 }, { - "epoch": 2.21, - "grad_norm": 34.70151138305664, - "learning_rate": 5.240052119875715e-06, - "loss": 1.7488, + "epoch": 0.92, + "grad_norm": 12.47696590423584, + "learning_rate": 1.3839267037610343e-05, + "loss": 1.0648, "step": 7364 }, { - "epoch": 2.21, - "grad_norm": 8.398064613342285, - "learning_rate": 5.238047509271325e-06, - "loss": 1.1744, + "epoch": 0.92, + "grad_norm": 3.1748385429382324, + "learning_rate": 1.3838430322553655e-05, + "loss": 0.1766, "step": 7365 }, { - "epoch": 2.21, - "grad_norm": 16.08351707458496, - "learning_rate": 5.236042898666934e-06, - "loss": 1.2976, + "epoch": 0.92, + "grad_norm": 134.07736206054688, + "learning_rate": 1.3837593607496968e-05, + "loss": 2.144, "step": 7366 }, { - "epoch": 2.21, - "grad_norm": 41.76036834716797, - "learning_rate": 5.234038288062544e-06, - "loss": 2.1369, + "epoch": 0.92, + "grad_norm": 8.017218589782715, + "learning_rate": 1.383675689244028e-05, + "loss": 1.0889, "step": 7367 }, { - "epoch": 2.22, - "grad_norm": 14.112876892089844, - "learning_rate": 5.2320336774581535e-06, - "loss": 0.5372, + "epoch": 0.92, + "grad_norm": 12.834436416625977, + "learning_rate": 1.3835920177383592e-05, + "loss": 0.5733, "step": 7368 }, { - "epoch": 2.22, - "grad_norm": 25.881088256835938, - "learning_rate": 5.230029066853764e-06, - "loss": 1.3301, + "epoch": 0.92, + "grad_norm": 11.939716339111328, + "learning_rate": 1.3835083462326906e-05, + "loss": 1.137, "step": 7369 }, { - "epoch": 2.22, - "grad_norm": 28.427711486816406, - "learning_rate": 5.2280244562493745e-06, - "loss": 1.6783, + "epoch": 0.92, + "grad_norm": 15.541251182556152, + "learning_rate": 1.383424674727022e-05, + "loss": 1.0705, "step": 7370 }, { - "epoch": 2.22, - "grad_norm": 10.222917556762695, - "learning_rate": 5.226019845644984e-06, - "loss": 0.9323, + "epoch": 0.93, + "grad_norm": 10.923202514648438, + "learning_rate": 1.383341003221353e-05, + "loss": 0.8229, "step": 7371 }, { - "epoch": 2.22, - "grad_norm": 8.820756912231445, - "learning_rate": 5.224015235040594e-06, - "loss": 0.9893, + "epoch": 0.93, + "grad_norm": 7.435032367706299, + "learning_rate": 1.3832573317156843e-05, + "loss": 0.5059, "step": 7372 }, { - "epoch": 2.22, - "grad_norm": 17.932085037231445, - "learning_rate": 5.222010624436203e-06, - "loss": 1.7065, + "epoch": 0.93, + "grad_norm": 19.814422607421875, + "learning_rate": 1.3831736602100157e-05, + "loss": 1.4652, "step": 7373 }, { - "epoch": 2.22, - "grad_norm": 11.802325248718262, - "learning_rate": 5.220006013831813e-06, - "loss": 1.3393, + "epoch": 0.93, + "grad_norm": 12.966021537780762, + "learning_rate": 1.3830899887043467e-05, + "loss": 1.8792, "step": 7374 }, { - "epoch": 2.22, - "grad_norm": 23.23512077331543, - "learning_rate": 5.218001403227424e-06, - "loss": 1.1122, + "epoch": 0.93, + "grad_norm": 27.098787307739258, + "learning_rate": 1.3830063171986781e-05, + "loss": 2.7767, "step": 7375 }, { - "epoch": 2.22, - "grad_norm": 15.658203125, - "learning_rate": 5.215996792623033e-06, - "loss": 1.3305, + "epoch": 0.93, + "grad_norm": 14.493932723999023, + "learning_rate": 1.3829226456930095e-05, + "loss": 1.2419, "step": 7376 }, { - "epoch": 2.22, - "grad_norm": 39.932865142822266, - "learning_rate": 5.213992182018643e-06, - "loss": 1.7015, + "epoch": 0.93, + "grad_norm": 15.625213623046875, + "learning_rate": 1.3828389741873407e-05, + "loss": 1.3691, "step": 7377 }, { - "epoch": 2.22, - "grad_norm": 33.233642578125, - "learning_rate": 5.2119875714142535e-06, - "loss": 1.5578, + "epoch": 0.93, + "grad_norm": 18.71935272216797, + "learning_rate": 1.3827553026816719e-05, + "loss": 1.6166, "step": 7378 }, { - "epoch": 2.22, - "grad_norm": 12.168890953063965, - "learning_rate": 5.209982960809863e-06, - "loss": 0.9835, + "epoch": 0.93, + "grad_norm": 10.90803050994873, + "learning_rate": 1.382671631176003e-05, + "loss": 1.3807, "step": 7379 }, { - "epoch": 2.22, - "grad_norm": 9.322324752807617, - "learning_rate": 5.207978350205473e-06, - "loss": 0.8729, + "epoch": 0.93, + "grad_norm": 8.382681846618652, + "learning_rate": 1.3825879596703344e-05, + "loss": 1.7445, "step": 7380 }, { - "epoch": 2.22, - "grad_norm": 32.88364791870117, - "learning_rate": 5.205973739601084e-06, - "loss": 1.5136, + "epoch": 0.93, + "grad_norm": 6.501852035522461, + "learning_rate": 1.3825042881646656e-05, + "loss": 1.4699, "step": 7381 }, { - "epoch": 2.22, - "grad_norm": 24.65951156616211, - "learning_rate": 5.203969128996693e-06, - "loss": 1.2119, + "epoch": 0.93, + "grad_norm": 9.208329200744629, + "learning_rate": 1.3824206166589968e-05, + "loss": 1.6787, "step": 7382 }, { - "epoch": 2.22, - "grad_norm": 9.384584426879883, - "learning_rate": 5.201964518392303e-06, - "loss": 0.8526, + "epoch": 0.93, + "grad_norm": 25.36642837524414, + "learning_rate": 1.3823369451533282e-05, + "loss": 3.3833, "step": 7383 }, { - "epoch": 2.22, - "grad_norm": 16.68515396118164, - "learning_rate": 5.199959907787912e-06, - "loss": 1.6815, + "epoch": 0.93, + "grad_norm": 11.165738105773926, + "learning_rate": 1.3822532736476595e-05, + "loss": 0.2827, "step": 7384 }, { - "epoch": 2.22, - "grad_norm": 12.431203842163086, - "learning_rate": 5.197955297183522e-06, - "loss": 0.8398, + "epoch": 0.93, + "grad_norm": 12.452773094177246, + "learning_rate": 1.3821696021419906e-05, + "loss": 1.4942, "step": 7385 }, { - "epoch": 2.22, - "grad_norm": 10.749804496765137, - "learning_rate": 5.1959506865791325e-06, - "loss": 1.6542, + "epoch": 0.93, + "grad_norm": 18.42108154296875, + "learning_rate": 1.382085930636322e-05, + "loss": 2.2891, "step": 7386 }, { - "epoch": 2.22, - "grad_norm": 19.671411514282227, - "learning_rate": 5.1939460759747425e-06, - "loss": 1.1745, + "epoch": 0.93, + "grad_norm": 16.08868980407715, + "learning_rate": 1.3820022591306533e-05, + "loss": 1.7406, "step": 7387 }, { - "epoch": 2.22, - "grad_norm": 15.82059383392334, - "learning_rate": 5.191941465370353e-06, - "loss": 1.5952, + "epoch": 0.93, + "grad_norm": 21.774072647094727, + "learning_rate": 1.3819185876249843e-05, + "loss": 1.0848, "step": 7388 }, { - "epoch": 2.22, - "grad_norm": 11.592869758605957, - "learning_rate": 5.189936854765963e-06, - "loss": 1.375, + "epoch": 0.93, + "grad_norm": 11.0586519241333, + "learning_rate": 1.3818349161193157e-05, + "loss": 1.4377, "step": 7389 }, { - "epoch": 2.22, - "grad_norm": 12.64307975769043, - "learning_rate": 5.187932244161572e-06, - "loss": 1.5403, + "epoch": 0.93, + "grad_norm": 8.787914276123047, + "learning_rate": 1.3817512446136469e-05, + "loss": 1.1182, "step": 7390 }, { - "epoch": 2.22, - "grad_norm": 21.732891082763672, - "learning_rate": 5.185927633557182e-06, - "loss": 0.9123, + "epoch": 0.93, + "grad_norm": 28.798927307128906, + "learning_rate": 1.3816675731079782e-05, + "loss": 1.9551, "step": 7391 }, { - "epoch": 2.22, - "grad_norm": 11.803874969482422, - "learning_rate": 5.183923022952791e-06, - "loss": 1.1493, + "epoch": 0.93, + "grad_norm": 20.054466247558594, + "learning_rate": 1.3815839016023094e-05, + "loss": 1.446, "step": 7392 }, { - "epoch": 2.22, - "grad_norm": 13.172371864318848, - "learning_rate": 5.181918412348401e-06, - "loss": 1.5287, + "epoch": 0.93, + "grad_norm": 14.390433311462402, + "learning_rate": 1.3815002300966406e-05, + "loss": 1.6343, "step": 7393 }, { - "epoch": 2.22, - "grad_norm": 11.935776710510254, - "learning_rate": 5.179913801744012e-06, - "loss": 1.0215, + "epoch": 0.93, + "grad_norm": 27.4454345703125, + "learning_rate": 1.381416558590972e-05, + "loss": 1.9062, "step": 7394 }, { - "epoch": 2.22, - "grad_norm": 41.79684829711914, - "learning_rate": 5.1779091911396215e-06, - "loss": 2.0082, + "epoch": 0.93, + "grad_norm": 22.983104705810547, + "learning_rate": 1.3813328870853032e-05, + "loss": 1.524, "step": 7395 }, { - "epoch": 2.22, - "grad_norm": 20.29804229736328, - "learning_rate": 5.175904580535232e-06, - "loss": 1.9343, + "epoch": 0.93, + "grad_norm": 18.85066032409668, + "learning_rate": 1.3812492155796344e-05, + "loss": 1.1134, "step": 7396 }, { - "epoch": 2.22, - "grad_norm": 13.398682594299316, - "learning_rate": 5.173899969930841e-06, - "loss": 1.1503, + "epoch": 0.93, + "grad_norm": 31.238264083862305, + "learning_rate": 1.3811655440739658e-05, + "loss": 2.4208, "step": 7397 }, { - "epoch": 2.22, - "grad_norm": 24.813034057617188, - "learning_rate": 5.171895359326451e-06, - "loss": 1.0149, + "epoch": 0.93, + "grad_norm": 8.082986831665039, + "learning_rate": 1.3810818725682971e-05, + "loss": 0.1792, "step": 7398 }, { - "epoch": 2.22, - "grad_norm": 23.43954849243164, - "learning_rate": 5.169890748722061e-06, - "loss": 1.2993, + "epoch": 0.93, + "grad_norm": 6.717072486877441, + "learning_rate": 1.3809982010626281e-05, + "loss": 1.0453, "step": 7399 }, { - "epoch": 2.22, - "grad_norm": 14.955592155456543, - "learning_rate": 5.167886138117671e-06, - "loss": 1.3491, + "epoch": 0.93, + "grad_norm": 8.783857345581055, + "learning_rate": 1.3809145295569595e-05, + "loss": 0.2954, "step": 7400 }, { - "epoch": 2.23, - "grad_norm": 15.617820739746094, - "learning_rate": 5.165881527513281e-06, - "loss": 1.1608, + "epoch": 0.93, + "grad_norm": 13.640789031982422, + "learning_rate": 1.3808308580512909e-05, + "loss": 0.9781, "step": 7401 }, { - "epoch": 2.23, - "grad_norm": 18.086448669433594, - "learning_rate": 5.163876916908891e-06, - "loss": 1.4179, + "epoch": 0.93, + "grad_norm": 10.328807830810547, + "learning_rate": 1.3807471865456219e-05, + "loss": 0.3612, "step": 7402 }, { - "epoch": 2.23, - "grad_norm": 7.104720115661621, - "learning_rate": 5.1618723063045005e-06, - "loss": 0.8867, + "epoch": 0.93, + "grad_norm": 17.736278533935547, + "learning_rate": 1.3806635150399533e-05, + "loss": 0.8795, "step": 7403 }, { - "epoch": 2.23, - "grad_norm": 15.77996826171875, - "learning_rate": 5.159867695700111e-06, - "loss": 1.276, + "epoch": 0.93, + "grad_norm": 20.107175827026367, + "learning_rate": 1.3805798435342845e-05, + "loss": 2.9603, "step": 7404 }, { - "epoch": 2.23, - "grad_norm": 28.867000579833984, - "learning_rate": 5.15786308509572e-06, - "loss": 1.6279, + "epoch": 0.93, + "grad_norm": 33.18934631347656, + "learning_rate": 1.3804961720286158e-05, + "loss": 1.4749, "step": 7405 }, { - "epoch": 2.23, - "grad_norm": 26.46651268005371, - "learning_rate": 5.155858474491331e-06, - "loss": 1.2349, + "epoch": 0.93, + "grad_norm": 11.525893211364746, + "learning_rate": 1.380412500522947e-05, + "loss": 0.6962, "step": 7406 }, { - "epoch": 2.23, - "grad_norm": 15.890294075012207, - "learning_rate": 5.153853863886941e-06, - "loss": 2.3934, + "epoch": 0.93, + "grad_norm": 16.433094024658203, + "learning_rate": 1.3803288290172782e-05, + "loss": 1.3322, "step": 7407 }, { - "epoch": 2.23, - "grad_norm": 37.84567642211914, - "learning_rate": 5.15184925328255e-06, - "loss": 1.5725, + "epoch": 0.93, + "grad_norm": 22.894237518310547, + "learning_rate": 1.3802451575116096e-05, + "loss": 4.3966, "step": 7408 }, { - "epoch": 2.23, - "grad_norm": 13.57693862915039, - "learning_rate": 5.14984464267816e-06, - "loss": 1.4944, + "epoch": 0.93, + "grad_norm": 11.382543563842773, + "learning_rate": 1.3801614860059406e-05, + "loss": 3.1286, "step": 7409 }, { - "epoch": 2.23, - "grad_norm": 16.91181755065918, - "learning_rate": 5.147840032073769e-06, - "loss": 1.7839, + "epoch": 0.93, + "grad_norm": 38.796470642089844, + "learning_rate": 1.380077814500272e-05, + "loss": 1.9953, "step": 7410 }, { - "epoch": 2.23, - "grad_norm": 36.64265441894531, - "learning_rate": 5.1458354214693795e-06, - "loss": 2.3358, + "epoch": 0.93, + "grad_norm": 13.195964813232422, + "learning_rate": 1.3799941429946033e-05, + "loss": 2.0658, "step": 7411 }, { - "epoch": 2.23, - "grad_norm": 5.110405921936035, - "learning_rate": 5.14383081086499e-06, - "loss": 0.6339, + "epoch": 0.93, + "grad_norm": 11.612751960754395, + "learning_rate": 1.3799104714889347e-05, + "loss": 2.575, "step": 7412 }, { - "epoch": 2.23, - "grad_norm": 13.546368598937988, - "learning_rate": 5.1418262002606005e-06, - "loss": 2.2864, + "epoch": 0.93, + "grad_norm": 22.106855392456055, + "learning_rate": 1.3798267999832657e-05, + "loss": 0.7899, "step": 7413 }, { - "epoch": 2.23, - "grad_norm": 13.818781852722168, - "learning_rate": 5.13982158965621e-06, - "loss": 1.8962, + "epoch": 0.93, + "grad_norm": 7.899169921875, + "learning_rate": 1.3797431284775971e-05, + "loss": 0.7671, "step": 7414 }, { - "epoch": 2.23, - "grad_norm": 8.545878410339355, - "learning_rate": 5.13781697905182e-06, - "loss": 1.9361, + "epoch": 0.93, + "grad_norm": 33.73710632324219, + "learning_rate": 1.3796594569719285e-05, + "loss": 1.829, "step": 7415 }, { - "epoch": 2.23, - "grad_norm": 22.833742141723633, - "learning_rate": 5.135812368447429e-06, - "loss": 1.869, + "epoch": 0.93, + "grad_norm": 27.88739776611328, + "learning_rate": 1.3795757854662595e-05, + "loss": 1.6476, "step": 7416 }, { - "epoch": 2.23, - "grad_norm": 48.628639221191406, - "learning_rate": 5.133807757843039e-06, - "loss": 1.6338, + "epoch": 0.93, + "grad_norm": 16.797008514404297, + "learning_rate": 1.3794921139605908e-05, + "loss": 1.0908, "step": 7417 }, { - "epoch": 2.23, - "grad_norm": 18.07792091369629, - "learning_rate": 5.13180314723865e-06, - "loss": 1.0774, + "epoch": 0.93, + "grad_norm": 12.012316703796387, + "learning_rate": 1.379408442454922e-05, + "loss": 1.7921, "step": 7418 }, { - "epoch": 2.23, - "grad_norm": 12.404952049255371, - "learning_rate": 5.129798536634259e-06, - "loss": 1.8888, + "epoch": 0.93, + "grad_norm": 17.799327850341797, + "learning_rate": 1.3793247709492534e-05, + "loss": 2.1023, "step": 7419 }, { - "epoch": 2.23, - "grad_norm": 16.634307861328125, - "learning_rate": 5.127793926029869e-06, - "loss": 0.9383, + "epoch": 0.93, + "grad_norm": 10.95718765258789, + "learning_rate": 1.3792410994435846e-05, + "loss": 0.8405, "step": 7420 }, { - "epoch": 2.23, - "grad_norm": 13.299745559692383, - "learning_rate": 5.125789315425479e-06, - "loss": 1.4448, + "epoch": 0.93, + "grad_norm": 18.754972457885742, + "learning_rate": 1.3791574279379158e-05, + "loss": 2.3038, "step": 7421 }, { - "epoch": 2.23, - "grad_norm": 20.849573135375977, - "learning_rate": 5.123784704821089e-06, - "loss": 1.8726, + "epoch": 0.93, + "grad_norm": 12.339506149291992, + "learning_rate": 1.3790737564322472e-05, + "loss": 0.9753, "step": 7422 }, { - "epoch": 2.23, - "grad_norm": 69.56287384033203, - "learning_rate": 5.121780094216699e-06, - "loss": 2.2076, + "epoch": 0.93, + "grad_norm": 12.315550804138184, + "learning_rate": 1.3789900849265782e-05, + "loss": 1.1979, "step": 7423 }, { - "epoch": 2.23, - "grad_norm": 13.498839378356934, - "learning_rate": 5.119775483612309e-06, - "loss": 0.9413, + "epoch": 0.93, + "grad_norm": 8.54345989227295, + "learning_rate": 1.3789064134209096e-05, + "loss": 0.7829, "step": 7424 }, { - "epoch": 2.23, - "grad_norm": 18.645475387573242, - "learning_rate": 5.117770873007919e-06, - "loss": 1.2546, + "epoch": 0.93, + "grad_norm": 14.872376441955566, + "learning_rate": 1.378822741915241e-05, + "loss": 1.2126, "step": 7425 }, { - "epoch": 2.23, - "grad_norm": 24.639263153076172, - "learning_rate": 5.115766262403529e-06, - "loss": 1.5101, + "epoch": 0.93, + "grad_norm": 25.59175682067871, + "learning_rate": 1.3787390704095723e-05, + "loss": 1.7746, "step": 7426 }, { - "epoch": 2.23, - "grad_norm": 9.19277572631836, - "learning_rate": 5.113761651799138e-06, - "loss": 0.789, + "epoch": 0.93, + "grad_norm": 19.434415817260742, + "learning_rate": 1.3786553989039033e-05, + "loss": 1.5769, "step": 7427 }, { - "epoch": 2.23, - "grad_norm": 13.205931663513184, - "learning_rate": 5.111757041194748e-06, - "loss": 1.2832, + "epoch": 0.93, + "grad_norm": 7.455471038818359, + "learning_rate": 1.3785717273982347e-05, + "loss": 0.4915, "step": 7428 }, { - "epoch": 2.23, - "grad_norm": 26.456480026245117, - "learning_rate": 5.109752430590358e-06, - "loss": 2.1971, + "epoch": 0.93, + "grad_norm": 7.419821262359619, + "learning_rate": 1.378488055892566e-05, + "loss": 1.1604, "step": 7429 }, { - "epoch": 2.23, - "grad_norm": 16.407329559326172, - "learning_rate": 5.1077478199859685e-06, - "loss": 1.7726, + "epoch": 0.93, + "grad_norm": 24.802993774414062, + "learning_rate": 1.378404384386897e-05, + "loss": 1.9481, "step": 7430 }, { - "epoch": 2.23, - "grad_norm": 24.08245277404785, - "learning_rate": 5.105743209381579e-06, - "loss": 1.5051, + "epoch": 0.93, + "grad_norm": 16.577354431152344, + "learning_rate": 1.3783207128812284e-05, + "loss": 1.7758, "step": 7431 }, { - "epoch": 2.23, - "grad_norm": 42.648616790771484, - "learning_rate": 5.103738598777188e-06, - "loss": 1.9396, + "epoch": 0.93, + "grad_norm": 15.920984268188477, + "learning_rate": 1.3782370413755596e-05, + "loss": 1.6791, "step": 7432 }, { - "epoch": 2.23, - "grad_norm": 21.788301467895508, - "learning_rate": 5.101733988172798e-06, - "loss": 1.0098, + "epoch": 0.93, + "grad_norm": 23.824922561645508, + "learning_rate": 1.378153369869891e-05, + "loss": 2.0636, "step": 7433 }, { - "epoch": 2.24, - "grad_norm": 19.175148010253906, - "learning_rate": 5.099729377568407e-06, - "loss": 1.4408, + "epoch": 0.93, + "grad_norm": 13.970311164855957, + "learning_rate": 1.3780696983642222e-05, + "loss": 1.3183, "step": 7434 }, { - "epoch": 2.24, - "grad_norm": 33.719520568847656, - "learning_rate": 5.097724766964017e-06, - "loss": 1.6809, + "epoch": 0.93, + "grad_norm": 9.506698608398438, + "learning_rate": 1.3779860268585534e-05, + "loss": 0.5083, "step": 7435 }, { - "epoch": 2.24, - "grad_norm": 22.389249801635742, - "learning_rate": 5.095720156359627e-06, - "loss": 1.5647, + "epoch": 0.93, + "grad_norm": 13.299325942993164, + "learning_rate": 1.3779023553528847e-05, + "loss": 1.4974, "step": 7436 }, { - "epoch": 2.24, - "grad_norm": 21.30531120300293, - "learning_rate": 5.0937155457552374e-06, - "loss": 1.5027, + "epoch": 0.93, + "grad_norm": 52.766910552978516, + "learning_rate": 1.3778186838472158e-05, + "loss": 0.9813, "step": 7437 }, { - "epoch": 2.24, - "grad_norm": 26.76338768005371, - "learning_rate": 5.0917109351508475e-06, - "loss": 1.3216, + "epoch": 0.93, + "grad_norm": 10.32703971862793, + "learning_rate": 1.3777350123415471e-05, + "loss": 0.9801, "step": 7438 }, { - "epoch": 2.24, - "grad_norm": 13.98495101928711, - "learning_rate": 5.089706324546458e-06, - "loss": 1.1699, + "epoch": 0.93, + "grad_norm": 8.354700088500977, + "learning_rate": 1.3776513408358785e-05, + "loss": 0.8647, "step": 7439 }, { - "epoch": 2.24, - "grad_norm": 6.971924304962158, - "learning_rate": 5.087701713942067e-06, - "loss": 0.8873, - "step": 7440 - }, - { - "epoch": 2.24, - "eval_loss": 0.1698659360408783, - "eval_runtime": 43.7492, - "eval_samples_per_second": 33.806, - "eval_steps_per_second": 33.806, + "epoch": 0.93, + "grad_norm": 29.22992515563965, + "learning_rate": 1.3775676693302099e-05, + "loss": 1.803, "step": 7440 }, { - "epoch": 2.24, - "grad_norm": 15.718365669250488, - "learning_rate": 5.085697103337677e-06, - "loss": 1.535, + "epoch": 0.93, + "grad_norm": 13.670393943786621, + "learning_rate": 1.3774839978245409e-05, + "loss": 1.4286, "step": 7441 }, { - "epoch": 2.24, - "grad_norm": 23.17116928100586, - "learning_rate": 5.083692492733286e-06, - "loss": 1.6606, + "epoch": 0.93, + "grad_norm": 10.728265762329102, + "learning_rate": 1.3774003263188723e-05, + "loss": 0.8971, "step": 7442 }, { - "epoch": 2.24, - "grad_norm": 17.73175621032715, - "learning_rate": 5.081687882128897e-06, - "loss": 1.6878, + "epoch": 0.93, + "grad_norm": 16.651935577392578, + "learning_rate": 1.3773166548132035e-05, + "loss": 1.7557, "step": 7443 }, { - "epoch": 2.24, - "grad_norm": 19.423385620117188, - "learning_rate": 5.079683271524507e-06, - "loss": 1.3193, + "epoch": 0.93, + "grad_norm": 71.80274200439453, + "learning_rate": 1.3772329833075346e-05, + "loss": 4.096, "step": 7444 }, { - "epoch": 2.24, - "grad_norm": 11.797534942626953, - "learning_rate": 5.077678660920116e-06, - "loss": 0.881, + "epoch": 0.93, + "grad_norm": 25.78373146057129, + "learning_rate": 1.377149311801866e-05, + "loss": 1.9425, "step": 7445 }, { - "epoch": 2.24, - "grad_norm": 15.763280868530273, - "learning_rate": 5.0756740503157265e-06, - "loss": 1.4587, + "epoch": 0.93, + "grad_norm": 11.401162147521973, + "learning_rate": 1.3770656402961972e-05, + "loss": 1.6871, "step": 7446 }, { - "epoch": 2.24, - "grad_norm": 8.128504753112793, - "learning_rate": 5.073669439711337e-06, - "loss": 1.0937, + "epoch": 0.93, + "grad_norm": 7.95670223236084, + "learning_rate": 1.3769819687905286e-05, + "loss": 0.9672, "step": 7447 }, { - "epoch": 2.24, - "grad_norm": 31.446063995361328, - "learning_rate": 5.071664829106946e-06, - "loss": 1.4584, + "epoch": 0.93, + "grad_norm": 8.02076530456543, + "learning_rate": 1.3768982972848598e-05, + "loss": 0.2893, "step": 7448 }, { - "epoch": 2.24, - "grad_norm": 13.80343246459961, - "learning_rate": 5.069660218502557e-06, - "loss": 0.7605, + "epoch": 0.93, + "grad_norm": 13.573423385620117, + "learning_rate": 1.376814625779191e-05, + "loss": 2.2369, "step": 7449 }, { - "epoch": 2.24, - "grad_norm": 10.658504486083984, - "learning_rate": 5.067655607898167e-06, - "loss": 0.8793, + "epoch": 0.93, + "grad_norm": 10.555697441101074, + "learning_rate": 1.3767309542735223e-05, + "loss": 0.9838, "step": 7450 }, { - "epoch": 2.24, - "grad_norm": 12.806272506713867, - "learning_rate": 5.065650997293776e-06, - "loss": 0.9486, + "epoch": 0.94, + "grad_norm": 10.912386894226074, + "learning_rate": 1.3766472827678534e-05, + "loss": 0.9101, "step": 7451 }, { - "epoch": 2.24, - "grad_norm": 9.60067081451416, - "learning_rate": 5.063646386689386e-06, - "loss": 1.0777, + "epoch": 0.94, + "grad_norm": 7.684079170227051, + "learning_rate": 1.3765636112621847e-05, + "loss": 0.5299, "step": 7452 }, { - "epoch": 2.24, - "grad_norm": 10.402573585510254, - "learning_rate": 5.061641776084995e-06, - "loss": 1.0899, + "epoch": 0.94, + "grad_norm": 86.8624496459961, + "learning_rate": 1.3764799397565161e-05, + "loss": 2.2287, "step": 7453 }, { - "epoch": 2.24, - "grad_norm": 24.5078182220459, - "learning_rate": 5.0596371654806055e-06, - "loss": 1.6828, + "epoch": 0.94, + "grad_norm": 41.78707504272461, + "learning_rate": 1.3763962682508475e-05, + "loss": 3.1811, "step": 7454 }, { - "epoch": 2.24, - "grad_norm": 20.464935302734375, - "learning_rate": 5.057632554876216e-06, - "loss": 1.5368, + "epoch": 0.94, + "grad_norm": 13.535687446594238, + "learning_rate": 1.3763125967451785e-05, + "loss": 1.2221, "step": 7455 }, { - "epoch": 2.24, - "grad_norm": 13.39626693725586, - "learning_rate": 5.055627944271826e-06, - "loss": 0.9746, + "epoch": 0.94, + "grad_norm": 9.792207717895508, + "learning_rate": 1.3762289252395098e-05, + "loss": 1.3552, "step": 7456 }, { - "epoch": 2.24, - "grad_norm": 18.0714168548584, - "learning_rate": 5.053623333667436e-06, - "loss": 1.0704, + "epoch": 0.94, + "grad_norm": 68.6406478881836, + "learning_rate": 1.376145253733841e-05, + "loss": 2.4895, "step": 7457 }, { - "epoch": 2.24, - "grad_norm": 33.03631591796875, - "learning_rate": 5.051618723063045e-06, - "loss": 1.4787, + "epoch": 0.94, + "grad_norm": 15.665657043457031, + "learning_rate": 1.3760615822281722e-05, + "loss": 1.4168, "step": 7458 }, { - "epoch": 2.24, - "grad_norm": 101.09317779541016, - "learning_rate": 5.049614112458655e-06, - "loss": 1.9596, + "epoch": 0.94, + "grad_norm": 7.849665641784668, + "learning_rate": 1.3759779107225036e-05, + "loss": 0.8555, "step": 7459 }, { - "epoch": 2.24, - "grad_norm": 27.06547737121582, - "learning_rate": 5.047609501854265e-06, - "loss": 0.824, + "epoch": 0.94, + "grad_norm": 20.467613220214844, + "learning_rate": 1.3758942392168348e-05, + "loss": 3.5343, "step": 7460 }, { - "epoch": 2.24, - "grad_norm": 26.54560089111328, - "learning_rate": 5.045604891249875e-06, - "loss": 1.0234, + "epoch": 0.94, + "grad_norm": 12.852072715759277, + "learning_rate": 1.3758105677111662e-05, + "loss": 0.4474, "step": 7461 }, { - "epoch": 2.24, - "grad_norm": 29.58421516418457, - "learning_rate": 5.043600280645485e-06, - "loss": 1.8553, + "epoch": 0.94, + "grad_norm": 30.18389320373535, + "learning_rate": 1.3757268962054972e-05, + "loss": 1.0194, "step": 7462 }, { - "epoch": 2.24, - "grad_norm": 11.534109115600586, - "learning_rate": 5.041595670041095e-06, - "loss": 0.9154, + "epoch": 0.94, + "grad_norm": 3.9626660346984863, + "learning_rate": 1.3756432246998285e-05, + "loss": 0.2651, "step": 7463 }, { - "epoch": 2.24, - "grad_norm": 23.17076301574707, - "learning_rate": 5.039591059436705e-06, - "loss": 2.0129, + "epoch": 0.94, + "grad_norm": 23.92144775390625, + "learning_rate": 1.3755595531941599e-05, + "loss": 2.8444, "step": 7464 }, { - "epoch": 2.24, - "grad_norm": 19.68193244934082, - "learning_rate": 5.037586448832315e-06, - "loss": 1.0759, + "epoch": 0.94, + "grad_norm": 13.733891487121582, + "learning_rate": 1.375475881688491e-05, + "loss": 2.9688, "step": 7465 }, { - "epoch": 2.24, - "grad_norm": 21.87065887451172, - "learning_rate": 5.035581838227924e-06, - "loss": 0.9128, + "epoch": 0.94, + "grad_norm": 9.05319595336914, + "learning_rate": 1.3753922101828223e-05, + "loss": 1.8021, "step": 7466 }, { - "epoch": 2.25, - "grad_norm": 21.481231689453125, - "learning_rate": 5.033577227623535e-06, - "loss": 1.7024, + "epoch": 0.94, + "grad_norm": 9.432863235473633, + "learning_rate": 1.3753085386771537e-05, + "loss": 0.703, "step": 7467 }, { - "epoch": 2.25, - "grad_norm": 11.491585731506348, - "learning_rate": 5.031572617019145e-06, - "loss": 1.0131, + "epoch": 0.94, + "grad_norm": 16.356163024902344, + "learning_rate": 1.375224867171485e-05, + "loss": 1.6557, "step": 7468 }, { - "epoch": 2.25, - "grad_norm": 14.809111595153809, - "learning_rate": 5.029568006414754e-06, - "loss": 1.3544, + "epoch": 0.94, + "grad_norm": 19.758914947509766, + "learning_rate": 1.375141195665816e-05, + "loss": 2.7752, "step": 7469 }, { - "epoch": 2.25, - "grad_norm": 31.751813888549805, - "learning_rate": 5.027563395810364e-06, - "loss": 1.477, + "epoch": 0.94, + "grad_norm": 12.722827911376953, + "learning_rate": 1.3750575241601474e-05, + "loss": 2.161, "step": 7470 }, { - "epoch": 2.25, - "grad_norm": 24.474321365356445, - "learning_rate": 5.025558785205974e-06, - "loss": 1.5531, + "epoch": 0.94, + "grad_norm": 12.439787864685059, + "learning_rate": 1.3749738526544786e-05, + "loss": 0.9437, "step": 7471 }, { - "epoch": 2.25, - "grad_norm": 182.55990600585938, - "learning_rate": 5.023554174601584e-06, - "loss": 2.9409, + "epoch": 0.94, + "grad_norm": 6.628915309906006, + "learning_rate": 1.3748901811488098e-05, + "loss": 1.4233, "step": 7472 }, { - "epoch": 2.25, - "grad_norm": 17.08757972717285, - "learning_rate": 5.0215495639971945e-06, - "loss": 1.093, + "epoch": 0.94, + "grad_norm": 55.125999450683594, + "learning_rate": 1.3748065096431412e-05, + "loss": 3.3859, "step": 7473 }, { - "epoch": 2.25, - "grad_norm": 18.484947204589844, - "learning_rate": 5.019544953392805e-06, - "loss": 1.7707, + "epoch": 0.94, + "grad_norm": 15.76113510131836, + "learning_rate": 1.3747228381374724e-05, + "loss": 1.7849, "step": 7474 }, { - "epoch": 2.25, - "grad_norm": 14.49228572845459, - "learning_rate": 5.017540342788414e-06, - "loss": 1.4408, + "epoch": 0.94, + "grad_norm": 12.657258033752441, + "learning_rate": 1.3746391666318037e-05, + "loss": 0.6101, "step": 7475 }, { - "epoch": 2.25, - "grad_norm": 10.136235237121582, - "learning_rate": 5.015535732184024e-06, - "loss": 1.0075, + "epoch": 0.94, + "grad_norm": 15.167543411254883, + "learning_rate": 1.3745554951261348e-05, + "loss": 1.2373, "step": 7476 }, { - "epoch": 2.25, - "grad_norm": 63.96528244018555, - "learning_rate": 5.013531121579633e-06, - "loss": 2.0736, + "epoch": 0.94, + "grad_norm": 17.01559829711914, + "learning_rate": 1.3744718236204661e-05, + "loss": 2.4644, "step": 7477 }, { - "epoch": 2.25, - "grad_norm": 10.56208610534668, - "learning_rate": 5.011526510975243e-06, - "loss": 1.0639, + "epoch": 0.94, + "grad_norm": 12.297941207885742, + "learning_rate": 1.3743881521147975e-05, + "loss": 0.6074, "step": 7478 }, { - "epoch": 2.25, - "grad_norm": 11.87157154083252, - "learning_rate": 5.0095219003708525e-06, - "loss": 1.0202, + "epoch": 0.94, + "grad_norm": 12.612531661987305, + "learning_rate": 1.3743044806091285e-05, + "loss": 1.2503, "step": 7479 }, { - "epoch": 2.25, - "grad_norm": 11.656047821044922, - "learning_rate": 5.0075172897664634e-06, - "loss": 1.3296, + "epoch": 0.94, + "grad_norm": 9.722151756286621, + "learning_rate": 1.3742208091034599e-05, + "loss": 1.8202, "step": 7480 }, { - "epoch": 2.25, - "grad_norm": 20.969064712524414, - "learning_rate": 5.0055126791620735e-06, - "loss": 2.0833, + "epoch": 0.94, + "grad_norm": 14.149904251098633, + "learning_rate": 1.3741371375977913e-05, + "loss": 1.3875, "step": 7481 }, { - "epoch": 2.25, - "grad_norm": 12.001461029052734, - "learning_rate": 5.003508068557683e-06, - "loss": 1.13, + "epoch": 0.94, + "grad_norm": 7.271524429321289, + "learning_rate": 1.3740534660921226e-05, + "loss": 1.1477, "step": 7482 }, { - "epoch": 2.25, - "grad_norm": 17.097755432128906, - "learning_rate": 5.001503457953293e-06, - "loss": 1.3623, + "epoch": 0.94, + "grad_norm": 23.882055282592773, + "learning_rate": 1.3739697945864536e-05, + "loss": 2.6625, "step": 7483 }, { - "epoch": 2.25, - "grad_norm": 10.47364330291748, - "learning_rate": 4.999498847348903e-06, - "loss": 0.7631, + "epoch": 0.94, + "grad_norm": 17.38764762878418, + "learning_rate": 1.373886123080785e-05, + "loss": 1.6953, "step": 7484 }, { - "epoch": 2.25, - "grad_norm": 11.359280586242676, - "learning_rate": 4.997494236744513e-06, - "loss": 0.6724, + "epoch": 0.94, + "grad_norm": 11.968402862548828, + "learning_rate": 1.3738024515751162e-05, + "loss": 1.5998, "step": 7485 }, { - "epoch": 2.25, - "grad_norm": 10.474048614501953, - "learning_rate": 4.995489626140122e-06, - "loss": 0.5766, + "epoch": 0.94, + "grad_norm": 8.602836608886719, + "learning_rate": 1.3737187800694474e-05, + "loss": 1.1245, "step": 7486 }, { - "epoch": 2.25, - "grad_norm": 14.837994575500488, - "learning_rate": 4.993485015535733e-06, - "loss": 1.1248, + "epoch": 0.94, + "grad_norm": 6.8582234382629395, + "learning_rate": 1.3736351085637788e-05, + "loss": 1.5747, "step": 7487 }, { - "epoch": 2.25, - "grad_norm": 18.054821014404297, - "learning_rate": 4.991480404931342e-06, - "loss": 1.0936, + "epoch": 0.94, + "grad_norm": 21.258777618408203, + "learning_rate": 1.37355143705811e-05, + "loss": 1.6977, "step": 7488 }, { - "epoch": 2.25, - "grad_norm": 9.805904388427734, - "learning_rate": 4.9894757943269525e-06, - "loss": 0.8442, + "epoch": 0.94, + "grad_norm": 53.06081771850586, + "learning_rate": 1.3734677655524413e-05, + "loss": 1.1751, "step": 7489 }, { - "epoch": 2.25, - "grad_norm": 44.7296257019043, - "learning_rate": 4.987471183722563e-06, - "loss": 2.3775, + "epoch": 0.94, + "grad_norm": 41.308082580566406, + "learning_rate": 1.3733840940467724e-05, + "loss": 2.9062, "step": 7490 }, { - "epoch": 2.25, - "grad_norm": 13.354528427124023, - "learning_rate": 4.985466573118173e-06, - "loss": 1.3387, + "epoch": 0.94, + "grad_norm": 14.199646949768066, + "learning_rate": 1.3733004225411037e-05, + "loss": 2.1871, "step": 7491 }, { - "epoch": 2.25, - "grad_norm": 24.598257064819336, - "learning_rate": 4.983461962513782e-06, - "loss": 2.0694, + "epoch": 0.94, + "grad_norm": 14.047327041625977, + "learning_rate": 1.373216751035435e-05, + "loss": 2.9026, "step": 7492 }, { - "epoch": 2.25, - "grad_norm": 28.982959747314453, - "learning_rate": 4.981457351909392e-06, - "loss": 1.3952, + "epoch": 0.94, + "grad_norm": 23.0893497467041, + "learning_rate": 1.3731330795297661e-05, + "loss": 1.0065, "step": 7493 }, { - "epoch": 2.25, - "grad_norm": 16.613788604736328, - "learning_rate": 4.979452741305002e-06, - "loss": 1.1931, + "epoch": 0.94, + "grad_norm": 15.064163208007812, + "learning_rate": 1.3730494080240975e-05, + "loss": 2.4648, "step": 7494 }, { - "epoch": 2.25, - "grad_norm": 11.446743965148926, - "learning_rate": 4.977448130700611e-06, - "loss": 0.8703, + "epoch": 0.94, + "grad_norm": 8.875805854797363, + "learning_rate": 1.3729657365184288e-05, + "loss": 1.6665, "step": 7495 }, { - "epoch": 2.25, - "grad_norm": 14.696474075317383, - "learning_rate": 4.975443520096221e-06, - "loss": 1.3185, + "epoch": 0.94, + "grad_norm": 14.805878639221191, + "learning_rate": 1.37288206501276e-05, + "loss": 2.3215, "step": 7496 }, { - "epoch": 2.25, - "grad_norm": 33.95481491088867, - "learning_rate": 4.9734389094918315e-06, - "loss": 1.3152, + "epoch": 0.94, + "grad_norm": 9.885756492614746, + "learning_rate": 1.3727983935070912e-05, + "loss": 1.0313, "step": 7497 }, { - "epoch": 2.25, - "grad_norm": 9.814773559570312, - "learning_rate": 4.9714342988874416e-06, - "loss": 1.4784, + "epoch": 0.94, + "grad_norm": 18.31471824645996, + "learning_rate": 1.3727147220014226e-05, + "loss": 2.0022, "step": 7498 }, { - "epoch": 2.25, - "grad_norm": 31.25359344482422, - "learning_rate": 4.969429688283051e-06, - "loss": 1.9966, + "epoch": 0.94, + "grad_norm": 15.420527458190918, + "learning_rate": 1.3726310504957538e-05, + "loss": 0.7821, "step": 7499 }, { - "epoch": 2.25, - "grad_norm": 28.215229034423828, - "learning_rate": 4.967425077678662e-06, - "loss": 2.6453, + "epoch": 0.94, + "grad_norm": 7.836379051208496, + "learning_rate": 1.372547378990085e-05, + "loss": 1.149, "step": 7500 }, { - "epoch": 2.26, - "grad_norm": 16.87133026123047, - "learning_rate": 4.965420467074271e-06, - "loss": 1.0434, + "epoch": 0.94, + "grad_norm": 9.965003967285156, + "learning_rate": 1.3724637074844162e-05, + "loss": 0.1831, "step": 7501 }, { - "epoch": 2.26, - "grad_norm": 16.18372344970703, - "learning_rate": 4.963415856469881e-06, - "loss": 1.9418, + "epoch": 0.94, + "grad_norm": 19.751415252685547, + "learning_rate": 1.3723800359787475e-05, + "loss": 0.9685, "step": 7502 }, { - "epoch": 2.26, - "grad_norm": 10.651266098022461, - "learning_rate": 4.961411245865491e-06, - "loss": 1.5416, + "epoch": 0.94, + "grad_norm": 22.532337188720703, + "learning_rate": 1.3722963644730789e-05, + "loss": 1.5096, "step": 7503 }, { - "epoch": 2.26, - "grad_norm": 37.711185455322266, - "learning_rate": 4.959406635261101e-06, - "loss": 1.3423, + "epoch": 0.94, + "grad_norm": 37.046485900878906, + "learning_rate": 1.37221269296741e-05, + "loss": 1.573, "step": 7504 }, { - "epoch": 2.26, - "grad_norm": 21.620492935180664, - "learning_rate": 4.9574020246567105e-06, - "loss": 1.435, + "epoch": 0.94, + "grad_norm": 27.697509765625, + "learning_rate": 1.3721290214617413e-05, + "loss": 1.5354, "step": 7505 }, { - "epoch": 2.26, - "grad_norm": 52.260536193847656, - "learning_rate": 4.9553974140523205e-06, - "loss": 1.8419, + "epoch": 0.94, + "grad_norm": 8.985095024108887, + "learning_rate": 1.3720453499560727e-05, + "loss": 1.4237, "step": 7506 }, { - "epoch": 2.26, - "grad_norm": 21.462879180908203, - "learning_rate": 4.953392803447931e-06, - "loss": 0.8966, + "epoch": 0.94, + "grad_norm": 5.892858028411865, + "learning_rate": 1.3719616784504037e-05, + "loss": 1.0621, "step": 7507 }, { - "epoch": 2.26, - "grad_norm": 9.330449104309082, - "learning_rate": 4.951388192843541e-06, - "loss": 0.8809, + "epoch": 0.94, + "grad_norm": 14.162527084350586, + "learning_rate": 1.371878006944735e-05, + "loss": 3.3544, "step": 7508 }, { - "epoch": 2.26, - "grad_norm": 31.02956771850586, - "learning_rate": 4.949383582239151e-06, - "loss": 1.8236, + "epoch": 0.94, + "grad_norm": 21.386547088623047, + "learning_rate": 1.3717943354390664e-05, + "loss": 2.5531, "step": 7509 }, { - "epoch": 2.26, - "grad_norm": 19.69917869567871, - "learning_rate": 4.94737897163476e-06, - "loss": 1.4695, + "epoch": 0.94, + "grad_norm": 6.791244029998779, + "learning_rate": 1.3717106639333976e-05, + "loss": 1.5847, "step": 7510 }, { - "epoch": 2.26, - "grad_norm": 13.709826469421387, - "learning_rate": 4.94537436103037e-06, - "loss": 0.9547, + "epoch": 0.94, + "grad_norm": 15.625334739685059, + "learning_rate": 1.3716269924277288e-05, + "loss": 2.3537, "step": 7511 }, { - "epoch": 2.26, - "grad_norm": 14.561116218566895, - "learning_rate": 4.94336975042598e-06, - "loss": 2.0196, + "epoch": 0.94, + "grad_norm": 14.458335876464844, + "learning_rate": 1.3715433209220602e-05, + "loss": 1.6641, "step": 7512 }, { - "epoch": 2.26, - "grad_norm": 12.211623191833496, - "learning_rate": 4.94136513982159e-06, - "loss": 1.0027, + "epoch": 0.94, + "grad_norm": 7.3591389656066895, + "learning_rate": 1.3714596494163914e-05, + "loss": 0.5241, "step": 7513 }, { - "epoch": 2.26, - "grad_norm": 94.37039947509766, - "learning_rate": 4.9393605292171995e-06, - "loss": 1.9356, + "epoch": 0.94, + "grad_norm": 11.480550765991211, + "learning_rate": 1.3713759779107226e-05, + "loss": 1.361, "step": 7514 }, { - "epoch": 2.26, - "grad_norm": 14.454056739807129, - "learning_rate": 4.9373559186128105e-06, - "loss": 1.0738, + "epoch": 0.94, + "grad_norm": 19.844825744628906, + "learning_rate": 1.3712923064050538e-05, + "loss": 1.3203, "step": 7515 }, { - "epoch": 2.26, - "grad_norm": 88.53865051269531, - "learning_rate": 4.93535130800842e-06, - "loss": 1.7283, + "epoch": 0.94, + "grad_norm": 32.37255096435547, + "learning_rate": 1.3712086348993851e-05, + "loss": 2.0873, "step": 7516 }, { - "epoch": 2.26, - "grad_norm": 225.10995483398438, - "learning_rate": 4.93334669740403e-06, - "loss": 1.8414, + "epoch": 0.94, + "grad_norm": 11.962457656860352, + "learning_rate": 1.3711249633937165e-05, + "loss": 1.2032, "step": 7517 }, { - "epoch": 2.26, - "grad_norm": 16.295265197753906, - "learning_rate": 4.93134208679964e-06, - "loss": 1.7194, + "epoch": 0.94, + "grad_norm": 11.72451114654541, + "learning_rate": 1.3710412918880475e-05, + "loss": 1.5265, "step": 7518 }, { - "epoch": 2.26, - "grad_norm": 102.90641021728516, - "learning_rate": 4.929337476195249e-06, - "loss": 1.1804, + "epoch": 0.94, + "grad_norm": 11.540800094604492, + "learning_rate": 1.3709576203823789e-05, + "loss": 1.6858, "step": 7519 }, { - "epoch": 2.26, - "grad_norm": 22.84174156188965, - "learning_rate": 4.927332865590859e-06, - "loss": 1.0631, + "epoch": 0.94, + "grad_norm": 4.787374973297119, + "learning_rate": 1.3708739488767102e-05, + "loss": 0.5134, "step": 7520 }, { - "epoch": 2.26, - "grad_norm": 29.34662628173828, - "learning_rate": 4.925328254986469e-06, - "loss": 1.2535, + "epoch": 0.94, + "grad_norm": 20.71868133544922, + "learning_rate": 1.3707902773710413e-05, + "loss": 1.6976, "step": 7521 }, { - "epoch": 2.26, - "grad_norm": 11.276557922363281, - "learning_rate": 4.923323644382079e-06, - "loss": 1.1659, + "epoch": 0.94, + "grad_norm": 5.886478424072266, + "learning_rate": 1.3707066058653726e-05, + "loss": 0.7904, "step": 7522 }, { - "epoch": 2.26, - "grad_norm": 14.418801307678223, - "learning_rate": 4.921319033777689e-06, - "loss": 1.9511, + "epoch": 0.94, + "grad_norm": 21.81075096130371, + "learning_rate": 1.370622934359704e-05, + "loss": 3.5599, "step": 7523 }, { - "epoch": 2.26, - "grad_norm": 20.165433883666992, - "learning_rate": 4.9193144231732995e-06, - "loss": 1.258, + "epoch": 0.94, + "grad_norm": 9.835027694702148, + "learning_rate": 1.3705392628540352e-05, + "loss": 0.9441, "step": 7524 }, { - "epoch": 2.26, - "grad_norm": 15.483535766601562, - "learning_rate": 4.917309812568909e-06, - "loss": 1.0741, + "epoch": 0.94, + "grad_norm": 21.744504928588867, + "learning_rate": 1.3704555913483664e-05, + "loss": 2.4696, "step": 7525 }, { - "epoch": 2.26, - "grad_norm": 23.793718338012695, - "learning_rate": 4.915305201964519e-06, - "loss": 1.4441, + "epoch": 0.94, + "grad_norm": 52.457340240478516, + "learning_rate": 1.3703719198426978e-05, + "loss": 1.0649, "step": 7526 }, { - "epoch": 2.26, - "grad_norm": 23.1822452545166, - "learning_rate": 4.913300591360129e-06, - "loss": 1.3305, + "epoch": 0.94, + "grad_norm": 13.692154884338379, + "learning_rate": 1.370288248337029e-05, + "loss": 0.7034, "step": 7527 }, { - "epoch": 2.26, - "grad_norm": 18.903297424316406, - "learning_rate": 4.911295980755739e-06, - "loss": 1.1903, + "epoch": 0.94, + "grad_norm": 17.706499099731445, + "learning_rate": 1.3702045768313602e-05, + "loss": 2.0586, "step": 7528 }, { - "epoch": 2.26, - "grad_norm": 18.78779411315918, - "learning_rate": 4.909291370151348e-06, - "loss": 1.7469, + "epoch": 0.94, + "grad_norm": 7.35901403427124, + "learning_rate": 1.3701209053256913e-05, + "loss": 1.4902, "step": 7529 }, { - "epoch": 2.26, - "grad_norm": 15.670270919799805, - "learning_rate": 4.907286759546958e-06, - "loss": 1.1513, + "epoch": 0.95, + "grad_norm": 9.223427772521973, + "learning_rate": 1.3700372338200227e-05, + "loss": 1.7911, "step": 7530 }, { - "epoch": 2.26, - "grad_norm": 22.992828369140625, - "learning_rate": 4.905282148942568e-06, - "loss": 0.9966, + "epoch": 0.95, + "grad_norm": 21.43293571472168, + "learning_rate": 1.369953562314354e-05, + "loss": 1.225, "step": 7531 }, { - "epoch": 2.26, - "grad_norm": 16.692344665527344, - "learning_rate": 4.9032775383381785e-06, - "loss": 1.3511, + "epoch": 0.95, + "grad_norm": 14.161319732666016, + "learning_rate": 1.3698698908086851e-05, + "loss": 1.5195, "step": 7532 }, { - "epoch": 2.26, - "grad_norm": 17.386524200439453, - "learning_rate": 4.901272927733789e-06, - "loss": 1.4587, + "epoch": 0.95, + "grad_norm": 16.956146240234375, + "learning_rate": 1.3697862193030165e-05, + "loss": 1.6587, "step": 7533 }, { - "epoch": 2.27, - "grad_norm": 18.931062698364258, - "learning_rate": 4.899268317129398e-06, - "loss": 2.273, + "epoch": 0.95, + "grad_norm": 3.7091715335845947, + "learning_rate": 1.3697025477973478e-05, + "loss": 0.1733, "step": 7534 }, { - "epoch": 2.27, - "grad_norm": 86.52537536621094, - "learning_rate": 4.897263706525008e-06, - "loss": 3.4529, + "epoch": 0.95, + "grad_norm": 5.567541599273682, + "learning_rate": 1.3696188762916789e-05, + "loss": 0.3337, "step": 7535 }, { - "epoch": 2.27, - "grad_norm": 9.628124237060547, - "learning_rate": 4.895259095920617e-06, - "loss": 0.8626, + "epoch": 0.95, + "grad_norm": 16.873165130615234, + "learning_rate": 1.3695352047860102e-05, + "loss": 1.8054, "step": 7536 }, { - "epoch": 2.27, - "grad_norm": 10.610788345336914, - "learning_rate": 4.893254485316228e-06, - "loss": 1.2612, + "epoch": 0.95, + "grad_norm": 10.89290714263916, + "learning_rate": 1.3694515332803416e-05, + "loss": 0.8305, "step": 7537 }, { - "epoch": 2.27, - "grad_norm": 15.068879127502441, - "learning_rate": 4.891249874711837e-06, - "loss": 1.2702, + "epoch": 0.95, + "grad_norm": 43.21706008911133, + "learning_rate": 1.3693678617746728e-05, + "loss": 1.1597, "step": 7538 }, { - "epoch": 2.27, - "grad_norm": 25.841157913208008, - "learning_rate": 4.889245264107447e-06, - "loss": 1.5596, + "epoch": 0.95, + "grad_norm": 5.436285972595215, + "learning_rate": 1.369284190269004e-05, + "loss": 0.44, "step": 7539 }, { - "epoch": 2.27, - "grad_norm": 13.19726276397705, - "learning_rate": 4.8872406535030575e-06, - "loss": 1.1938, + "epoch": 0.95, + "grad_norm": 13.519515037536621, + "learning_rate": 1.3692005187633353e-05, + "loss": 1.7031, "step": 7540 }, { - "epoch": 2.27, - "grad_norm": 30.624975204467773, - "learning_rate": 4.8852360428986676e-06, - "loss": 2.6475, + "epoch": 0.95, + "grad_norm": 21.97677993774414, + "learning_rate": 1.3691168472576665e-05, + "loss": 1.8174, "step": 7541 }, { - "epoch": 2.27, - "grad_norm": 17.774322509765625, - "learning_rate": 4.883231432294277e-06, - "loss": 0.8152, + "epoch": 0.95, + "grad_norm": 35.823272705078125, + "learning_rate": 1.3690331757519977e-05, + "loss": 2.8414, "step": 7542 }, { - "epoch": 2.27, - "grad_norm": 13.231738090515137, - "learning_rate": 4.881226821689887e-06, - "loss": 1.0322, + "epoch": 0.95, + "grad_norm": 10.470995903015137, + "learning_rate": 1.368949504246329e-05, + "loss": 1.1076, "step": 7543 }, { - "epoch": 2.27, - "grad_norm": 18.530324935913086, - "learning_rate": 4.879222211085497e-06, - "loss": 1.6454, + "epoch": 0.95, + "grad_norm": 19.127220153808594, + "learning_rate": 1.3688658327406603e-05, + "loss": 1.3538, "step": 7544 }, { - "epoch": 2.27, - "grad_norm": 32.340023040771484, - "learning_rate": 4.877217600481107e-06, - "loss": 1.1846, + "epoch": 0.95, + "grad_norm": 57.466224670410156, + "learning_rate": 1.3687821612349917e-05, + "loss": 2.9337, "step": 7545 }, { - "epoch": 2.27, - "grad_norm": 13.608917236328125, - "learning_rate": 4.875212989876717e-06, - "loss": 1.307, + "epoch": 0.95, + "grad_norm": 35.5211067199707, + "learning_rate": 1.3686984897293227e-05, + "loss": 2.0792, "step": 7546 }, { - "epoch": 2.27, - "grad_norm": 33.83692932128906, - "learning_rate": 4.873208379272326e-06, - "loss": 1.4187, + "epoch": 0.95, + "grad_norm": 5.022341728210449, + "learning_rate": 1.368614818223654e-05, + "loss": 0.4209, "step": 7547 }, { - "epoch": 2.27, - "grad_norm": 41.42520523071289, - "learning_rate": 4.8712037686679365e-06, - "loss": 1.5752, + "epoch": 0.95, + "grad_norm": 13.89211654663086, + "learning_rate": 1.3685311467179854e-05, + "loss": 1.5118, "step": 7548 }, { - "epoch": 2.27, - "grad_norm": 7.894377708435059, - "learning_rate": 4.8691991580635466e-06, - "loss": 1.1328, + "epoch": 0.95, + "grad_norm": 12.629494667053223, + "learning_rate": 1.3684474752123164e-05, + "loss": 1.1239, "step": 7549 }, { - "epoch": 2.27, - "grad_norm": 22.25946807861328, - "learning_rate": 4.867194547459157e-06, - "loss": 2.0451, + "epoch": 0.95, + "grad_norm": 16.415287017822266, + "learning_rate": 1.3683638037066478e-05, + "loss": 2.3969, "step": 7550 }, { - "epoch": 2.27, - "grad_norm": 49.385562896728516, - "learning_rate": 4.865189936854766e-06, - "loss": 2.2722, + "epoch": 0.95, + "grad_norm": 22.39629554748535, + "learning_rate": 1.3682801322009792e-05, + "loss": 2.0676, "step": 7551 }, { - "epoch": 2.27, - "grad_norm": 13.534929275512695, - "learning_rate": 4.863185326250377e-06, - "loss": 0.7596, + "epoch": 0.95, + "grad_norm": 17.810590744018555, + "learning_rate": 1.3681964606953104e-05, + "loss": 2.9491, "step": 7552 }, { - "epoch": 2.27, - "grad_norm": 20.882999420166016, - "learning_rate": 4.861180715645986e-06, - "loss": 1.9043, + "epoch": 0.95, + "grad_norm": 12.154153823852539, + "learning_rate": 1.3681127891896416e-05, + "loss": 0.3778, "step": 7553 }, { - "epoch": 2.27, - "grad_norm": 9.814292907714844, - "learning_rate": 4.859176105041596e-06, - "loss": 1.0012, + "epoch": 0.95, + "grad_norm": 7.335411071777344, + "learning_rate": 1.3680291176839728e-05, + "loss": 1.0368, "step": 7554 }, { - "epoch": 2.27, - "grad_norm": 23.16642951965332, - "learning_rate": 4.857171494437206e-06, - "loss": 1.4161, + "epoch": 0.95, + "grad_norm": 37.25257873535156, + "learning_rate": 1.3679454461783041e-05, + "loss": 1.8574, "step": 7555 }, { - "epoch": 2.27, - "grad_norm": 41.616703033447266, - "learning_rate": 4.8551668838328154e-06, - "loss": 2.3166, + "epoch": 0.95, + "grad_norm": 12.6161470413208, + "learning_rate": 1.3678617746726353e-05, + "loss": 1.5904, "step": 7556 }, { - "epoch": 2.27, - "grad_norm": 14.239105224609375, - "learning_rate": 4.8531622732284255e-06, - "loss": 1.1065, + "epoch": 0.95, + "grad_norm": 11.606048583984375, + "learning_rate": 1.3677781031669665e-05, + "loss": 0.53, "step": 7557 }, { - "epoch": 2.27, - "grad_norm": 8.06943130493164, - "learning_rate": 4.851157662624036e-06, - "loss": 0.6586, + "epoch": 0.95, + "grad_norm": 14.20860767364502, + "learning_rate": 1.3676944316612979e-05, + "loss": 1.2397, "step": 7558 }, { - "epoch": 2.27, - "grad_norm": 16.735288619995117, - "learning_rate": 4.849153052019646e-06, - "loss": 0.4483, + "epoch": 0.95, + "grad_norm": 9.279431343078613, + "learning_rate": 1.3676107601556292e-05, + "loss": 1.2488, "step": 7559 }, { - "epoch": 2.27, - "grad_norm": 13.807044982910156, - "learning_rate": 4.847148441415255e-06, - "loss": 1.3318, - "step": 7560 - }, - { - "epoch": 2.27, - "eval_loss": 0.1706262081861496, - "eval_runtime": 43.7986, - "eval_samples_per_second": 33.768, - "eval_steps_per_second": 33.768, + "epoch": 0.95, + "grad_norm": 11.00720500946045, + "learning_rate": 1.3675270886499603e-05, + "loss": 0.8454, "step": 7560 }, { - "epoch": 2.27, - "grad_norm": 18.019256591796875, - "learning_rate": 4.845143830810866e-06, - "loss": 1.3919, + "epoch": 0.95, + "grad_norm": 10.767986297607422, + "learning_rate": 1.3674434171442916e-05, + "loss": 1.8124, "step": 7561 }, { - "epoch": 2.27, - "grad_norm": 16.097368240356445, - "learning_rate": 4.843139220206475e-06, - "loss": 1.5764, + "epoch": 0.95, + "grad_norm": 21.558813095092773, + "learning_rate": 1.367359745638623e-05, + "loss": 0.5925, "step": 7562 }, { - "epoch": 2.27, - "grad_norm": 13.278281211853027, - "learning_rate": 4.841134609602085e-06, - "loss": 1.0933, + "epoch": 0.95, + "grad_norm": 12.875472068786621, + "learning_rate": 1.367276074132954e-05, + "loss": 2.0704, "step": 7563 }, { - "epoch": 2.27, - "grad_norm": 10.068906784057617, - "learning_rate": 4.839129998997695e-06, - "loss": 0.8741, + "epoch": 0.95, + "grad_norm": 60.924991607666016, + "learning_rate": 1.3671924026272854e-05, + "loss": 1.8117, "step": 7564 }, { - "epoch": 2.27, - "grad_norm": 16.215852737426758, - "learning_rate": 4.837125388393305e-06, - "loss": 2.1497, + "epoch": 0.95, + "grad_norm": 11.496397972106934, + "learning_rate": 1.3671087311216168e-05, + "loss": 3.8066, "step": 7565 }, { - "epoch": 2.27, - "grad_norm": 11.731410026550293, - "learning_rate": 4.835120777788915e-06, - "loss": 1.7863, + "epoch": 0.95, + "grad_norm": 9.729010581970215, + "learning_rate": 1.367025059615948e-05, + "loss": 1.7768, "step": 7566 }, { - "epoch": 2.28, - "grad_norm": 10.969109535217285, - "learning_rate": 4.833116167184525e-06, - "loss": 0.8401, + "epoch": 0.95, + "grad_norm": 14.498326301574707, + "learning_rate": 1.3669413881102791e-05, + "loss": 1.6837, "step": 7567 }, { - "epoch": 2.28, - "grad_norm": 10.927252769470215, - "learning_rate": 4.831111556580135e-06, - "loss": 1.4045, + "epoch": 0.95, + "grad_norm": 11.079991340637207, + "learning_rate": 1.3668577166046103e-05, + "loss": 2.1257, "step": 7568 }, { - "epoch": 2.28, - "grad_norm": 41.4652214050293, - "learning_rate": 4.829106945975745e-06, - "loss": 3.292, + "epoch": 0.95, + "grad_norm": 16.234020233154297, + "learning_rate": 1.3667740450989417e-05, + "loss": 1.7121, "step": 7569 }, { - "epoch": 2.28, - "grad_norm": 8.78602123260498, - "learning_rate": 4.827102335371355e-06, - "loss": 1.6186, + "epoch": 0.95, + "grad_norm": 7.767157077789307, + "learning_rate": 1.3666903735932729e-05, + "loss": 0.791, "step": 7570 }, { - "epoch": 2.28, - "grad_norm": 30.659353256225586, - "learning_rate": 4.825097724766964e-06, - "loss": 1.3395, + "epoch": 0.95, + "grad_norm": 10.612770080566406, + "learning_rate": 1.3666067020876041e-05, + "loss": 1.6558, "step": 7571 }, { - "epoch": 2.28, - "grad_norm": 9.962449073791504, - "learning_rate": 4.823093114162574e-06, - "loss": 1.2939, + "epoch": 0.95, + "grad_norm": 17.550827026367188, + "learning_rate": 1.3665230305819355e-05, + "loss": 1.4342, "step": 7572 }, { - "epoch": 2.28, - "grad_norm": 10.435197830200195, - "learning_rate": 4.821088503558184e-06, - "loss": 1.0703, + "epoch": 0.95, + "grad_norm": 19.115427017211914, + "learning_rate": 1.3664393590762668e-05, + "loss": 2.5511, "step": 7573 }, { - "epoch": 2.28, - "grad_norm": 20.504138946533203, - "learning_rate": 4.819083892953794e-06, - "loss": 1.5566, + "epoch": 0.95, + "grad_norm": 26.63739585876465, + "learning_rate": 1.3663556875705979e-05, + "loss": 1.2658, "step": 7574 }, { - "epoch": 2.28, - "grad_norm": 15.632285118103027, - "learning_rate": 4.817079282349404e-06, - "loss": 1.2294, + "epoch": 0.95, + "grad_norm": 4.140101432800293, + "learning_rate": 1.3662720160649292e-05, + "loss": 1.0531, "step": 7575 }, { - "epoch": 2.28, - "grad_norm": 15.574999809265137, - "learning_rate": 4.815074671745015e-06, - "loss": 1.4429, + "epoch": 0.95, + "grad_norm": 9.40150260925293, + "learning_rate": 1.3661883445592606e-05, + "loss": 1.5241, "step": 7576 }, { - "epoch": 2.28, - "grad_norm": 10.924356460571289, - "learning_rate": 4.813070061140624e-06, - "loss": 1.0845, + "epoch": 0.95, + "grad_norm": 10.271926879882812, + "learning_rate": 1.3661046730535916e-05, + "loss": 1.0597, "step": 7577 }, { - "epoch": 2.28, - "grad_norm": 24.5418758392334, - "learning_rate": 4.811065450536234e-06, - "loss": 1.175, + "epoch": 0.95, + "grad_norm": 13.581698417663574, + "learning_rate": 1.366021001547923e-05, + "loss": 2.2879, "step": 7578 }, { - "epoch": 2.28, - "grad_norm": 29.236740112304688, - "learning_rate": 4.809060839931843e-06, - "loss": 0.9208, + "epoch": 0.95, + "grad_norm": 11.900755882263184, + "learning_rate": 1.3659373300422543e-05, + "loss": 1.3348, "step": 7579 }, { - "epoch": 2.28, - "grad_norm": 11.928509712219238, - "learning_rate": 4.807056229327453e-06, - "loss": 0.7309, + "epoch": 0.95, + "grad_norm": 20.219989776611328, + "learning_rate": 1.3658536585365855e-05, + "loss": 2.6691, "step": 7580 }, { - "epoch": 2.28, - "grad_norm": 31.78061294555664, - "learning_rate": 4.805051618723063e-06, - "loss": 1.4187, + "epoch": 0.95, + "grad_norm": 21.341461181640625, + "learning_rate": 1.3657699870309167e-05, + "loss": 2.4714, "step": 7581 }, { - "epoch": 2.28, - "grad_norm": 18.418073654174805, - "learning_rate": 4.803047008118673e-06, - "loss": 2.4135, + "epoch": 0.95, + "grad_norm": 10.618818283081055, + "learning_rate": 1.365686315525248e-05, + "loss": 0.8738, "step": 7582 }, { - "epoch": 2.28, - "grad_norm": 27.633852005004883, - "learning_rate": 4.8010423975142835e-06, - "loss": 1.7401, + "epoch": 0.95, + "grad_norm": 12.787776947021484, + "learning_rate": 1.3656026440195793e-05, + "loss": 1.32, "step": 7583 }, { - "epoch": 2.28, - "grad_norm": 48.656333923339844, - "learning_rate": 4.799037786909893e-06, - "loss": 1.6712, + "epoch": 0.95, + "grad_norm": 5.652602672576904, + "learning_rate": 1.3655189725139105e-05, + "loss": 0.6757, "step": 7584 }, { - "epoch": 2.28, - "grad_norm": 15.722077369689941, - "learning_rate": 4.797033176305503e-06, - "loss": 1.2471, + "epoch": 0.95, + "grad_norm": 12.455671310424805, + "learning_rate": 1.3654353010082417e-05, + "loss": 0.6094, "step": 7585 }, { - "epoch": 2.28, - "grad_norm": 8.848787307739258, - "learning_rate": 4.795028565701113e-06, - "loss": 0.993, + "epoch": 0.95, + "grad_norm": 9.646262168884277, + "learning_rate": 1.365351629502573e-05, + "loss": 1.026, "step": 7586 }, { - "epoch": 2.28, - "grad_norm": 15.616233825683594, - "learning_rate": 4.793023955096723e-06, - "loss": 1.3871, + "epoch": 0.95, + "grad_norm": 14.054344177246094, + "learning_rate": 1.3652679579969044e-05, + "loss": 2.245, "step": 7587 }, { - "epoch": 2.28, - "grad_norm": 24.031694412231445, - "learning_rate": 4.791019344492332e-06, - "loss": 1.2137, + "epoch": 0.95, + "grad_norm": 16.830785751342773, + "learning_rate": 1.3651842864912354e-05, + "loss": 2.6661, "step": 7588 }, { - "epoch": 2.28, - "grad_norm": 11.082610130310059, - "learning_rate": 4.789014733887943e-06, - "loss": 1.1946, + "epoch": 0.95, + "grad_norm": 21.882461547851562, + "learning_rate": 1.3651006149855668e-05, + "loss": 2.4278, "step": 7589 }, { - "epoch": 2.28, - "grad_norm": 18.813764572143555, - "learning_rate": 4.787010123283552e-06, - "loss": 1.1478, + "epoch": 0.95, + "grad_norm": 11.206304550170898, + "learning_rate": 1.3650169434798982e-05, + "loss": 1.0506, "step": 7590 }, { - "epoch": 2.28, - "grad_norm": 20.931020736694336, - "learning_rate": 4.7850055126791625e-06, - "loss": 1.3695, + "epoch": 0.95, + "grad_norm": 16.634540557861328, + "learning_rate": 1.3649332719742292e-05, + "loss": 2.2103, "step": 7591 }, { - "epoch": 2.28, - "grad_norm": 15.224300384521484, - "learning_rate": 4.7830009020747726e-06, - "loss": 1.0471, + "epoch": 0.95, + "grad_norm": 6.956151962280273, + "learning_rate": 1.3648496004685606e-05, + "loss": 1.965, "step": 7592 }, { - "epoch": 2.28, - "grad_norm": 13.662450790405273, - "learning_rate": 4.780996291470383e-06, - "loss": 1.0492, + "epoch": 0.95, + "grad_norm": 11.869979858398438, + "learning_rate": 1.364765928962892e-05, + "loss": 0.9426, "step": 7593 }, { - "epoch": 2.28, - "grad_norm": 11.05825424194336, - "learning_rate": 4.778991680865992e-06, - "loss": 1.4997, + "epoch": 0.95, + "grad_norm": 9.617378234863281, + "learning_rate": 1.3646822574572231e-05, + "loss": 0.8194, "step": 7594 }, { - "epoch": 2.28, - "grad_norm": 14.814659118652344, - "learning_rate": 4.776987070261602e-06, - "loss": 1.5903, + "epoch": 0.95, + "grad_norm": 11.88089370727539, + "learning_rate": 1.3645985859515543e-05, + "loss": 0.4161, "step": 7595 }, { - "epoch": 2.28, - "grad_norm": 20.72893524169922, - "learning_rate": 4.774982459657212e-06, - "loss": 0.9354, + "epoch": 0.95, + "grad_norm": 32.03239059448242, + "learning_rate": 1.3645149144458855e-05, + "loss": 1.8739, "step": 7596 }, { - "epoch": 2.28, - "grad_norm": 16.968223571777344, - "learning_rate": 4.772977849052821e-06, - "loss": 1.7869, + "epoch": 0.95, + "grad_norm": 10.117530822753906, + "learning_rate": 1.3644312429402169e-05, + "loss": 0.7434, "step": 7597 }, { - "epoch": 2.28, - "grad_norm": 12.797143936157227, - "learning_rate": 4.770973238448432e-06, - "loss": 1.4657, + "epoch": 0.95, + "grad_norm": 9.655041694641113, + "learning_rate": 1.364347571434548e-05, + "loss": 0.6933, "step": 7598 }, { - "epoch": 2.28, - "grad_norm": 278.7440185546875, - "learning_rate": 4.7689686278440414e-06, - "loss": 1.1522, + "epoch": 0.95, + "grad_norm": 15.120823860168457, + "learning_rate": 1.3642638999288793e-05, + "loss": 0.9372, "step": 7599 }, { - "epoch": 2.29, - "grad_norm": 11.309410095214844, - "learning_rate": 4.7669640172396515e-06, - "loss": 1.6979, + "epoch": 0.95, + "grad_norm": 29.873746871948242, + "learning_rate": 1.3641802284232106e-05, + "loss": 2.1381, "step": 7600 }, { - "epoch": 2.29, - "grad_norm": 17.630992889404297, - "learning_rate": 4.764959406635262e-06, - "loss": 1.2774, + "epoch": 0.95, + "eval_loss": 0.10611134022474289, + "eval_runtime": 154.8741, + "eval_samples_per_second": 22.87, + "eval_steps_per_second": 22.87, + "step": 7600 + }, + { + "epoch": 0.95, + "grad_norm": 17.3528995513916, + "learning_rate": 1.364096556917542e-05, + "loss": 1.4973, "step": 7601 }, { - "epoch": 2.29, - "grad_norm": 64.68533325195312, - "learning_rate": 4.762954796030872e-06, - "loss": 1.6083, + "epoch": 0.95, + "grad_norm": 24.112226486206055, + "learning_rate": 1.364012885411873e-05, + "loss": 1.4021, "step": 7602 }, { - "epoch": 2.29, - "grad_norm": 12.072052001953125, - "learning_rate": 4.760950185426481e-06, - "loss": 0.8717, + "epoch": 0.95, + "grad_norm": 21.563318252563477, + "learning_rate": 1.3639292139062044e-05, + "loss": 1.032, "step": 7603 }, { - "epoch": 2.29, - "grad_norm": 13.834127426147461, - "learning_rate": 4.758945574822091e-06, - "loss": 1.385, + "epoch": 0.95, + "grad_norm": 4.783206939697266, + "learning_rate": 1.3638455424005358e-05, + "loss": 0.2047, "step": 7604 }, { - "epoch": 2.29, - "grad_norm": 18.952781677246094, - "learning_rate": 4.756940964217701e-06, - "loss": 1.2853, + "epoch": 0.95, + "grad_norm": 5.902559757232666, + "learning_rate": 1.3637618708948668e-05, + "loss": 1.2226, "step": 7605 }, { - "epoch": 2.29, - "grad_norm": 18.030406951904297, - "learning_rate": 4.754936353613311e-06, - "loss": 1.6859, + "epoch": 0.95, + "grad_norm": 12.657173156738281, + "learning_rate": 1.3636781993891981e-05, + "loss": 1.155, "step": 7606 }, { - "epoch": 2.29, - "grad_norm": 35.28142166137695, - "learning_rate": 4.752931743008921e-06, - "loss": 2.1156, + "epoch": 0.95, + "grad_norm": 34.17289733886719, + "learning_rate": 1.3635945278835293e-05, + "loss": 2.0155, "step": 7607 }, { - "epoch": 2.29, - "grad_norm": 28.663076400756836, - "learning_rate": 4.7509271324045305e-06, - "loss": 2.1559, + "epoch": 0.95, + "grad_norm": 6.413711071014404, + "learning_rate": 1.3635108563778607e-05, + "loss": 0.7814, "step": 7608 }, { - "epoch": 2.29, - "grad_norm": 11.263956069946289, - "learning_rate": 4.748922521800141e-06, - "loss": 0.9984, + "epoch": 0.95, + "grad_norm": 9.342193603515625, + "learning_rate": 1.3634271848721919e-05, + "loss": 0.6112, "step": 7609 }, { - "epoch": 2.29, - "grad_norm": 9.154500007629395, - "learning_rate": 4.746917911195751e-06, - "loss": 1.3324, + "epoch": 0.96, + "grad_norm": 23.81905174255371, + "learning_rate": 1.3633435133665231e-05, + "loss": 1.6003, "step": 7610 }, { - "epoch": 2.29, - "grad_norm": 18.66733169555664, - "learning_rate": 4.744913300591361e-06, - "loss": 0.6866, + "epoch": 0.96, + "grad_norm": 116.54635620117188, + "learning_rate": 1.3632598418608545e-05, + "loss": 3.7441, "step": 7611 }, { - "epoch": 2.29, - "grad_norm": 22.89703369140625, - "learning_rate": 4.74290868998697e-06, - "loss": 2.2213, + "epoch": 0.96, + "grad_norm": 22.9312744140625, + "learning_rate": 1.3631761703551855e-05, + "loss": 1.9241, "step": 7612 }, { - "epoch": 2.29, - "grad_norm": 9.362771987915039, - "learning_rate": 4.740904079382581e-06, - "loss": 1.069, + "epoch": 0.96, + "grad_norm": 78.7513427734375, + "learning_rate": 1.3630924988495168e-05, + "loss": 1.4171, "step": 7613 }, { - "epoch": 2.29, - "grad_norm": 100.19771575927734, - "learning_rate": 4.73889946877819e-06, - "loss": 1.1066, + "epoch": 0.96, + "grad_norm": 5.988093852996826, + "learning_rate": 1.3630088273438482e-05, + "loss": 0.489, "step": 7614 }, { - "epoch": 2.29, - "grad_norm": 12.872753143310547, - "learning_rate": 4.7368948581738e-06, - "loss": 1.3306, + "epoch": 0.96, + "grad_norm": 26.50092887878418, + "learning_rate": 1.3629251558381796e-05, + "loss": 0.8656, "step": 7615 }, { - "epoch": 2.29, - "grad_norm": 13.48827075958252, - "learning_rate": 4.73489024756941e-06, - "loss": 0.8473, + "epoch": 0.96, + "grad_norm": 11.180814743041992, + "learning_rate": 1.3628414843325106e-05, + "loss": 1.6053, "step": 7616 }, { - "epoch": 2.29, - "grad_norm": 8.565572738647461, - "learning_rate": 4.7328856369650204e-06, - "loss": 0.7963, + "epoch": 0.96, + "grad_norm": 39.32979202270508, + "learning_rate": 1.362757812826842e-05, + "loss": 1.2344, "step": 7617 }, { - "epoch": 2.29, - "grad_norm": 16.844417572021484, - "learning_rate": 4.73088102636063e-06, - "loss": 0.9599, + "epoch": 0.96, + "grad_norm": 33.10567855834961, + "learning_rate": 1.3626741413211733e-05, + "loss": 2.0224, "step": 7618 }, { - "epoch": 2.29, - "grad_norm": 16.240650177001953, - "learning_rate": 4.72887641575624e-06, - "loss": 1.0792, + "epoch": 0.96, + "grad_norm": 9.995561599731445, + "learning_rate": 1.3625904698155044e-05, + "loss": 1.8112, "step": 7619 }, { - "epoch": 2.29, - "grad_norm": 13.393976211547852, - "learning_rate": 4.72687180515185e-06, - "loss": 1.1918, + "epoch": 0.96, + "grad_norm": 20.029680252075195, + "learning_rate": 1.3625067983098357e-05, + "loss": 1.5666, "step": 7620 }, { - "epoch": 2.29, - "grad_norm": 27.26254653930664, - "learning_rate": 4.724867194547459e-06, - "loss": 1.5554, + "epoch": 0.96, + "grad_norm": 12.913830757141113, + "learning_rate": 1.362423126804167e-05, + "loss": 3.1843, "step": 7621 }, { - "epoch": 2.29, - "grad_norm": 19.264162063598633, - "learning_rate": 4.722862583943069e-06, - "loss": 1.0877, + "epoch": 0.96, + "grad_norm": 15.697308540344238, + "learning_rate": 1.3623394552984983e-05, + "loss": 2.1712, "step": 7622 }, { - "epoch": 2.29, - "grad_norm": 14.326976776123047, - "learning_rate": 4.720857973338679e-06, - "loss": 1.4076, + "epoch": 0.96, + "grad_norm": 8.030444145202637, + "learning_rate": 1.3622557837928295e-05, + "loss": 0.3436, "step": 7623 }, { - "epoch": 2.29, - "grad_norm": 15.193740844726562, - "learning_rate": 4.718853362734289e-06, - "loss": 1.6355, + "epoch": 0.96, + "grad_norm": 5.636011600494385, + "learning_rate": 1.3621721122871607e-05, + "loss": 0.7249, "step": 7624 }, { - "epoch": 2.29, - "grad_norm": 9.33708381652832, - "learning_rate": 4.7168487521298986e-06, - "loss": 1.4984, + "epoch": 0.96, + "grad_norm": 11.708727836608887, + "learning_rate": 1.362088440781492e-05, + "loss": 0.9984, "step": 7625 }, { - "epoch": 2.29, - "grad_norm": 84.12486267089844, - "learning_rate": 4.7148441415255095e-06, - "loss": 1.9932, + "epoch": 0.96, + "grad_norm": 14.825302124023438, + "learning_rate": 1.362004769275823e-05, + "loss": 1.047, "step": 7626 }, { - "epoch": 2.29, - "grad_norm": 21.830821990966797, - "learning_rate": 4.712839530921119e-06, - "loss": 1.4148, + "epoch": 0.96, + "grad_norm": 16.708148956298828, + "learning_rate": 1.3619210977701544e-05, + "loss": 2.2662, "step": 7627 }, { - "epoch": 2.29, - "grad_norm": 11.36988353729248, - "learning_rate": 4.710834920316729e-06, - "loss": 1.3706, + "epoch": 0.96, + "grad_norm": 27.896203994750977, + "learning_rate": 1.3618374262644858e-05, + "loss": 1.142, "step": 7628 }, { - "epoch": 2.29, - "grad_norm": 4.701085567474365, - "learning_rate": 4.708830309712339e-06, - "loss": 0.4887, + "epoch": 0.96, + "grad_norm": 9.181163787841797, + "learning_rate": 1.3617537547588172e-05, + "loss": 1.565, "step": 7629 }, { - "epoch": 2.29, - "grad_norm": 40.99164962768555, - "learning_rate": 4.706825699107949e-06, - "loss": 3.0745, + "epoch": 0.96, + "grad_norm": 40.83127975463867, + "learning_rate": 1.3616700832531482e-05, + "loss": 1.8924, "step": 7630 }, { - "epoch": 2.29, - "grad_norm": 46.76643753051758, - "learning_rate": 4.704821088503558e-06, - "loss": 1.5348, + "epoch": 0.96, + "grad_norm": 10.27275276184082, + "learning_rate": 1.3615864117474796e-05, + "loss": 1.2299, "step": 7631 }, { - "epoch": 2.29, - "grad_norm": 24.31828498840332, - "learning_rate": 4.702816477899168e-06, - "loss": 1.1564, + "epoch": 0.96, + "grad_norm": 13.874832153320312, + "learning_rate": 1.361502740241811e-05, + "loss": 1.5729, "step": 7632 }, { - "epoch": 2.29, - "grad_norm": 32.74114227294922, - "learning_rate": 4.700811867294778e-06, - "loss": 0.9132, + "epoch": 0.96, + "grad_norm": 15.549036026000977, + "learning_rate": 1.361419068736142e-05, + "loss": 1.6644, "step": 7633 }, { - "epoch": 2.3, - "grad_norm": 36.050968170166016, - "learning_rate": 4.6988072566903885e-06, - "loss": 1.0512, + "epoch": 0.96, + "grad_norm": 10.457554817199707, + "learning_rate": 1.3613353972304733e-05, + "loss": 1.2516, "step": 7634 }, { - "epoch": 2.3, - "grad_norm": 11.422075271606445, - "learning_rate": 4.6968026460859986e-06, - "loss": 1.2059, + "epoch": 0.96, + "grad_norm": 9.416202545166016, + "learning_rate": 1.3612517257248045e-05, + "loss": 1.1955, "step": 7635 }, { - "epoch": 2.3, - "grad_norm": 35.184654235839844, - "learning_rate": 4.694798035481608e-06, - "loss": 1.4813, + "epoch": 0.96, + "grad_norm": 15.376522064208984, + "learning_rate": 1.3611680542191359e-05, + "loss": 1.8816, "step": 7636 }, { - "epoch": 2.3, - "grad_norm": 45.07157516479492, - "learning_rate": 4.692793424877218e-06, - "loss": 1.2941, + "epoch": 0.96, + "grad_norm": 7.528784275054932, + "learning_rate": 1.361084382713467e-05, + "loss": 0.8651, "step": 7637 }, { - "epoch": 2.3, - "grad_norm": 37.6901969909668, - "learning_rate": 4.690788814272828e-06, - "loss": 3.1126, + "epoch": 0.96, + "grad_norm": 20.97710418701172, + "learning_rate": 1.3610007112077983e-05, + "loss": 1.6182, "step": 7638 }, { - "epoch": 2.3, - "grad_norm": 21.26262664794922, - "learning_rate": 4.688784203668438e-06, - "loss": 1.3052, + "epoch": 0.96, + "grad_norm": 29.202354431152344, + "learning_rate": 1.3609170397021296e-05, + "loss": 2.1021, "step": 7639 }, { - "epoch": 2.3, - "grad_norm": 26.654037475585938, - "learning_rate": 4.686779593064047e-06, - "loss": 1.6718, + "epoch": 0.96, + "grad_norm": 6.581846237182617, + "learning_rate": 1.3608333681964607e-05, + "loss": 0.3949, "step": 7640 }, { - "epoch": 2.3, - "grad_norm": 19.267364501953125, - "learning_rate": 4.684774982459657e-06, - "loss": 1.1093, + "epoch": 0.96, + "grad_norm": 20.662389755249023, + "learning_rate": 1.360749696690792e-05, + "loss": 3.3764, "step": 7641 }, { - "epoch": 2.3, - "grad_norm": 11.072561264038086, - "learning_rate": 4.6827703718552674e-06, - "loss": 1.1142, + "epoch": 0.96, + "grad_norm": 17.10767364501953, + "learning_rate": 1.3606660251851234e-05, + "loss": 2.4709, "step": 7642 }, { - "epoch": 2.3, - "grad_norm": 17.212007522583008, - "learning_rate": 4.6807657612508775e-06, - "loss": 1.1688, + "epoch": 0.96, + "grad_norm": 9.228837013244629, + "learning_rate": 1.3605823536794547e-05, + "loss": 0.6915, "step": 7643 }, { - "epoch": 2.3, - "grad_norm": 14.088850021362305, - "learning_rate": 4.678761150646488e-06, - "loss": 1.0786, + "epoch": 0.96, + "grad_norm": 11.638644218444824, + "learning_rate": 1.3604986821737858e-05, + "loss": 1.1213, "step": 7644 }, { - "epoch": 2.3, - "grad_norm": 8.411274909973145, - "learning_rate": 4.676756540042097e-06, - "loss": 1.6672, + "epoch": 0.96, + "grad_norm": 18.108949661254883, + "learning_rate": 1.3604150106681171e-05, + "loss": 1.7587, "step": 7645 }, { - "epoch": 2.3, - "grad_norm": 10.209150314331055, - "learning_rate": 4.674751929437707e-06, - "loss": 0.6725, + "epoch": 0.96, + "grad_norm": 20.314796447753906, + "learning_rate": 1.3603313391624483e-05, + "loss": 2.1261, "step": 7646 }, { - "epoch": 2.3, - "grad_norm": 8.190345764160156, - "learning_rate": 4.672747318833317e-06, - "loss": 0.7018, + "epoch": 0.96, + "grad_norm": 3.3429651260375977, + "learning_rate": 1.3602476676567795e-05, + "loss": 0.0715, "step": 7647 }, { - "epoch": 2.3, - "grad_norm": 17.66701316833496, - "learning_rate": 4.670742708228927e-06, - "loss": 1.9426, + "epoch": 0.96, + "grad_norm": 24.380075454711914, + "learning_rate": 1.3601639961511109e-05, + "loss": 1.8533, "step": 7648 }, { - "epoch": 2.3, - "grad_norm": 29.10671043395996, - "learning_rate": 4.668738097624536e-06, - "loss": 1.8796, + "epoch": 0.96, + "grad_norm": 10.046948432922363, + "learning_rate": 1.3600803246454421e-05, + "loss": 0.2984, "step": 7649 }, { - "epoch": 2.3, - "grad_norm": 10.14902114868164, - "learning_rate": 4.666733487020147e-06, - "loss": 0.9493, + "epoch": 0.96, + "grad_norm": 12.797595977783203, + "learning_rate": 1.3599966531397735e-05, + "loss": 0.838, "step": 7650 }, { - "epoch": 2.3, - "grad_norm": 21.747249603271484, - "learning_rate": 4.6647288764157565e-06, - "loss": 0.8706, + "epoch": 0.96, + "grad_norm": 8.366950035095215, + "learning_rate": 1.3599129816341046e-05, + "loss": 0.4222, "step": 7651 }, { - "epoch": 2.3, - "grad_norm": 52.81174087524414, - "learning_rate": 4.662724265811367e-06, - "loss": 2.5637, + "epoch": 0.96, + "grad_norm": 17.715896606445312, + "learning_rate": 1.3598293101284358e-05, + "loss": 1.1354, "step": 7652 }, { - "epoch": 2.3, - "grad_norm": 21.7281494140625, - "learning_rate": 4.660719655206977e-06, - "loss": 1.5396, + "epoch": 0.96, + "grad_norm": 35.103858947753906, + "learning_rate": 1.3597456386227672e-05, + "loss": 3.3299, "step": 7653 }, { - "epoch": 2.3, - "grad_norm": 9.083329200744629, - "learning_rate": 4.658715044602587e-06, - "loss": 0.701, + "epoch": 0.96, + "grad_norm": 8.418503761291504, + "learning_rate": 1.3596619671170982e-05, + "loss": 0.4461, "step": 7654 }, { - "epoch": 2.3, - "grad_norm": 8.001774787902832, - "learning_rate": 4.656710433998196e-06, - "loss": 0.9427, + "epoch": 0.96, + "grad_norm": 8.805988311767578, + "learning_rate": 1.3595782956114296e-05, + "loss": 0.6346, "step": 7655 }, { - "epoch": 2.3, - "grad_norm": 8.815278053283691, - "learning_rate": 4.654705823393806e-06, - "loss": 1.6543, + "epoch": 0.96, + "grad_norm": 37.53118133544922, + "learning_rate": 1.359494624105761e-05, + "loss": 1.512, "step": 7656 }, { - "epoch": 2.3, - "grad_norm": 37.267616271972656, - "learning_rate": 4.652701212789416e-06, - "loss": 0.9176, + "epoch": 0.96, + "grad_norm": 27.297487258911133, + "learning_rate": 1.3594109526000923e-05, + "loss": 2.6313, "step": 7657 }, { - "epoch": 2.3, - "grad_norm": 32.767486572265625, - "learning_rate": 4.650696602185025e-06, - "loss": 1.6066, + "epoch": 0.96, + "grad_norm": 4.831480979919434, + "learning_rate": 1.3593272810944234e-05, + "loss": 0.2401, "step": 7658 }, { - "epoch": 2.3, - "grad_norm": 15.666519165039062, - "learning_rate": 4.648691991580636e-06, - "loss": 1.3438, + "epoch": 0.96, + "grad_norm": 9.539240837097168, + "learning_rate": 1.3592436095887547e-05, + "loss": 1.9658, "step": 7659 }, { - "epoch": 2.3, - "grad_norm": 31.18556785583496, - "learning_rate": 4.646687380976246e-06, - "loss": 2.5968, + "epoch": 0.96, + "grad_norm": 18.462636947631836, + "learning_rate": 1.359159938083086e-05, + "loss": 2.1001, "step": 7660 }, { - "epoch": 2.3, - "grad_norm": 21.61691665649414, - "learning_rate": 4.644682770371856e-06, - "loss": 1.7081, + "epoch": 0.96, + "grad_norm": 71.91975402832031, + "learning_rate": 1.3590762665774171e-05, + "loss": 3.0899, "step": 7661 }, { - "epoch": 2.3, - "grad_norm": 6.294021129608154, - "learning_rate": 4.642678159767465e-06, - "loss": 0.5644, + "epoch": 0.96, + "grad_norm": 18.084308624267578, + "learning_rate": 1.3589925950717485e-05, + "loss": 0.8901, "step": 7662 }, { - "epoch": 2.3, - "grad_norm": 17.326904296875, - "learning_rate": 4.640673549163076e-06, - "loss": 1.2007, + "epoch": 0.96, + "grad_norm": 19.814807891845703, + "learning_rate": 1.3589089235660797e-05, + "loss": 1.5361, "step": 7663 }, { - "epoch": 2.3, - "grad_norm": 7.6418986320495605, - "learning_rate": 4.638668938558685e-06, - "loss": 1.5581, + "epoch": 0.96, + "grad_norm": 13.233928680419922, + "learning_rate": 1.358825252060411e-05, + "loss": 2.3141, "step": 7664 }, { - "epoch": 2.3, - "grad_norm": 36.48931121826172, - "learning_rate": 4.636664327954295e-06, - "loss": 1.8396, + "epoch": 0.96, + "grad_norm": 11.917679786682129, + "learning_rate": 1.358741580554742e-05, + "loss": 2.5707, "step": 7665 }, { - "epoch": 2.3, - "grad_norm": 12.859515190124512, - "learning_rate": 4.634659717349905e-06, - "loss": 1.6651, + "epoch": 0.96, + "grad_norm": 14.535420417785645, + "learning_rate": 1.3586579090490734e-05, + "loss": 0.7456, "step": 7666 }, { - "epoch": 2.31, - "grad_norm": 19.518774032592773, - "learning_rate": 4.632655106745515e-06, - "loss": 1.0046, + "epoch": 0.96, + "grad_norm": 23.91950035095215, + "learning_rate": 1.3585742375434048e-05, + "loss": 2.8434, "step": 7667 }, { - "epoch": 2.31, - "grad_norm": 17.942481994628906, - "learning_rate": 4.6306504961411246e-06, - "loss": 1.7626, + "epoch": 0.96, + "grad_norm": 9.482237815856934, + "learning_rate": 1.3584905660377358e-05, + "loss": 0.607, "step": 7668 }, { - "epoch": 2.31, - "grad_norm": 17.79322624206543, - "learning_rate": 4.628645885536735e-06, - "loss": 0.8779, + "epoch": 0.96, + "grad_norm": 31.98341178894043, + "learning_rate": 1.3584068945320672e-05, + "loss": 2.7799, "step": 7669 }, { - "epoch": 2.31, - "grad_norm": 17.742807388305664, - "learning_rate": 4.626641274932345e-06, - "loss": 0.9714, + "epoch": 0.96, + "grad_norm": 15.53034782409668, + "learning_rate": 1.3583232230263985e-05, + "loss": 3.3731, "step": 7670 }, { - "epoch": 2.31, - "grad_norm": 14.361837387084961, - "learning_rate": 4.624636664327955e-06, - "loss": 1.1075, + "epoch": 0.96, + "grad_norm": 11.994458198547363, + "learning_rate": 1.3582395515207299e-05, + "loss": 3.1805, "step": 7671 }, { - "epoch": 2.31, - "grad_norm": 19.260719299316406, - "learning_rate": 4.622632053723565e-06, - "loss": 2.0738, + "epoch": 0.96, + "grad_norm": 13.355704307556152, + "learning_rate": 1.358155880015061e-05, + "loss": 1.1367, "step": 7672 }, { - "epoch": 2.31, - "grad_norm": 19.078481674194336, - "learning_rate": 4.620627443119174e-06, - "loss": 1.3898, + "epoch": 0.96, + "grad_norm": 95.1605453491211, + "learning_rate": 1.3580722085093923e-05, + "loss": 1.2786, "step": 7673 }, { - "epoch": 2.31, - "grad_norm": 11.892518043518066, - "learning_rate": 4.618622832514784e-06, - "loss": 1.4389, + "epoch": 0.96, + "grad_norm": 12.803677558898926, + "learning_rate": 1.3579885370037235e-05, + "loss": 1.5783, "step": 7674 }, { - "epoch": 2.31, - "grad_norm": 37.109336853027344, - "learning_rate": 4.616618221910394e-06, - "loss": 1.5463, + "epoch": 0.96, + "grad_norm": 10.651666641235352, + "learning_rate": 1.3579048654980547e-05, + "loss": 0.6602, "step": 7675 }, { - "epoch": 2.31, - "grad_norm": 35.61430358886719, - "learning_rate": 4.614613611306004e-06, - "loss": 1.2612, + "epoch": 0.96, + "grad_norm": 8.594017028808594, + "learning_rate": 1.357821193992386e-05, + "loss": 0.7785, "step": 7676 }, { - "epoch": 2.31, - "grad_norm": 10.030492782592773, - "learning_rate": 4.612609000701614e-06, - "loss": 1.0428, + "epoch": 0.96, + "grad_norm": 8.036545753479004, + "learning_rate": 1.3577375224867173e-05, + "loss": 0.5393, "step": 7677 }, { - "epoch": 2.31, - "grad_norm": 32.46242141723633, - "learning_rate": 4.6106043900972246e-06, - "loss": 1.2366, + "epoch": 0.96, + "grad_norm": 12.696398735046387, + "learning_rate": 1.3576538509810485e-05, + "loss": 1.9092, "step": 7678 }, { - "epoch": 2.31, - "grad_norm": 63.76512908935547, - "learning_rate": 4.608599779492834e-06, - "loss": 1.8638, + "epoch": 0.96, + "grad_norm": 45.462074279785156, + "learning_rate": 1.3575701794753796e-05, + "loss": 2.7158, "step": 7679 }, { - "epoch": 2.31, - "grad_norm": 21.762617111206055, - "learning_rate": 4.606595168888444e-06, - "loss": 1.7875, - "step": 7680 - }, - { - "epoch": 2.31, - "eval_loss": 0.16951656341552734, - "eval_runtime": 43.7895, - "eval_samples_per_second": 33.775, - "eval_steps_per_second": 33.775, + "epoch": 0.96, + "grad_norm": 16.446996688842773, + "learning_rate": 1.357486507969711e-05, + "loss": 1.7265, "step": 7680 }, { - "epoch": 2.31, - "grad_norm": 10.09822940826416, - "learning_rate": 4.604590558284054e-06, - "loss": 1.1142, + "epoch": 0.96, + "grad_norm": 10.649942398071289, + "learning_rate": 1.3574028364640424e-05, + "loss": 2.2305, "step": 7681 }, { - "epoch": 2.31, - "grad_norm": 10.086823463439941, - "learning_rate": 4.602585947679663e-06, - "loss": 0.8318, + "epoch": 0.96, + "grad_norm": 15.220854759216309, + "learning_rate": 1.3573191649583734e-05, + "loss": 1.5713, "step": 7682 }, { - "epoch": 2.31, - "grad_norm": 11.713811874389648, - "learning_rate": 4.600581337075273e-06, - "loss": 0.9897, + "epoch": 0.96, + "grad_norm": 11.791414260864258, + "learning_rate": 1.3572354934527048e-05, + "loss": 1.2042, "step": 7683 }, { - "epoch": 2.31, - "grad_norm": 20.750566482543945, - "learning_rate": 4.598576726470883e-06, - "loss": 1.5894, + "epoch": 0.96, + "grad_norm": 21.493785858154297, + "learning_rate": 1.3571518219470361e-05, + "loss": 2.3257, "step": 7684 }, { - "epoch": 2.31, - "grad_norm": 17.660737991333008, - "learning_rate": 4.5965721158664935e-06, - "loss": 1.4457, + "epoch": 0.96, + "grad_norm": 10.986868858337402, + "learning_rate": 1.3570681504413672e-05, + "loss": 0.9414, "step": 7685 }, { - "epoch": 2.31, - "grad_norm": 11.275466918945312, - "learning_rate": 4.594567505262103e-06, - "loss": 1.0097, + "epoch": 0.96, + "grad_norm": 11.292412757873535, + "learning_rate": 1.3569844789356985e-05, + "loss": 1.0636, "step": 7686 }, { - "epoch": 2.31, - "grad_norm": 9.963224411010742, - "learning_rate": 4.592562894657714e-06, - "loss": 1.7383, + "epoch": 0.96, + "grad_norm": 26.578144073486328, + "learning_rate": 1.3569008074300299e-05, + "loss": 1.8814, "step": 7687 }, { - "epoch": 2.31, - "grad_norm": 5.169731140136719, - "learning_rate": 4.590558284053323e-06, - "loss": 0.5528, + "epoch": 0.96, + "grad_norm": 8.212207794189453, + "learning_rate": 1.356817135924361e-05, + "loss": 1.4536, "step": 7688 }, { - "epoch": 2.31, - "grad_norm": 10.396340370178223, - "learning_rate": 4.588553673448933e-06, - "loss": 0.967, + "epoch": 0.96, + "grad_norm": 22.602903366088867, + "learning_rate": 1.3567334644186923e-05, + "loss": 1.8578, "step": 7689 }, { - "epoch": 2.31, - "grad_norm": 15.52597427368164, - "learning_rate": 4.586549062844543e-06, - "loss": 0.9266, + "epoch": 0.97, + "grad_norm": 8.634163856506348, + "learning_rate": 1.3566497929130236e-05, + "loss": 1.0865, "step": 7690 }, { - "epoch": 2.31, - "grad_norm": 12.798748016357422, - "learning_rate": 4.584544452240153e-06, - "loss": 1.288, + "epoch": 0.97, + "grad_norm": 13.972951889038086, + "learning_rate": 1.3565661214073548e-05, + "loss": 1.5003, "step": 7691 }, { - "epoch": 2.31, - "grad_norm": 21.744783401489258, - "learning_rate": 4.582539841635762e-06, - "loss": 1.3389, + "epoch": 0.97, + "grad_norm": 9.140802383422852, + "learning_rate": 1.356482449901686e-05, + "loss": 0.5627, "step": 7692 }, { - "epoch": 2.31, - "grad_norm": 7.416219711303711, - "learning_rate": 4.5805352310313724e-06, - "loss": 0.9147, + "epoch": 0.97, + "grad_norm": 11.660378456115723, + "learning_rate": 1.3563987783960172e-05, + "loss": 1.4375, "step": 7693 }, { - "epoch": 2.31, - "grad_norm": 27.035898208618164, - "learning_rate": 4.5785306204269825e-06, - "loss": 2.5926, + "epoch": 0.97, + "grad_norm": 17.761133193969727, + "learning_rate": 1.3563151068903486e-05, + "loss": 2.2885, "step": 7694 }, { - "epoch": 2.31, - "grad_norm": 18.482648849487305, - "learning_rate": 4.576526009822593e-06, - "loss": 2.276, + "epoch": 0.97, + "grad_norm": 10.578373908996582, + "learning_rate": 1.35623143538468e-05, + "loss": 2.5974, "step": 7695 }, { - "epoch": 2.31, - "grad_norm": 52.933494567871094, - "learning_rate": 4.574521399218203e-06, - "loss": 2.2703, + "epoch": 0.97, + "grad_norm": 12.733040809631348, + "learning_rate": 1.356147763879011e-05, + "loss": 2.3853, "step": 7696 }, { - "epoch": 2.31, - "grad_norm": 11.743091583251953, - "learning_rate": 4.572516788613812e-06, - "loss": 0.9347, + "epoch": 0.97, + "grad_norm": 10.997481346130371, + "learning_rate": 1.3560640923733424e-05, + "loss": 0.7155, "step": 7697 }, { - "epoch": 2.31, - "grad_norm": 16.696788787841797, - "learning_rate": 4.570512178009422e-06, - "loss": 1.3391, + "epoch": 0.97, + "grad_norm": 11.658534049987793, + "learning_rate": 1.3559804208676737e-05, + "loss": 0.583, "step": 7698 }, { - "epoch": 2.31, - "grad_norm": 23.81439971923828, - "learning_rate": 4.568507567405032e-06, - "loss": 2.2063, + "epoch": 0.97, + "grad_norm": 71.93724822998047, + "learning_rate": 1.3558967493620047e-05, + "loss": 2.1163, "step": 7699 }, { - "epoch": 2.32, - "grad_norm": 11.924534797668457, - "learning_rate": 4.566502956800642e-06, - "loss": 1.0567, + "epoch": 0.97, + "grad_norm": 15.736628532409668, + "learning_rate": 1.3558130778563361e-05, + "loss": 0.5738, "step": 7700 }, { - "epoch": 2.32, - "grad_norm": 15.285096168518066, - "learning_rate": 4.564498346196251e-06, - "loss": 1.6988, + "epoch": 0.97, + "grad_norm": 7.046619892120361, + "learning_rate": 1.3557294063506675e-05, + "loss": 1.2248, "step": 7701 }, { - "epoch": 2.32, - "grad_norm": 14.276802062988281, - "learning_rate": 4.562493735591862e-06, - "loss": 0.7809, + "epoch": 0.97, + "grad_norm": 15.224453926086426, + "learning_rate": 1.3556457348449987e-05, + "loss": 1.245, "step": 7702 }, { - "epoch": 2.32, - "grad_norm": 19.88568687438965, - "learning_rate": 4.560489124987472e-06, - "loss": 1.8969, + "epoch": 0.97, + "grad_norm": 6.766767978668213, + "learning_rate": 1.3555620633393299e-05, + "loss": 2.0219, "step": 7703 }, { - "epoch": 2.32, - "grad_norm": 22.174179077148438, - "learning_rate": 4.558484514383082e-06, - "loss": 1.4157, + "epoch": 0.97, + "grad_norm": 10.02777099609375, + "learning_rate": 1.3554783918336612e-05, + "loss": 1.544, "step": 7704 }, { - "epoch": 2.32, - "grad_norm": 26.93759536743164, - "learning_rate": 4.556479903778691e-06, - "loss": 0.7634, + "epoch": 0.97, + "grad_norm": 36.131717681884766, + "learning_rate": 1.3553947203279924e-05, + "loss": 2.2543, "step": 7705 }, { - "epoch": 2.32, - "grad_norm": 14.25698184967041, - "learning_rate": 4.554475293174301e-06, - "loss": 0.9706, + "epoch": 0.97, + "grad_norm": 17.951614379882812, + "learning_rate": 1.3553110488223236e-05, + "loss": 1.875, "step": 7706 }, { - "epoch": 2.32, - "grad_norm": 24.950761795043945, - "learning_rate": 4.552470682569911e-06, - "loss": 1.938, + "epoch": 0.97, + "grad_norm": 32.99787139892578, + "learning_rate": 1.3552273773166548e-05, + "loss": 1.0474, "step": 7707 }, { - "epoch": 2.32, - "grad_norm": 7.129110336303711, - "learning_rate": 4.550466071965521e-06, - "loss": 0.538, + "epoch": 0.97, + "grad_norm": 16.231891632080078, + "learning_rate": 1.3551437058109862e-05, + "loss": 0.8705, "step": 7708 }, { - "epoch": 2.32, - "grad_norm": 10.459171295166016, - "learning_rate": 4.548461461361131e-06, - "loss": 1.3554, + "epoch": 0.97, + "grad_norm": 11.002676963806152, + "learning_rate": 1.3550600343053175e-05, + "loss": 2.3549, "step": 7709 }, { - "epoch": 2.32, - "grad_norm": 26.577316284179688, - "learning_rate": 4.5464568507567405e-06, - "loss": 2.1107, + "epoch": 0.97, + "grad_norm": 15.858479499816895, + "learning_rate": 1.3549763627996486e-05, + "loss": 1.044, "step": 7710 }, { - "epoch": 2.32, - "grad_norm": 87.70563507080078, - "learning_rate": 4.5444522401523506e-06, - "loss": 1.3952, + "epoch": 0.97, + "grad_norm": 13.546462059020996, + "learning_rate": 1.35489269129398e-05, + "loss": 1.4793, "step": 7711 }, { - "epoch": 2.32, - "grad_norm": 10.34211254119873, - "learning_rate": 4.542447629547961e-06, - "loss": 0.7507, + "epoch": 0.97, + "grad_norm": 5.651560306549072, + "learning_rate": 1.3548090197883113e-05, + "loss": 0.34, "step": 7712 }, { - "epoch": 2.32, - "grad_norm": 33.580265045166016, - "learning_rate": 4.540443018943571e-06, - "loss": 1.8236, + "epoch": 0.97, + "grad_norm": 21.489553451538086, + "learning_rate": 1.3547253482826423e-05, + "loss": 2.1118, "step": 7713 }, { - "epoch": 2.32, - "grad_norm": 13.009629249572754, - "learning_rate": 4.53843840833918e-06, - "loss": 1.2249, + "epoch": 0.97, + "grad_norm": 41.191070556640625, + "learning_rate": 1.3546416767769737e-05, + "loss": 3.4881, "step": 7714 }, { - "epoch": 2.32, - "grad_norm": 8.66175365447998, - "learning_rate": 4.536433797734791e-06, - "loss": 0.9531, + "epoch": 0.97, + "grad_norm": 24.51534080505371, + "learning_rate": 1.354558005271305e-05, + "loss": 1.7158, "step": 7715 }, { - "epoch": 2.32, - "grad_norm": 17.59461784362793, - "learning_rate": 4.5344291871304e-06, - "loss": 1.6012, + "epoch": 0.97, + "grad_norm": 8.854673385620117, + "learning_rate": 1.3544743337656363e-05, + "loss": 0.5715, "step": 7716 }, { - "epoch": 2.32, - "grad_norm": 26.786701202392578, - "learning_rate": 4.53242457652601e-06, - "loss": 1.3363, + "epoch": 0.97, + "grad_norm": 34.15606689453125, + "learning_rate": 1.3543906622599674e-05, + "loss": 1.631, "step": 7717 }, { - "epoch": 2.32, - "grad_norm": 10.355632781982422, - "learning_rate": 4.53041996592162e-06, - "loss": 0.8374, + "epoch": 0.97, + "grad_norm": 8.685201644897461, + "learning_rate": 1.3543069907542986e-05, + "loss": 0.8118, "step": 7718 }, { - "epoch": 2.32, - "grad_norm": 13.772407531738281, - "learning_rate": 4.52841535531723e-06, - "loss": 1.9609, + "epoch": 0.97, + "grad_norm": 9.957879066467285, + "learning_rate": 1.35422331924863e-05, + "loss": 0.7961, "step": 7719 }, { - "epoch": 2.32, - "grad_norm": 10.235556602478027, - "learning_rate": 4.52641074471284e-06, - "loss": 1.4712, + "epoch": 0.97, + "grad_norm": 27.317323684692383, + "learning_rate": 1.3541396477429612e-05, + "loss": 1.1692, "step": 7720 }, { - "epoch": 2.32, - "grad_norm": 37.70382308959961, - "learning_rate": 4.52440613410845e-06, - "loss": 1.8212, + "epoch": 0.97, + "grad_norm": 6.7141618728637695, + "learning_rate": 1.3540559762372924e-05, + "loss": 1.0255, "step": 7721 }, { - "epoch": 2.32, - "grad_norm": 16.85622215270996, - "learning_rate": 4.52240152350406e-06, - "loss": 1.3294, + "epoch": 0.97, + "grad_norm": 14.603387832641602, + "learning_rate": 1.3539723047316238e-05, + "loss": 2.6129, "step": 7722 }, { - "epoch": 2.32, - "grad_norm": 13.376880645751953, - "learning_rate": 4.520396912899669e-06, - "loss": 0.8711, + "epoch": 0.97, + "grad_norm": 11.722940444946289, + "learning_rate": 1.3538886332259551e-05, + "loss": 0.8843, "step": 7723 }, { - "epoch": 2.32, - "grad_norm": 24.79469871520996, - "learning_rate": 4.51839230229528e-06, - "loss": 1.125, + "epoch": 0.97, + "grad_norm": 107.16452026367188, + "learning_rate": 1.3538049617202862e-05, + "loss": 2.1597, "step": 7724 }, { - "epoch": 2.32, - "grad_norm": 21.333232879638672, - "learning_rate": 4.516387691690889e-06, - "loss": 1.7883, + "epoch": 0.97, + "grad_norm": 36.03486251831055, + "learning_rate": 1.3537212902146175e-05, + "loss": 2.2156, "step": 7725 }, { - "epoch": 2.32, - "grad_norm": 11.4044771194458, - "learning_rate": 4.514383081086499e-06, - "loss": 1.1805, + "epoch": 0.97, + "grad_norm": 21.66221046447754, + "learning_rate": 1.3536376187089489e-05, + "loss": 1.7477, "step": 7726 }, { - "epoch": 2.32, - "grad_norm": 51.108253479003906, - "learning_rate": 4.512378470482109e-06, - "loss": 1.9263, + "epoch": 0.97, + "grad_norm": 21.030221939086914, + "learning_rate": 1.3535539472032799e-05, + "loss": 1.8737, "step": 7727 }, { - "epoch": 2.32, - "grad_norm": 21.30112075805664, - "learning_rate": 4.5103738598777195e-06, - "loss": 1.3295, + "epoch": 0.97, + "grad_norm": 8.940326690673828, + "learning_rate": 1.3534702756976113e-05, + "loss": 1.1877, "step": 7728 }, { - "epoch": 2.32, - "grad_norm": 26.95627784729004, - "learning_rate": 4.508369249273329e-06, - "loss": 2.7558, + "epoch": 0.97, + "grad_norm": 9.069717407226562, + "learning_rate": 1.3533866041919426e-05, + "loss": 0.7334, "step": 7729 }, { - "epoch": 2.32, - "grad_norm": 13.784543991088867, - "learning_rate": 4.506364638668939e-06, - "loss": 2.2716, + "epoch": 0.97, + "grad_norm": 15.025553703308105, + "learning_rate": 1.3533029326862738e-05, + "loss": 1.7665, "step": 7730 }, { - "epoch": 2.32, - "grad_norm": 55.046142578125, - "learning_rate": 4.504360028064549e-06, - "loss": 1.819, + "epoch": 0.97, + "grad_norm": 23.173837661743164, + "learning_rate": 1.353219261180605e-05, + "loss": 0.7893, "step": 7731 }, { - "epoch": 2.32, - "grad_norm": 10.930252075195312, - "learning_rate": 4.502355417460159e-06, - "loss": 0.8165, + "epoch": 0.97, + "grad_norm": 17.162948608398438, + "learning_rate": 1.3531355896749362e-05, + "loss": 1.1794, "step": 7732 }, { - "epoch": 2.33, - "grad_norm": 23.520112991333008, - "learning_rate": 4.500350806855769e-06, - "loss": 0.9347, + "epoch": 0.97, + "grad_norm": 22.472871780395508, + "learning_rate": 1.3530519181692676e-05, + "loss": 2.3618, "step": 7733 }, { - "epoch": 2.33, - "grad_norm": 8.282185554504395, - "learning_rate": 4.498346196251378e-06, - "loss": 1.1021, + "epoch": 0.97, + "grad_norm": 31.161813735961914, + "learning_rate": 1.3529682466635988e-05, + "loss": 0.5731, "step": 7734 }, { - "epoch": 2.33, - "grad_norm": 15.41140365600586, - "learning_rate": 4.496341585646988e-06, - "loss": 1.5547, + "epoch": 0.97, + "grad_norm": 21.318912506103516, + "learning_rate": 1.35288457515793e-05, + "loss": 0.3049, "step": 7735 }, { - "epoch": 2.33, - "grad_norm": 26.10579490661621, - "learning_rate": 4.4943369750425984e-06, - "loss": 1.77, + "epoch": 0.97, + "grad_norm": 11.425654411315918, + "learning_rate": 1.3528009036522613e-05, + "loss": 1.5813, "step": 7736 }, { - "epoch": 2.33, - "grad_norm": 11.937204360961914, - "learning_rate": 4.4923323644382085e-06, - "loss": 1.0362, + "epoch": 0.97, + "grad_norm": 18.119468688964844, + "learning_rate": 1.3527172321465927e-05, + "loss": 1.351, "step": 7737 }, { - "epoch": 2.33, - "grad_norm": 21.467954635620117, - "learning_rate": 4.490327753833818e-06, - "loss": 1.802, + "epoch": 0.97, + "grad_norm": 14.75485897064209, + "learning_rate": 1.3526335606409237e-05, + "loss": 1.5202, "step": 7738 }, { - "epoch": 2.33, - "grad_norm": 17.1977596282959, - "learning_rate": 4.488323143229429e-06, - "loss": 1.2613, + "epoch": 0.97, + "grad_norm": 22.677011489868164, + "learning_rate": 1.3525498891352551e-05, + "loss": 0.48, "step": 7739 }, { - "epoch": 2.33, - "grad_norm": 11.602217674255371, - "learning_rate": 4.486318532625038e-06, - "loss": 1.5562, + "epoch": 0.97, + "grad_norm": 17.857650756835938, + "learning_rate": 1.3524662176295865e-05, + "loss": 1.4076, "step": 7740 }, { - "epoch": 2.33, - "grad_norm": 50.02082061767578, - "learning_rate": 4.484313922020648e-06, - "loss": 1.2008, + "epoch": 0.97, + "grad_norm": 16.588102340698242, + "learning_rate": 1.3523825461239175e-05, + "loss": 1.1867, "step": 7741 }, { - "epoch": 2.33, - "grad_norm": 16.229761123657227, - "learning_rate": 4.482309311416258e-06, - "loss": 1.5572, + "epoch": 0.97, + "grad_norm": 21.662322998046875, + "learning_rate": 1.3522988746182489e-05, + "loss": 1.6806, "step": 7742 }, { - "epoch": 2.33, - "grad_norm": 16.347227096557617, - "learning_rate": 4.480304700811867e-06, - "loss": 1.4817, + "epoch": 0.97, + "grad_norm": 14.984989166259766, + "learning_rate": 1.3522152031125802e-05, + "loss": 0.6457, "step": 7743 }, { - "epoch": 2.33, - "grad_norm": 9.144948959350586, - "learning_rate": 4.478300090207477e-06, - "loss": 0.6118, + "epoch": 0.97, + "grad_norm": 15.336214065551758, + "learning_rate": 1.3521315316069114e-05, + "loss": 1.5232, "step": 7744 }, { - "epoch": 2.33, - "grad_norm": 45.46603775024414, - "learning_rate": 4.4762954796030875e-06, - "loss": 1.9398, + "epoch": 0.97, + "grad_norm": 7.916646480560303, + "learning_rate": 1.3520478601012426e-05, + "loss": 1.0675, "step": 7745 }, { - "epoch": 2.33, - "grad_norm": 10.924186706542969, - "learning_rate": 4.474290868998698e-06, - "loss": 1.4317, + "epoch": 0.97, + "grad_norm": 28.336755752563477, + "learning_rate": 1.3519641885955738e-05, + "loss": 1.9443, "step": 7746 }, { - "epoch": 2.33, - "grad_norm": 39.43347930908203, - "learning_rate": 4.472286258394307e-06, - "loss": 2.4369, + "epoch": 0.97, + "grad_norm": 8.822896003723145, + "learning_rate": 1.3518805170899052e-05, + "loss": 2.6556, "step": 7747 }, { - "epoch": 2.33, - "grad_norm": 124.04064178466797, - "learning_rate": 4.470281647789917e-06, - "loss": 3.3272, + "epoch": 0.97, + "grad_norm": 13.60275936126709, + "learning_rate": 1.3517968455842364e-05, + "loss": 1.3155, "step": 7748 }, { - "epoch": 2.33, - "grad_norm": 6.773767471313477, - "learning_rate": 4.468277037185527e-06, - "loss": 0.5183, + "epoch": 0.97, + "grad_norm": 11.431167602539062, + "learning_rate": 1.3517131740785676e-05, + "loss": 1.6894, "step": 7749 }, { - "epoch": 2.33, - "grad_norm": 21.174972534179688, - "learning_rate": 4.466272426581137e-06, - "loss": 2.1152, + "epoch": 0.97, + "grad_norm": 8.643516540527344, + "learning_rate": 1.351629502572899e-05, + "loss": 0.2998, "step": 7750 }, { - "epoch": 2.33, - "grad_norm": 15.640390396118164, - "learning_rate": 4.464267815976746e-06, - "loss": 1.0071, + "epoch": 0.97, + "grad_norm": 2.41274356842041, + "learning_rate": 1.3515458310672303e-05, + "loss": 0.0267, "step": 7751 }, { - "epoch": 2.33, - "grad_norm": 11.41567611694336, - "learning_rate": 4.462263205372357e-06, - "loss": 0.9003, + "epoch": 0.97, + "grad_norm": 10.9503173828125, + "learning_rate": 1.3514621595615613e-05, + "loss": 1.6653, "step": 7752 }, { - "epoch": 2.33, - "grad_norm": 13.2136869430542, - "learning_rate": 4.4602585947679665e-06, - "loss": 1.4521, + "epoch": 0.97, + "grad_norm": 8.803579330444336, + "learning_rate": 1.3513784880558927e-05, + "loss": 2.2309, "step": 7753 }, { - "epoch": 2.33, - "grad_norm": 24.674755096435547, - "learning_rate": 4.4582539841635766e-06, - "loss": 1.1701, + "epoch": 0.97, + "grad_norm": 16.392528533935547, + "learning_rate": 1.351294816550224e-05, + "loss": 1.046, "step": 7754 }, { - "epoch": 2.33, - "grad_norm": 27.077539443969727, - "learning_rate": 4.456249373559187e-06, - "loss": 1.3348, + "epoch": 0.97, + "grad_norm": 22.46368980407715, + "learning_rate": 1.351211145044555e-05, + "loss": 2.3814, "step": 7755 }, { - "epoch": 2.33, - "grad_norm": 15.865095138549805, - "learning_rate": 4.454244762954797e-06, - "loss": 0.9423, + "epoch": 0.97, + "grad_norm": 18.388086318969727, + "learning_rate": 1.3511274735388864e-05, + "loss": 1.015, "step": 7756 }, { - "epoch": 2.33, - "grad_norm": 10.586139678955078, - "learning_rate": 4.452240152350406e-06, - "loss": 0.9348, + "epoch": 0.97, + "grad_norm": 21.396652221679688, + "learning_rate": 1.3510438020332176e-05, + "loss": 1.7366, "step": 7757 }, { - "epoch": 2.33, - "grad_norm": 14.161200523376465, - "learning_rate": 4.450235541746016e-06, - "loss": 1.5252, + "epoch": 0.97, + "grad_norm": 9.764254570007324, + "learning_rate": 1.350960130527549e-05, + "loss": 0.4437, "step": 7758 }, { - "epoch": 2.33, - "grad_norm": 23.082622528076172, - "learning_rate": 4.448230931141626e-06, - "loss": 1.3711, + "epoch": 0.97, + "grad_norm": 4.522057056427002, + "learning_rate": 1.3508764590218802e-05, + "loss": 0.2982, "step": 7759 }, { - "epoch": 2.33, - "grad_norm": 19.711462020874023, - "learning_rate": 4.446226320537235e-06, - "loss": 2.655, + "epoch": 0.97, + "grad_norm": 4.771112442016602, + "learning_rate": 1.3507927875162114e-05, + "loss": 0.1846, "step": 7760 }, { - "epoch": 2.33, - "grad_norm": 24.49384880065918, - "learning_rate": 4.444221709932846e-06, - "loss": 1.0757, + "epoch": 0.97, + "grad_norm": 14.318256378173828, + "learning_rate": 1.3507091160105428e-05, + "loss": 1.8543, "step": 7761 }, { - "epoch": 2.33, - "grad_norm": 9.763320922851562, - "learning_rate": 4.4422170993284555e-06, - "loss": 0.9065, + "epoch": 0.97, + "grad_norm": 15.801634788513184, + "learning_rate": 1.350625444504874e-05, + "loss": 1.1831, "step": 7762 }, { - "epoch": 2.33, - "grad_norm": 8.923637390136719, - "learning_rate": 4.440212488724066e-06, - "loss": 1.1599, + "epoch": 0.97, + "grad_norm": 22.527746200561523, + "learning_rate": 1.3505417729992051e-05, + "loss": 2.1718, "step": 7763 }, { - "epoch": 2.33, - "grad_norm": 193.84384155273438, - "learning_rate": 4.438207878119676e-06, - "loss": 1.5881, + "epoch": 0.97, + "grad_norm": 11.585763931274414, + "learning_rate": 1.3504581014935365e-05, + "loss": 1.3421, "step": 7764 }, { - "epoch": 2.33, - "grad_norm": 4.945940971374512, - "learning_rate": 4.436203267515286e-06, - "loss": 0.698, + "epoch": 0.97, + "grad_norm": 8.993477821350098, + "learning_rate": 1.3503744299878679e-05, + "loss": 1.382, "step": 7765 }, { - "epoch": 2.33, - "grad_norm": 8.121231079101562, - "learning_rate": 4.434198656910895e-06, - "loss": 0.9452, + "epoch": 0.97, + "grad_norm": 10.9657564163208, + "learning_rate": 1.3502907584821989e-05, + "loss": 1.5614, "step": 7766 }, { - "epoch": 2.34, - "grad_norm": 28.268239974975586, - "learning_rate": 4.432194046306505e-06, - "loss": 1.5929, + "epoch": 0.97, + "grad_norm": 22.3695125579834, + "learning_rate": 1.3502070869765303e-05, + "loss": 3.1019, "step": 7767 }, { - "epoch": 2.34, - "grad_norm": 70.76728057861328, - "learning_rate": 4.430189435702115e-06, - "loss": 1.6534, + "epoch": 0.97, + "grad_norm": 16.434736251831055, + "learning_rate": 1.3501234154708616e-05, + "loss": 2.7474, "step": 7768 }, { - "epoch": 2.34, - "grad_norm": 23.936372756958008, - "learning_rate": 4.428184825097725e-06, - "loss": 1.2137, + "epoch": 0.97, + "grad_norm": 9.681111335754395, + "learning_rate": 1.3500397439651927e-05, + "loss": 1.5325, "step": 7769 }, { - "epoch": 2.34, - "grad_norm": 9.887136459350586, - "learning_rate": 4.426180214493335e-06, - "loss": 0.8383, + "epoch": 0.98, + "grad_norm": 3.7469215393066406, + "learning_rate": 1.349956072459524e-05, + "loss": 0.266, "step": 7770 }, { - "epoch": 2.34, - "grad_norm": 14.792794227600098, - "learning_rate": 4.424175603888945e-06, - "loss": 0.9405, + "epoch": 0.98, + "grad_norm": 14.711934089660645, + "learning_rate": 1.3498724009538552e-05, + "loss": 1.5471, "step": 7771 }, { - "epoch": 2.34, - "grad_norm": 17.71567726135254, - "learning_rate": 4.422170993284555e-06, - "loss": 1.063, + "epoch": 0.98, + "grad_norm": 43.18144226074219, + "learning_rate": 1.3497887294481866e-05, + "loss": 1.7386, "step": 7772 }, { - "epoch": 2.34, - "grad_norm": 17.9732608795166, - "learning_rate": 4.420166382680165e-06, - "loss": 1.4261, + "epoch": 0.98, + "grad_norm": 13.835360527038574, + "learning_rate": 1.3497050579425178e-05, + "loss": 1.1166, "step": 7773 }, { - "epoch": 2.34, - "grad_norm": 25.94655418395996, - "learning_rate": 4.418161772075775e-06, - "loss": 2.0324, + "epoch": 0.98, + "grad_norm": 14.814412117004395, + "learning_rate": 1.349621386436849e-05, + "loss": 1.0632, "step": 7774 }, { - "epoch": 2.34, - "grad_norm": 39.89909744262695, - "learning_rate": 4.416157161471384e-06, - "loss": 1.7023, + "epoch": 0.98, + "grad_norm": 5.545921802520752, + "learning_rate": 1.3495377149311803e-05, + "loss": 0.9194, "step": 7775 }, { - "epoch": 2.34, - "grad_norm": 23.909780502319336, - "learning_rate": 4.414152550866995e-06, - "loss": 0.9315, + "epoch": 0.98, + "grad_norm": 10.252397537231445, + "learning_rate": 1.3494540434255114e-05, + "loss": 1.2437, "step": 7776 }, { - "epoch": 2.34, - "grad_norm": 13.213065147399902, - "learning_rate": 4.412147940262604e-06, - "loss": 0.9687, + "epoch": 0.98, + "grad_norm": 8.675298690795898, + "learning_rate": 1.3493703719198427e-05, + "loss": 0.8212, "step": 7777 }, { - "epoch": 2.34, - "grad_norm": 45.419342041015625, - "learning_rate": 4.410143329658214e-06, - "loss": 1.2235, + "epoch": 0.98, + "grad_norm": 15.157878875732422, + "learning_rate": 1.3492867004141741e-05, + "loss": 2.6183, "step": 7778 }, { - "epoch": 2.34, - "grad_norm": 18.985164642333984, - "learning_rate": 4.4081387190538244e-06, - "loss": 1.544, + "epoch": 0.98, + "grad_norm": 11.1221284866333, + "learning_rate": 1.3492030289085055e-05, + "loss": 1.5943, "step": 7779 }, { - "epoch": 2.34, - "grad_norm": 9.602522850036621, - "learning_rate": 4.4061341084494345e-06, - "loss": 0.7917, + "epoch": 0.98, + "grad_norm": 46.772125244140625, + "learning_rate": 1.3491193574028365e-05, + "loss": 2.1687, "step": 7780 }, { - "epoch": 2.34, - "grad_norm": 38.75558090209961, - "learning_rate": 4.404129497845044e-06, - "loss": 1.5868, + "epoch": 0.98, + "grad_norm": 23.514244079589844, + "learning_rate": 1.3490356858971679e-05, + "loss": 1.2343, "step": 7781 }, { - "epoch": 2.34, - "grad_norm": 51.216766357421875, - "learning_rate": 4.402124887240654e-06, - "loss": 1.2511, + "epoch": 0.98, + "grad_norm": 11.823355674743652, + "learning_rate": 1.3489520143914992e-05, + "loss": 0.7507, "step": 7782 }, { - "epoch": 2.34, - "grad_norm": 9.934971809387207, - "learning_rate": 4.400120276636264e-06, - "loss": 1.18, + "epoch": 0.98, + "grad_norm": 19.944049835205078, + "learning_rate": 1.3488683428858302e-05, + "loss": 1.3162, "step": 7783 }, { - "epoch": 2.34, - "grad_norm": 10.260887145996094, - "learning_rate": 4.398115666031873e-06, - "loss": 0.5996, + "epoch": 0.98, + "grad_norm": 15.35301685333252, + "learning_rate": 1.3487846713801616e-05, + "loss": 2.3952, "step": 7784 }, { - "epoch": 2.34, - "grad_norm": 26.906917572021484, - "learning_rate": 4.396111055427484e-06, - "loss": 1.635, + "epoch": 0.98, + "grad_norm": 12.974126815795898, + "learning_rate": 1.3487009998744928e-05, + "loss": 0.6608, "step": 7785 }, { - "epoch": 2.34, - "grad_norm": 11.61650562286377, - "learning_rate": 4.394106444823093e-06, - "loss": 0.8312, + "epoch": 0.98, + "grad_norm": 7.92689847946167, + "learning_rate": 1.3486173283688242e-05, + "loss": 0.4993, "step": 7786 }, { - "epoch": 2.34, - "grad_norm": 40.0263786315918, - "learning_rate": 4.392101834218703e-06, - "loss": 0.9066, + "epoch": 0.98, + "grad_norm": 12.310649871826172, + "learning_rate": 1.3485336568631554e-05, + "loss": 0.8984, "step": 7787 }, { - "epoch": 2.34, - "grad_norm": 31.856773376464844, - "learning_rate": 4.390097223614313e-06, - "loss": 1.9564, + "epoch": 0.98, + "grad_norm": 13.593944549560547, + "learning_rate": 1.3484499853574866e-05, + "loss": 0.9544, "step": 7788 }, { - "epoch": 2.34, - "grad_norm": 33.76646423339844, - "learning_rate": 4.388092613009924e-06, - "loss": 1.9249, + "epoch": 0.98, + "grad_norm": 10.441452026367188, + "learning_rate": 1.348366313851818e-05, + "loss": 2.1254, "step": 7789 }, { - "epoch": 2.34, - "grad_norm": 24.660625457763672, - "learning_rate": 4.386088002405533e-06, - "loss": 2.0123, + "epoch": 0.98, + "grad_norm": 19.644514083862305, + "learning_rate": 1.348282642346149e-05, + "loss": 1.4696, "step": 7790 }, { - "epoch": 2.34, - "grad_norm": 16.023160934448242, - "learning_rate": 4.384083391801143e-06, - "loss": 2.5144, + "epoch": 0.98, + "grad_norm": 6.888669013977051, + "learning_rate": 1.3481989708404803e-05, + "loss": 0.8234, "step": 7791 }, { - "epoch": 2.34, - "grad_norm": 18.865354537963867, - "learning_rate": 4.382078781196753e-06, - "loss": 1.4033, + "epoch": 0.98, + "grad_norm": 6.575916767120361, + "learning_rate": 1.3481152993348117e-05, + "loss": 0.9467, "step": 7792 }, { - "epoch": 2.34, - "grad_norm": 17.00137710571289, - "learning_rate": 4.380074170592363e-06, - "loss": 1.7571, + "epoch": 0.98, + "grad_norm": 15.154905319213867, + "learning_rate": 1.348031627829143e-05, + "loss": 1.0205, "step": 7793 }, { - "epoch": 2.34, - "grad_norm": 22.194595336914062, - "learning_rate": 4.378069559987972e-06, - "loss": 1.5826, + "epoch": 0.98, + "grad_norm": 52.29448699951172, + "learning_rate": 1.347947956323474e-05, + "loss": 1.3675, "step": 7794 }, { - "epoch": 2.34, - "grad_norm": 24.60336685180664, - "learning_rate": 4.376064949383582e-06, - "loss": 1.7541, + "epoch": 0.98, + "grad_norm": 25.070926666259766, + "learning_rate": 1.3478642848178054e-05, + "loss": 2.9792, "step": 7795 }, { - "epoch": 2.34, - "grad_norm": 14.990224838256836, - "learning_rate": 4.3740603387791925e-06, - "loss": 1.1141, + "epoch": 0.98, + "grad_norm": 12.322017669677734, + "learning_rate": 1.3477806133121368e-05, + "loss": 1.9872, "step": 7796 }, { - "epoch": 2.34, - "grad_norm": 17.370750427246094, - "learning_rate": 4.3720557281748026e-06, - "loss": 0.947, + "epoch": 0.98, + "grad_norm": 27.83733558654785, + "learning_rate": 1.3476969418064678e-05, + "loss": 0.6899, "step": 7797 }, { - "epoch": 2.34, - "grad_norm": 32.88360595703125, - "learning_rate": 4.370051117570413e-06, - "loss": 1.035, + "epoch": 0.98, + "grad_norm": 15.455463409423828, + "learning_rate": 1.3476132703007992e-05, + "loss": 1.3138, "step": 7798 }, { - "epoch": 2.34, - "grad_norm": 13.272078514099121, - "learning_rate": 4.368046506966022e-06, - "loss": 0.8005, + "epoch": 0.98, + "grad_norm": 29.341999053955078, + "learning_rate": 1.3475295987951304e-05, + "loss": 1.3879, "step": 7799 }, { - "epoch": 2.35, - "grad_norm": 28.69300651550293, - "learning_rate": 4.366041896361632e-06, - "loss": 1.6131, - "step": 7800 - }, - { - "epoch": 2.35, - "eval_loss": 0.17263449728488922, - "eval_runtime": 43.5025, - "eval_samples_per_second": 33.998, - "eval_steps_per_second": 33.998, + "epoch": 0.98, + "grad_norm": 21.08104705810547, + "learning_rate": 1.3474459272894618e-05, + "loss": 1.296, "step": 7800 }, { - "epoch": 2.35, - "grad_norm": 21.5859317779541, - "learning_rate": 4.364037285757242e-06, - "loss": 1.961, + "epoch": 0.98, + "grad_norm": 1.2503612041473389, + "learning_rate": 1.347362255783793e-05, + "loss": 0.0344, "step": 7801 }, { - "epoch": 2.35, - "grad_norm": 45.89167785644531, - "learning_rate": 4.362032675152852e-06, - "loss": 0.887, + "epoch": 0.98, + "grad_norm": 14.25670337677002, + "learning_rate": 1.3472785842781241e-05, + "loss": 2.0681, "step": 7802 }, { - "epoch": 2.35, - "grad_norm": 9.14841079711914, - "learning_rate": 4.360028064548461e-06, - "loss": 0.82, + "epoch": 0.98, + "grad_norm": 8.220523834228516, + "learning_rate": 1.3471949127724555e-05, + "loss": 1.7112, "step": 7803 }, { - "epoch": 2.35, - "grad_norm": 13.917359352111816, - "learning_rate": 4.358023453944072e-06, - "loss": 0.6227, + "epoch": 0.98, + "grad_norm": 5.848735332489014, + "learning_rate": 1.3471112412667865e-05, + "loss": 0.461, "step": 7804 }, { - "epoch": 2.35, - "grad_norm": 10.889512062072754, - "learning_rate": 4.3560188433396815e-06, - "loss": 0.885, + "epoch": 0.98, + "grad_norm": 51.04117202758789, + "learning_rate": 1.3470275697611179e-05, + "loss": 1.8216, "step": 7805 }, { - "epoch": 2.35, - "grad_norm": 11.779256820678711, - "learning_rate": 4.354014232735292e-06, - "loss": 1.6273, + "epoch": 0.98, + "grad_norm": 23.475095748901367, + "learning_rate": 1.3469438982554493e-05, + "loss": 2.4093, "step": 7806 }, { - "epoch": 2.35, - "grad_norm": 9.816779136657715, - "learning_rate": 4.352009622130902e-06, - "loss": 1.2789, + "epoch": 0.98, + "grad_norm": 9.354192733764648, + "learning_rate": 1.3468602267497806e-05, + "loss": 0.8441, "step": 7807 }, { - "epoch": 2.35, - "grad_norm": 10.623241424560547, - "learning_rate": 4.350005011526511e-06, - "loss": 0.888, + "epoch": 0.98, + "grad_norm": 7.1941728591918945, + "learning_rate": 1.3467765552441117e-05, + "loss": 0.6073, "step": 7808 }, { - "epoch": 2.35, - "grad_norm": 15.834614753723145, - "learning_rate": 4.348000400922121e-06, - "loss": 0.8899, + "epoch": 0.98, + "grad_norm": 12.23647403717041, + "learning_rate": 1.346692883738443e-05, + "loss": 2.0373, "step": 7809 }, { - "epoch": 2.35, - "grad_norm": 8.324065208435059, - "learning_rate": 4.345995790317731e-06, - "loss": 0.7989, + "epoch": 0.98, + "grad_norm": 14.28429889678955, + "learning_rate": 1.3466092122327742e-05, + "loss": 1.3354, "step": 7810 }, { - "epoch": 2.35, - "grad_norm": 17.0432186126709, - "learning_rate": 4.343991179713341e-06, - "loss": 2.3516, + "epoch": 0.98, + "grad_norm": 6.032944679260254, + "learning_rate": 1.3465255407271054e-05, + "loss": 0.2188, "step": 7811 }, { - "epoch": 2.35, - "grad_norm": 20.02086639404297, - "learning_rate": 4.3419865691089504e-06, - "loss": 0.8566, + "epoch": 0.98, + "grad_norm": 7.6251325607299805, + "learning_rate": 1.3464418692214368e-05, + "loss": 1.3673, "step": 7812 }, { - "epoch": 2.35, - "grad_norm": 7.930890083312988, - "learning_rate": 4.339981958504561e-06, - "loss": 1.2351, + "epoch": 0.98, + "grad_norm": 38.285099029541016, + "learning_rate": 1.346358197715768e-05, + "loss": 2.4264, "step": 7813 }, { - "epoch": 2.35, - "grad_norm": 30.14038848876953, - "learning_rate": 4.337977347900171e-06, - "loss": 1.7222, + "epoch": 0.98, + "grad_norm": 5.613325595855713, + "learning_rate": 1.3462745262100993e-05, + "loss": 1.3284, "step": 7814 }, { - "epoch": 2.35, - "grad_norm": 12.645387649536133, - "learning_rate": 4.335972737295781e-06, - "loss": 1.3162, + "epoch": 0.98, + "grad_norm": 21.35129165649414, + "learning_rate": 1.3461908547044305e-05, + "loss": 0.6903, "step": 7815 }, { - "epoch": 2.35, - "grad_norm": 17.415475845336914, - "learning_rate": 4.333968126691391e-06, - "loss": 1.8934, + "epoch": 0.98, + "grad_norm": 49.54762649536133, + "learning_rate": 1.3461071831987617e-05, + "loss": 2.2245, "step": 7816 }, { - "epoch": 2.35, - "grad_norm": 13.79565143585205, - "learning_rate": 4.331963516087001e-06, - "loss": 1.4519, + "epoch": 0.98, + "grad_norm": 27.526241302490234, + "learning_rate": 1.3460235116930931e-05, + "loss": 2.7754, "step": 7817 }, { - "epoch": 2.35, - "grad_norm": 14.080390930175781, - "learning_rate": 4.32995890548261e-06, - "loss": 0.899, + "epoch": 0.98, + "grad_norm": 120.94630432128906, + "learning_rate": 1.3459398401874241e-05, + "loss": 2.2525, "step": 7818 }, { - "epoch": 2.35, - "grad_norm": 71.52656555175781, - "learning_rate": 4.32795429487822e-06, - "loss": 1.7353, + "epoch": 0.98, + "grad_norm": 12.913002967834473, + "learning_rate": 1.3458561686817555e-05, + "loss": 1.2333, "step": 7819 }, { - "epoch": 2.35, - "grad_norm": 20.801849365234375, - "learning_rate": 4.32594968427383e-06, - "loss": 1.2644, + "epoch": 0.98, + "grad_norm": 16.263164520263672, + "learning_rate": 1.3457724971760868e-05, + "loss": 0.798, "step": 7820 }, { - "epoch": 2.35, - "grad_norm": 40.78676223754883, - "learning_rate": 4.32394507366944e-06, - "loss": 2.8361, + "epoch": 0.98, + "grad_norm": 6.407655239105225, + "learning_rate": 1.3456888256704182e-05, + "loss": 0.7711, "step": 7821 }, { - "epoch": 2.35, - "grad_norm": 14.759114265441895, - "learning_rate": 4.3219404630650504e-06, - "loss": 1.0354, + "epoch": 0.98, + "grad_norm": 17.457874298095703, + "learning_rate": 1.3456051541647492e-05, + "loss": 1.9697, "step": 7822 }, { - "epoch": 2.35, - "grad_norm": 24.163408279418945, - "learning_rate": 4.31993585246066e-06, - "loss": 1.1916, + "epoch": 0.98, + "grad_norm": 13.347925186157227, + "learning_rate": 1.3455214826590806e-05, + "loss": 0.7008, "step": 7823 }, { - "epoch": 2.35, - "grad_norm": 8.51475715637207, - "learning_rate": 4.31793124185627e-06, - "loss": 1.5931, + "epoch": 0.98, + "grad_norm": 26.760360717773438, + "learning_rate": 1.3454378111534118e-05, + "loss": 3.1434, "step": 7824 }, { - "epoch": 2.35, - "grad_norm": 16.245559692382812, - "learning_rate": 4.31592663125188e-06, - "loss": 1.3994, + "epoch": 0.98, + "grad_norm": 14.519676208496094, + "learning_rate": 1.345354139647743e-05, + "loss": 0.5767, "step": 7825 }, { - "epoch": 2.35, - "grad_norm": 78.04993438720703, - "learning_rate": 4.31392202064749e-06, - "loss": 2.4156, + "epoch": 0.98, + "grad_norm": 57.62624740600586, + "learning_rate": 1.3452704681420744e-05, + "loss": 1.4939, "step": 7826 }, { - "epoch": 2.35, - "grad_norm": 11.318324089050293, - "learning_rate": 4.311917410043099e-06, - "loss": 1.7852, + "epoch": 0.98, + "grad_norm": 8.744379043579102, + "learning_rate": 1.3451867966364056e-05, + "loss": 1.1015, "step": 7827 }, { - "epoch": 2.35, - "grad_norm": 9.104801177978516, - "learning_rate": 4.309912799438709e-06, - "loss": 0.7264, + "epoch": 0.98, + "grad_norm": 21.934654235839844, + "learning_rate": 1.345103125130737e-05, + "loss": 1.5129, "step": 7828 }, { - "epoch": 2.35, - "grad_norm": 12.194317817687988, - "learning_rate": 4.307908188834319e-06, - "loss": 1.4084, + "epoch": 0.98, + "grad_norm": 10.702959060668945, + "learning_rate": 1.345019453625068e-05, + "loss": 1.1694, "step": 7829 }, { - "epoch": 2.35, - "grad_norm": 39.487327575683594, - "learning_rate": 4.305903578229929e-06, - "loss": 1.4993, + "epoch": 0.98, + "grad_norm": 13.19983959197998, + "learning_rate": 1.3449357821193993e-05, + "loss": 1.1448, "step": 7830 }, { - "epoch": 2.35, - "grad_norm": 24.5257511138916, - "learning_rate": 4.303898967625539e-06, - "loss": 1.7682, + "epoch": 0.98, + "grad_norm": 12.683655738830566, + "learning_rate": 1.3448521106137307e-05, + "loss": 1.0168, "step": 7831 }, { - "epoch": 2.35, - "grad_norm": 22.789091110229492, - "learning_rate": 4.301894357021149e-06, - "loss": 2.2842, + "epoch": 0.98, + "grad_norm": 44.0377197265625, + "learning_rate": 1.3447684391080617e-05, + "loss": 1.3388, "step": 7832 }, { - "epoch": 2.36, - "grad_norm": 14.122344017028809, - "learning_rate": 4.299889746416759e-06, - "loss": 0.7481, + "epoch": 0.98, + "grad_norm": 11.622760772705078, + "learning_rate": 1.344684767602393e-05, + "loss": 0.6615, "step": 7833 }, { - "epoch": 2.36, - "grad_norm": 9.010689735412598, - "learning_rate": 4.297885135812369e-06, - "loss": 1.0806, + "epoch": 0.98, + "grad_norm": 12.449114799499512, + "learning_rate": 1.3446010960967244e-05, + "loss": 2.0388, "step": 7834 }, { - "epoch": 2.36, - "grad_norm": 20.649883270263672, - "learning_rate": 4.295880525207979e-06, - "loss": 1.4924, + "epoch": 0.98, + "grad_norm": 10.657881736755371, + "learning_rate": 1.3445174245910558e-05, + "loss": 0.3719, "step": 7835 }, { - "epoch": 2.36, - "grad_norm": 24.006229400634766, - "learning_rate": 4.293875914603588e-06, - "loss": 1.4348, + "epoch": 0.98, + "grad_norm": 82.41954803466797, + "learning_rate": 1.3444337530853868e-05, + "loss": 1.0537, "step": 7836 }, { - "epoch": 2.36, - "grad_norm": 30.37126350402832, - "learning_rate": 4.291871303999198e-06, - "loss": 1.5881, + "epoch": 0.98, + "grad_norm": 5.820470333099365, + "learning_rate": 1.3443500815797182e-05, + "loss": 0.3194, "step": 7837 }, { - "epoch": 2.36, - "grad_norm": 10.542387962341309, - "learning_rate": 4.289866693394808e-06, - "loss": 0.7792, + "epoch": 0.98, + "grad_norm": 35.94833755493164, + "learning_rate": 1.3442664100740494e-05, + "loss": 1.8463, "step": 7838 }, { - "epoch": 2.36, - "grad_norm": 19.010889053344727, - "learning_rate": 4.2878620827904185e-06, - "loss": 1.4441, + "epoch": 0.98, + "grad_norm": 23.072025299072266, + "learning_rate": 1.3441827385683806e-05, + "loss": 2.4423, "step": 7839 }, { - "epoch": 2.36, - "grad_norm": 13.315712928771973, - "learning_rate": 4.285857472186028e-06, - "loss": 1.0043, + "epoch": 0.98, + "grad_norm": 26.65378761291504, + "learning_rate": 1.344099067062712e-05, + "loss": 1.1492, "step": 7840 }, { - "epoch": 2.36, - "grad_norm": 11.758505821228027, - "learning_rate": 4.283852861581639e-06, - "loss": 1.304, + "epoch": 0.98, + "grad_norm": 44.74949264526367, + "learning_rate": 1.3440153955570431e-05, + "loss": 2.5089, "step": 7841 }, { - "epoch": 2.36, - "grad_norm": 14.62195873260498, - "learning_rate": 4.281848250977248e-06, - "loss": 1.1863, + "epoch": 0.98, + "grad_norm": 9.149215698242188, + "learning_rate": 1.3439317240513745e-05, + "loss": 1.724, "step": 7842 }, { - "epoch": 2.36, - "grad_norm": 17.724573135375977, - "learning_rate": 4.279843640372858e-06, - "loss": 1.0244, + "epoch": 0.98, + "grad_norm": 16.55966567993164, + "learning_rate": 1.3438480525457055e-05, + "loss": 1.4478, "step": 7843 }, { - "epoch": 2.36, - "grad_norm": 49.88128662109375, - "learning_rate": 4.277839029768468e-06, - "loss": 1.6886, + "epoch": 0.98, + "grad_norm": 80.31683349609375, + "learning_rate": 1.3437643810400369e-05, + "loss": 2.3294, "step": 7844 }, { - "epoch": 2.36, - "grad_norm": 28.493061065673828, - "learning_rate": 4.275834419164077e-06, - "loss": 1.7018, + "epoch": 0.98, + "grad_norm": 35.48666000366211, + "learning_rate": 1.3436807095343683e-05, + "loss": 1.6704, "step": 7845 }, { - "epoch": 2.36, - "grad_norm": 17.226518630981445, - "learning_rate": 4.273829808559687e-06, - "loss": 1.3155, + "epoch": 0.98, + "grad_norm": 63.827598571777344, + "learning_rate": 1.3435970380286993e-05, + "loss": 2.8656, "step": 7846 }, { - "epoch": 2.36, - "grad_norm": 14.533112525939941, - "learning_rate": 4.2718251979552975e-06, - "loss": 1.6489, + "epoch": 0.98, + "grad_norm": 21.52463722229004, + "learning_rate": 1.3435133665230307e-05, + "loss": 0.9325, "step": 7847 }, { - "epoch": 2.36, - "grad_norm": 154.60177612304688, - "learning_rate": 4.2698205873509075e-06, - "loss": 3.9771, + "epoch": 0.98, + "grad_norm": 24.884567260742188, + "learning_rate": 1.343429695017362e-05, + "loss": 2.3111, "step": 7848 }, { - "epoch": 2.36, - "grad_norm": 24.33751678466797, - "learning_rate": 4.267815976746517e-06, - "loss": 1.5103, + "epoch": 0.99, + "grad_norm": 13.400162696838379, + "learning_rate": 1.3433460235116934e-05, + "loss": 2.0544, "step": 7849 }, { - "epoch": 2.36, - "grad_norm": 7.450550556182861, - "learning_rate": 4.265811366142128e-06, - "loss": 0.8299, + "epoch": 0.99, + "grad_norm": 12.546667098999023, + "learning_rate": 1.3432623520060244e-05, + "loss": 1.994, "step": 7850 }, { - "epoch": 2.36, - "grad_norm": 11.310479164123535, - "learning_rate": 4.263806755537737e-06, - "loss": 1.0471, + "epoch": 0.99, + "grad_norm": 60.43228530883789, + "learning_rate": 1.3431786805003558e-05, + "loss": 2.4401, "step": 7851 }, { - "epoch": 2.36, - "grad_norm": 22.815073013305664, - "learning_rate": 4.261802144933347e-06, - "loss": 2.6051, + "epoch": 0.99, + "grad_norm": 11.646256446838379, + "learning_rate": 1.343095008994687e-05, + "loss": 1.1565, "step": 7852 }, { - "epoch": 2.36, - "grad_norm": 49.23891067504883, - "learning_rate": 4.259797534328957e-06, - "loss": 1.1229, + "epoch": 0.99, + "grad_norm": 18.161865234375, + "learning_rate": 1.3430113374890182e-05, + "loss": 1.9924, "step": 7853 }, { - "epoch": 2.36, - "grad_norm": 12.846585273742676, - "learning_rate": 4.257792923724567e-06, - "loss": 1.0635, + "epoch": 0.99, + "grad_norm": 17.5528507232666, + "learning_rate": 1.3429276659833495e-05, + "loss": 2.2338, "step": 7854 }, { - "epoch": 2.36, - "grad_norm": 7.844564437866211, - "learning_rate": 4.2557883131201764e-06, - "loss": 0.8797, + "epoch": 0.99, + "grad_norm": 13.696662902832031, + "learning_rate": 1.3428439944776807e-05, + "loss": 1.6114, "step": 7855 }, { - "epoch": 2.36, - "grad_norm": 12.018916130065918, - "learning_rate": 4.2537837025157865e-06, - "loss": 1.2833, + "epoch": 0.99, + "grad_norm": 12.333256721496582, + "learning_rate": 1.3427603229720121e-05, + "loss": 1.2289, "step": 7856 }, { - "epoch": 2.36, - "grad_norm": 15.914363861083984, - "learning_rate": 4.251779091911397e-06, - "loss": 1.2172, + "epoch": 0.99, + "grad_norm": 20.567373275756836, + "learning_rate": 1.3426766514663431e-05, + "loss": 1.2595, "step": 7857 }, { - "epoch": 2.36, - "grad_norm": 20.0510311126709, - "learning_rate": 4.249774481307007e-06, - "loss": 0.7942, + "epoch": 0.99, + "grad_norm": 18.33525848388672, + "learning_rate": 1.3425929799606745e-05, + "loss": 1.8468, "step": 7858 }, { - "epoch": 2.36, - "grad_norm": 45.73860549926758, - "learning_rate": 4.247769870702617e-06, - "loss": 2.2663, + "epoch": 0.99, + "grad_norm": 32.14639663696289, + "learning_rate": 1.3425093084550058e-05, + "loss": 2.8181, "step": 7859 }, { - "epoch": 2.36, - "grad_norm": 18.577871322631836, - "learning_rate": 4.245765260098226e-06, - "loss": 1.8624, + "epoch": 0.99, + "grad_norm": 8.10889720916748, + "learning_rate": 1.3424256369493369e-05, + "loss": 1.4549, "step": 7860 }, { - "epoch": 2.36, - "grad_norm": 14.684171676635742, - "learning_rate": 4.243760649493836e-06, - "loss": 1.2921, + "epoch": 0.99, + "grad_norm": 17.031816482543945, + "learning_rate": 1.3423419654436682e-05, + "loss": 2.38, "step": 7861 }, { - "epoch": 2.36, - "grad_norm": 20.421449661254883, - "learning_rate": 4.241756038889446e-06, - "loss": 1.2322, + "epoch": 0.99, + "grad_norm": 16.39002227783203, + "learning_rate": 1.3422582939379996e-05, + "loss": 1.4777, "step": 7862 }, { - "epoch": 2.36, - "grad_norm": 11.350706100463867, - "learning_rate": 4.239751428285056e-06, - "loss": 1.3288, + "epoch": 0.99, + "grad_norm": 10.644102096557617, + "learning_rate": 1.3421746224323308e-05, + "loss": 1.2411, "step": 7863 }, { - "epoch": 2.36, - "grad_norm": 21.545028686523438, - "learning_rate": 4.2377468176806655e-06, - "loss": 1.4258, + "epoch": 0.99, + "grad_norm": 14.060710906982422, + "learning_rate": 1.342090950926662e-05, + "loss": 2.5573, "step": 7864 }, { - "epoch": 2.36, - "grad_norm": 18.22039222717285, - "learning_rate": 4.2357422070762764e-06, - "loss": 1.4584, + "epoch": 0.99, + "grad_norm": 16.674654006958008, + "learning_rate": 1.3420072794209934e-05, + "loss": 1.6149, "step": 7865 }, { - "epoch": 2.37, - "grad_norm": 28.657711029052734, - "learning_rate": 4.233737596471886e-06, - "loss": 2.444, + "epoch": 0.99, + "grad_norm": 16.86676025390625, + "learning_rate": 1.3419236079153246e-05, + "loss": 1.0251, "step": 7866 }, { - "epoch": 2.37, - "grad_norm": 7.091336727142334, - "learning_rate": 4.231732985867496e-06, - "loss": 1.0055, + "epoch": 0.99, + "grad_norm": 12.13718032836914, + "learning_rate": 1.3418399364096557e-05, + "loss": 1.0789, "step": 7867 }, { - "epoch": 2.37, - "grad_norm": 10.07259750366211, - "learning_rate": 4.229728375263106e-06, - "loss": 0.9513, + "epoch": 0.99, + "grad_norm": 21.83576202392578, + "learning_rate": 1.341756264903987e-05, + "loss": 0.7112, "step": 7868 }, { - "epoch": 2.37, - "grad_norm": 32.32286834716797, - "learning_rate": 4.227723764658715e-06, - "loss": 1.2243, + "epoch": 0.99, + "grad_norm": 19.9146728515625, + "learning_rate": 1.3416725933983183e-05, + "loss": 3.1556, "step": 7869 }, { - "epoch": 2.37, - "grad_norm": 14.63747787475586, - "learning_rate": 4.225719154054325e-06, - "loss": 1.3113, + "epoch": 0.99, + "grad_norm": 16.90733528137207, + "learning_rate": 1.3415889218926497e-05, + "loss": 1.7789, "step": 7870 }, { - "epoch": 2.37, - "grad_norm": 13.216215133666992, - "learning_rate": 4.223714543449935e-06, - "loss": 1.3608, + "epoch": 0.99, + "grad_norm": 15.959580421447754, + "learning_rate": 1.3415052503869807e-05, + "loss": 1.3213, "step": 7871 }, { - "epoch": 2.37, - "grad_norm": 22.928085327148438, - "learning_rate": 4.221709932845545e-06, - "loss": 1.5632, + "epoch": 0.99, + "grad_norm": 7.629356384277344, + "learning_rate": 1.341421578881312e-05, + "loss": 0.6035, "step": 7872 }, { - "epoch": 2.37, - "grad_norm": 14.180635452270508, - "learning_rate": 4.2197053222411546e-06, - "loss": 1.0618, + "epoch": 0.99, + "grad_norm": 12.714489936828613, + "learning_rate": 1.3413379073756434e-05, + "loss": 1.6759, "step": 7873 }, { - "epoch": 2.37, - "grad_norm": 13.465377807617188, - "learning_rate": 4.217700711636765e-06, - "loss": 2.2106, + "epoch": 0.99, + "grad_norm": 50.399253845214844, + "learning_rate": 1.3412542358699745e-05, + "loss": 2.6481, "step": 7874 }, { - "epoch": 2.37, - "grad_norm": 21.83644676208496, - "learning_rate": 4.215696101032375e-06, - "loss": 1.1141, + "epoch": 0.99, + "grad_norm": 16.95625114440918, + "learning_rate": 1.3411705643643058e-05, + "loss": 1.0485, "step": 7875 }, { - "epoch": 2.37, - "grad_norm": 9.465641021728516, - "learning_rate": 4.213691490427985e-06, - "loss": 0.9889, + "epoch": 0.99, + "grad_norm": 12.690945625305176, + "learning_rate": 1.3410868928586372e-05, + "loss": 1.2125, "step": 7876 }, { - "epoch": 2.37, - "grad_norm": 38.847965240478516, - "learning_rate": 4.211686879823594e-06, - "loss": 2.6969, + "epoch": 0.99, + "grad_norm": 12.52241039276123, + "learning_rate": 1.3410032213529684e-05, + "loss": 2.4324, "step": 7877 }, { - "epoch": 2.37, - "grad_norm": 12.433218002319336, - "learning_rate": 4.209682269219205e-06, - "loss": 0.5288, + "epoch": 0.99, + "grad_norm": 27.132232666015625, + "learning_rate": 1.3409195498472996e-05, + "loss": 1.5371, "step": 7878 }, { - "epoch": 2.37, - "grad_norm": 13.307668685913086, - "learning_rate": 4.207677658614814e-06, - "loss": 0.9523, + "epoch": 0.99, + "grad_norm": 15.67525863647461, + "learning_rate": 1.340835878341631e-05, + "loss": 1.0848, "step": 7879 }, { - "epoch": 2.37, - "grad_norm": 57.23488235473633, - "learning_rate": 4.205673048010424e-06, - "loss": 3.0293, + "epoch": 0.99, + "grad_norm": 7.077928066253662, + "learning_rate": 1.3407522068359621e-05, + "loss": 1.9246, "step": 7880 }, { - "epoch": 2.37, - "grad_norm": 16.167064666748047, - "learning_rate": 4.203668437406034e-06, - "loss": 1.9897, + "epoch": 0.99, + "grad_norm": 13.227967262268066, + "learning_rate": 1.3406685353302933e-05, + "loss": 1.9851, "step": 7881 }, { - "epoch": 2.37, - "grad_norm": 29.6549072265625, - "learning_rate": 4.2016638268016445e-06, - "loss": 1.3998, + "epoch": 0.99, + "grad_norm": 17.633745193481445, + "learning_rate": 1.3405848638246245e-05, + "loss": 1.5183, "step": 7882 }, { - "epoch": 2.37, - "grad_norm": 10.331962585449219, - "learning_rate": 4.199659216197254e-06, - "loss": 0.8686, + "epoch": 0.99, + "grad_norm": 8.09128189086914, + "learning_rate": 1.3405011923189559e-05, + "loss": 2.1152, "step": 7883 }, { - "epoch": 2.37, - "grad_norm": 16.08567237854004, - "learning_rate": 4.197654605592864e-06, - "loss": 1.3522, + "epoch": 0.99, + "grad_norm": 12.385307312011719, + "learning_rate": 1.3404175208132873e-05, + "loss": 1.4056, "step": 7884 }, { - "epoch": 2.37, - "grad_norm": 21.387054443359375, - "learning_rate": 4.195649994988474e-06, - "loss": 1.0533, + "epoch": 0.99, + "grad_norm": 17.078733444213867, + "learning_rate": 1.3403338493076183e-05, + "loss": 1.6283, "step": 7885 }, { - "epoch": 2.37, - "grad_norm": 18.664920806884766, - "learning_rate": 4.193645384384083e-06, - "loss": 1.7003, + "epoch": 0.99, + "grad_norm": 9.437175750732422, + "learning_rate": 1.3402501778019496e-05, + "loss": 0.3968, "step": 7886 }, { - "epoch": 2.37, - "grad_norm": 12.59611988067627, - "learning_rate": 4.191640773779694e-06, - "loss": 1.0916, + "epoch": 0.99, + "grad_norm": 21.20670509338379, + "learning_rate": 1.340166506296281e-05, + "loss": 1.8604, "step": 7887 }, { - "epoch": 2.37, - "grad_norm": 11.082344055175781, - "learning_rate": 4.189636163175303e-06, - "loss": 1.3448, + "epoch": 0.99, + "grad_norm": 7.615916728973389, + "learning_rate": 1.340082834790612e-05, + "loss": 0.7451, "step": 7888 }, { - "epoch": 2.37, - "grad_norm": 15.192607879638672, - "learning_rate": 4.187631552570913e-06, - "loss": 1.1133, + "epoch": 0.99, + "grad_norm": 6.689842224121094, + "learning_rate": 1.3399991632849434e-05, + "loss": 1.3682, "step": 7889 }, { - "epoch": 2.37, - "grad_norm": 12.27822494506836, - "learning_rate": 4.1856269419665235e-06, - "loss": 1.4396, + "epoch": 0.99, + "grad_norm": 16.554161071777344, + "learning_rate": 1.3399154917792748e-05, + "loss": 1.4628, "step": 7890 }, { - "epoch": 2.37, - "grad_norm": 18.4125919342041, - "learning_rate": 4.1836223313621335e-06, - "loss": 0.9025, + "epoch": 0.99, + "grad_norm": 9.20697021484375, + "learning_rate": 1.339831820273606e-05, + "loss": 1.1317, "step": 7891 }, { - "epoch": 2.37, - "grad_norm": 20.866134643554688, - "learning_rate": 4.181617720757743e-06, - "loss": 1.1648, + "epoch": 0.99, + "grad_norm": 12.420833587646484, + "learning_rate": 1.3397481487679372e-05, + "loss": 0.8767, "step": 7892 }, { - "epoch": 2.37, - "grad_norm": 16.63862419128418, - "learning_rate": 4.179613110153353e-06, - "loss": 1.4421, + "epoch": 0.99, + "grad_norm": 9.886767387390137, + "learning_rate": 1.3396644772622685e-05, + "loss": 0.9803, "step": 7893 }, { - "epoch": 2.37, - "grad_norm": 9.658064842224121, - "learning_rate": 4.177608499548963e-06, - "loss": 0.6854, + "epoch": 0.99, + "grad_norm": 22.698192596435547, + "learning_rate": 1.3395808057565997e-05, + "loss": 3.4859, "step": 7894 }, { - "epoch": 2.37, - "grad_norm": 12.731464385986328, - "learning_rate": 4.175603888944573e-06, - "loss": 1.374, + "epoch": 0.99, + "grad_norm": 20.673337936401367, + "learning_rate": 1.3394971342509309e-05, + "loss": 0.9118, "step": 7895 }, { - "epoch": 2.37, - "grad_norm": 8.585317611694336, - "learning_rate": 4.173599278340183e-06, - "loss": 0.5775, + "epoch": 0.99, + "grad_norm": 19.7998104095459, + "learning_rate": 1.3394134627452621e-05, + "loss": 0.3165, "step": 7896 }, { - "epoch": 2.37, - "grad_norm": 24.50328826904297, - "learning_rate": 4.171594667735792e-06, - "loss": 1.5935, + "epoch": 0.99, + "grad_norm": 8.360078811645508, + "learning_rate": 1.3393297912395935e-05, + "loss": 1.5516, "step": 7897 }, { - "epoch": 2.37, - "grad_norm": 16.08628273010254, - "learning_rate": 4.1695900571314024e-06, - "loss": 1.0627, + "epoch": 0.99, + "grad_norm": 21.86443519592285, + "learning_rate": 1.3392461197339248e-05, + "loss": 1.0845, "step": 7898 }, { - "epoch": 2.37, - "grad_norm": 15.948634147644043, - "learning_rate": 4.1675854465270125e-06, - "loss": 1.4136, + "epoch": 0.99, + "grad_norm": 14.555537223815918, + "learning_rate": 1.3391624482282559e-05, + "loss": 2.7609, "step": 7899 }, { - "epoch": 2.38, - "grad_norm": 31.818986892700195, - "learning_rate": 4.165580835922623e-06, - "loss": 1.8128, + "epoch": 0.99, + "grad_norm": 9.91486644744873, + "learning_rate": 1.3390787767225872e-05, + "loss": 0.8442, "step": 7900 }, { - "epoch": 2.38, - "grad_norm": 37.07505798339844, - "learning_rate": 4.163576225318232e-06, - "loss": 1.9246, + "epoch": 0.99, + "grad_norm": 16.55044174194336, + "learning_rate": 1.3389951052169186e-05, + "loss": 2.1581, "step": 7901 }, { - "epoch": 2.38, - "grad_norm": 15.36788558959961, - "learning_rate": 4.161571614713843e-06, - "loss": 1.9555, + "epoch": 0.99, + "grad_norm": 27.388978958129883, + "learning_rate": 1.3389114337112496e-05, + "loss": 1.3599, "step": 7902 }, { - "epoch": 2.38, - "grad_norm": 17.946441650390625, - "learning_rate": 4.159567004109452e-06, - "loss": 1.1702, + "epoch": 0.99, + "grad_norm": 5.923564910888672, + "learning_rate": 1.338827762205581e-05, + "loss": 0.4826, "step": 7903 }, { - "epoch": 2.38, - "grad_norm": 12.651650428771973, - "learning_rate": 4.157562393505062e-06, - "loss": 1.0666, + "epoch": 0.99, + "grad_norm": 11.348058700561523, + "learning_rate": 1.3387440906999124e-05, + "loss": 1.8191, "step": 7904 }, { - "epoch": 2.38, - "grad_norm": 31.045000076293945, - "learning_rate": 4.155557782900672e-06, - "loss": 1.553, + "epoch": 0.99, + "grad_norm": 7.931649684906006, + "learning_rate": 1.3386604191942435e-05, + "loss": 1.7959, "step": 7905 }, { - "epoch": 2.38, - "grad_norm": 11.69579029083252, - "learning_rate": 4.153553172296282e-06, - "loss": 1.5206, + "epoch": 0.99, + "grad_norm": 9.703704833984375, + "learning_rate": 1.3385767476885747e-05, + "loss": 1.1969, "step": 7906 }, { - "epoch": 2.38, - "grad_norm": 37.5781135559082, - "learning_rate": 4.1515485616918915e-06, - "loss": 2.1329, + "epoch": 0.99, + "grad_norm": 14.642962455749512, + "learning_rate": 1.3384930761829061e-05, + "loss": 0.6981, "step": 7907 }, { - "epoch": 2.38, - "grad_norm": 21.697532653808594, - "learning_rate": 4.149543951087502e-06, - "loss": 1.4265, + "epoch": 0.99, + "grad_norm": 9.679839134216309, + "learning_rate": 1.3384094046772373e-05, + "loss": 1.6064, "step": 7908 }, { - "epoch": 2.38, - "grad_norm": 9.745403289794922, - "learning_rate": 4.147539340483112e-06, - "loss": 1.1323, + "epoch": 0.99, + "grad_norm": 7.588835716247559, + "learning_rate": 1.3383257331715685e-05, + "loss": 0.8063, "step": 7909 }, { - "epoch": 2.38, - "grad_norm": 15.652579307556152, - "learning_rate": 4.145534729878721e-06, - "loss": 1.3761, + "epoch": 0.99, + "grad_norm": 5.758579730987549, + "learning_rate": 1.3382420616658997e-05, + "loss": 0.5944, "step": 7910 }, { - "epoch": 2.38, - "grad_norm": 12.394492149353027, - "learning_rate": 4.143530119274332e-06, - "loss": 0.9479, + "epoch": 0.99, + "grad_norm": 11.920259475708008, + "learning_rate": 1.338158390160231e-05, + "loss": 1.72, "step": 7911 }, { - "epoch": 2.38, - "grad_norm": 19.919248580932617, - "learning_rate": 4.141525508669941e-06, - "loss": 1.2497, + "epoch": 0.99, + "grad_norm": 29.732891082763672, + "learning_rate": 1.3380747186545624e-05, + "loss": 1.4716, "step": 7912 }, { - "epoch": 2.38, - "grad_norm": 13.425742149353027, - "learning_rate": 4.139520898065551e-06, - "loss": 1.0198, + "epoch": 0.99, + "grad_norm": 20.857481002807617, + "learning_rate": 1.3379910471488934e-05, + "loss": 2.462, "step": 7913 }, { - "epoch": 2.38, - "grad_norm": 17.13833999633789, - "learning_rate": 4.13751628746116e-06, - "loss": 2.0541, + "epoch": 0.99, + "grad_norm": 9.390600204467773, + "learning_rate": 1.3379073756432248e-05, + "loss": 2.1183, "step": 7914 }, { - "epoch": 2.38, - "grad_norm": 13.618558883666992, - "learning_rate": 4.135511676856771e-06, - "loss": 1.3842, + "epoch": 0.99, + "grad_norm": 16.786588668823242, + "learning_rate": 1.3378237041375562e-05, + "loss": 1.5882, "step": 7915 }, { - "epoch": 2.38, - "grad_norm": 5.930107593536377, - "learning_rate": 4.1335070662523806e-06, - "loss": 0.517, + "epoch": 0.99, + "grad_norm": 16.013607025146484, + "learning_rate": 1.3377400326318872e-05, + "loss": 1.1315, "step": 7916 }, { - "epoch": 2.38, - "grad_norm": 4.825802803039551, - "learning_rate": 4.131502455647991e-06, - "loss": 0.3964, + "epoch": 0.99, + "grad_norm": 13.470029830932617, + "learning_rate": 1.3376563611262186e-05, + "loss": 1.48, "step": 7917 }, { - "epoch": 2.38, - "grad_norm": 25.973148345947266, - "learning_rate": 4.129497845043601e-06, - "loss": 0.9199, + "epoch": 0.99, + "grad_norm": 8.650081634521484, + "learning_rate": 1.33757268962055e-05, + "loss": 0.4609, "step": 7918 }, { - "epoch": 2.38, - "grad_norm": 9.384675979614258, - "learning_rate": 4.127493234439211e-06, - "loss": 1.1662, + "epoch": 0.99, + "grad_norm": 16.66744613647461, + "learning_rate": 1.3374890181148811e-05, + "loss": 2.0696, "step": 7919 }, { - "epoch": 2.38, - "grad_norm": 59.6272087097168, - "learning_rate": 4.12548862383482e-06, - "loss": 2.4387, - "step": 7920 - }, - { - "epoch": 2.38, - "eval_loss": 0.17472867667675018, - "eval_runtime": 43.5367, - "eval_samples_per_second": 33.971, - "eval_steps_per_second": 33.971, + "epoch": 0.99, + "grad_norm": 13.903091430664062, + "learning_rate": 1.3374053466092123e-05, + "loss": 1.9955, "step": 7920 }, { - "epoch": 2.38, - "grad_norm": 17.422163009643555, - "learning_rate": 4.12348401323043e-06, - "loss": 0.9547, + "epoch": 0.99, + "grad_norm": 23.533159255981445, + "learning_rate": 1.3373216751035435e-05, + "loss": 2.6719, "step": 7921 }, { - "epoch": 2.38, - "grad_norm": 19.566699981689453, - "learning_rate": 4.12147940262604e-06, - "loss": 1.364, + "epoch": 0.99, + "grad_norm": 28.801076889038086, + "learning_rate": 1.3372380035978749e-05, + "loss": 2.1541, "step": 7922 }, { - "epoch": 2.38, - "grad_norm": 13.921435356140137, - "learning_rate": 4.11947479202165e-06, - "loss": 1.5498, + "epoch": 0.99, + "grad_norm": 31.399919509887695, + "learning_rate": 1.337154332092206e-05, + "loss": 2.4247, "step": 7923 }, { - "epoch": 2.38, - "grad_norm": 46.268829345703125, - "learning_rate": 4.11747018141726e-06, - "loss": 1.7792, + "epoch": 0.99, + "grad_norm": 19.266828536987305, + "learning_rate": 1.3370706605865373e-05, + "loss": 2.8681, "step": 7924 }, { - "epoch": 2.38, - "grad_norm": 37.72319793701172, - "learning_rate": 4.11546557081287e-06, - "loss": 1.582, + "epoch": 0.99, + "grad_norm": 8.171679496765137, + "learning_rate": 1.3369869890808686e-05, + "loss": 0.48, "step": 7925 }, { - "epoch": 2.38, - "grad_norm": 13.244497299194336, - "learning_rate": 4.11346096020848e-06, - "loss": 1.7341, + "epoch": 0.99, + "grad_norm": 35.95570373535156, + "learning_rate": 1.3369033175752e-05, + "loss": 2.1517, "step": 7926 }, { - "epoch": 2.38, - "grad_norm": 10.264554977416992, - "learning_rate": 4.11145634960409e-06, - "loss": 0.902, + "epoch": 0.99, + "grad_norm": 9.317283630371094, + "learning_rate": 1.336819646069531e-05, + "loss": 0.9883, "step": 7927 }, { - "epoch": 2.38, - "grad_norm": 16.762826919555664, - "learning_rate": 4.1094517389997e-06, - "loss": 1.2989, + "epoch": 0.99, + "grad_norm": 13.376811981201172, + "learning_rate": 1.3367359745638624e-05, + "loss": 0.2741, "step": 7928 }, { - "epoch": 2.38, - "grad_norm": 27.864768981933594, - "learning_rate": 4.107447128395309e-06, - "loss": 1.0548, + "epoch": 1.0, + "grad_norm": 27.397188186645508, + "learning_rate": 1.3366523030581938e-05, + "loss": 0.8773, "step": 7929 }, { - "epoch": 2.38, - "grad_norm": 9.768196105957031, - "learning_rate": 4.105442517790919e-06, - "loss": 0.7929, + "epoch": 1.0, + "grad_norm": 5.359547138214111, + "learning_rate": 1.3365686315525248e-05, + "loss": 0.2089, "step": 7930 }, { - "epoch": 2.38, - "grad_norm": 8.759325981140137, - "learning_rate": 4.103437907186529e-06, - "loss": 0.6668, + "epoch": 1.0, + "grad_norm": 11.692357063293457, + "learning_rate": 1.3364849600468562e-05, + "loss": 1.8957, "step": 7931 }, { - "epoch": 2.38, - "grad_norm": 29.228063583374023, - "learning_rate": 4.101433296582139e-06, - "loss": 1.5968, + "epoch": 1.0, + "grad_norm": 11.032751083374023, + "learning_rate": 1.3364012885411875e-05, + "loss": 1.8149, "step": 7932 }, { - "epoch": 2.39, - "grad_norm": 15.066993713378906, - "learning_rate": 4.0994286859777495e-06, - "loss": 1.457, + "epoch": 1.0, + "grad_norm": 22.042510986328125, + "learning_rate": 1.3363176170355187e-05, + "loss": 1.264, "step": 7933 }, { - "epoch": 2.39, - "grad_norm": 12.962151527404785, - "learning_rate": 4.097424075373359e-06, - "loss": 1.568, + "epoch": 1.0, + "grad_norm": 24.833213806152344, + "learning_rate": 1.3362339455298499e-05, + "loss": 1.1884, "step": 7934 }, { - "epoch": 2.39, - "grad_norm": 9.651278495788574, - "learning_rate": 4.095419464768969e-06, - "loss": 0.9769, + "epoch": 1.0, + "grad_norm": 14.692005157470703, + "learning_rate": 1.3361502740241811e-05, + "loss": 1.085, "step": 7935 }, { - "epoch": 2.39, - "grad_norm": 27.313953399658203, - "learning_rate": 4.093414854164579e-06, - "loss": 1.1726, + "epoch": 1.0, + "grad_norm": 8.472384452819824, + "learning_rate": 1.3360666025185125e-05, + "loss": 1.341, "step": 7936 }, { - "epoch": 2.39, - "grad_norm": 30.882871627807617, - "learning_rate": 4.091410243560189e-06, - "loss": 2.3507, + "epoch": 1.0, + "grad_norm": 19.474328994750977, + "learning_rate": 1.3359829310128437e-05, + "loss": 1.6633, "step": 7937 }, { - "epoch": 2.39, - "grad_norm": 16.88389015197754, - "learning_rate": 4.089405632955798e-06, - "loss": 0.9228, + "epoch": 1.0, + "grad_norm": 12.89220905303955, + "learning_rate": 1.3358992595071749e-05, + "loss": 0.9524, "step": 7938 }, { - "epoch": 2.39, - "grad_norm": 54.354469299316406, - "learning_rate": 4.087401022351409e-06, - "loss": 2.5421, + "epoch": 1.0, + "grad_norm": 38.02602767944336, + "learning_rate": 1.3358155880015062e-05, + "loss": 1.6038, "step": 7939 }, { - "epoch": 2.39, - "grad_norm": 18.750852584838867, - "learning_rate": 4.085396411747018e-06, - "loss": 1.2235, + "epoch": 1.0, + "grad_norm": 7.536783218383789, + "learning_rate": 1.3357319164958376e-05, + "loss": 0.5629, "step": 7940 }, { - "epoch": 2.39, - "grad_norm": 7.334155559539795, - "learning_rate": 4.0833918011426284e-06, - "loss": 0.647, + "epoch": 1.0, + "grad_norm": 23.697574615478516, + "learning_rate": 1.3356482449901686e-05, + "loss": 2.2301, "step": 7941 }, { - "epoch": 2.39, - "grad_norm": 14.79666519165039, - "learning_rate": 4.0813871905382385e-06, - "loss": 0.8448, + "epoch": 1.0, + "grad_norm": 16.0355281829834, + "learning_rate": 1.3355645734845e-05, + "loss": 1.0149, "step": 7942 }, { - "epoch": 2.39, - "grad_norm": 35.3593864440918, - "learning_rate": 4.079382579933849e-06, - "loss": 1.4009, + "epoch": 1.0, + "grad_norm": 11.679667472839355, + "learning_rate": 1.3354809019788313e-05, + "loss": 1.4238, "step": 7943 }, { - "epoch": 2.39, - "grad_norm": 25.9478702545166, - "learning_rate": 4.077377969329458e-06, - "loss": 1.7573, + "epoch": 1.0, + "grad_norm": 7.693532466888428, + "learning_rate": 1.3353972304731624e-05, + "loss": 1.9168, "step": 7944 }, { - "epoch": 2.39, - "grad_norm": 13.793614387512207, - "learning_rate": 4.075373358725068e-06, - "loss": 1.5591, + "epoch": 1.0, + "grad_norm": 23.826108932495117, + "learning_rate": 1.3353135589674937e-05, + "loss": 2.9985, "step": 7945 }, { - "epoch": 2.39, - "grad_norm": 15.494868278503418, - "learning_rate": 4.073368748120678e-06, - "loss": 1.6407, + "epoch": 1.0, + "grad_norm": 16.03788185119629, + "learning_rate": 1.3352298874618251e-05, + "loss": 0.7254, "step": 7946 }, { - "epoch": 2.39, - "grad_norm": 26.147010803222656, - "learning_rate": 4.071364137516287e-06, - "loss": 1.6419, + "epoch": 1.0, + "grad_norm": 57.76662063598633, + "learning_rate": 1.3351462159561563e-05, + "loss": 1.2052, "step": 7947 }, { - "epoch": 2.39, - "grad_norm": 19.630136489868164, - "learning_rate": 4.069359526911898e-06, - "loss": 1.5473, + "epoch": 1.0, + "grad_norm": 128.8223114013672, + "learning_rate": 1.3350625444504875e-05, + "loss": 2.3229, "step": 7948 }, { - "epoch": 2.39, - "grad_norm": 75.69630432128906, - "learning_rate": 4.0673549163075074e-06, - "loss": 2.2357, + "epoch": 1.0, + "grad_norm": 12.937859535217285, + "learning_rate": 1.3349788729448187e-05, + "loss": 1.7812, "step": 7949 }, { - "epoch": 2.39, - "grad_norm": 82.69189453125, - "learning_rate": 4.0653503057031175e-06, - "loss": 1.675, + "epoch": 1.0, + "grad_norm": 83.09746551513672, + "learning_rate": 1.33489520143915e-05, + "loss": 1.2496, "step": 7950 }, { - "epoch": 2.39, - "grad_norm": 12.869571685791016, - "learning_rate": 4.063345695098728e-06, - "loss": 1.1058, + "epoch": 1.0, + "grad_norm": 18.56004524230957, + "learning_rate": 1.3348115299334812e-05, + "loss": 3.7962, "step": 7951 }, { - "epoch": 2.39, - "grad_norm": 16.09258270263672, - "learning_rate": 4.061341084494338e-06, - "loss": 1.3042, + "epoch": 1.0, + "grad_norm": 7.800342559814453, + "learning_rate": 1.3347278584278124e-05, + "loss": 0.9315, "step": 7952 }, { - "epoch": 2.39, - "grad_norm": 15.368246078491211, - "learning_rate": 4.059336473889947e-06, - "loss": 0.7744, + "epoch": 1.0, + "grad_norm": 10.73358154296875, + "learning_rate": 1.3346441869221438e-05, + "loss": 1.1961, "step": 7953 }, { - "epoch": 2.39, - "grad_norm": 56.09071350097656, - "learning_rate": 4.057331863285557e-06, - "loss": 1.4905, + "epoch": 1.0, + "grad_norm": 23.34194564819336, + "learning_rate": 1.3345605154164752e-05, + "loss": 1.1545, "step": 7954 }, { - "epoch": 2.39, - "grad_norm": 11.899616241455078, - "learning_rate": 4.055327252681167e-06, - "loss": 1.1252, + "epoch": 1.0, + "grad_norm": 37.151573181152344, + "learning_rate": 1.3344768439108062e-05, + "loss": 1.6355, "step": 7955 }, { - "epoch": 2.39, - "grad_norm": 17.915721893310547, - "learning_rate": 4.053322642076777e-06, - "loss": 1.8459, + "epoch": 1.0, + "grad_norm": 29.568134307861328, + "learning_rate": 1.3343931724051376e-05, + "loss": 0.9761, "step": 7956 }, { - "epoch": 2.39, - "grad_norm": 12.431897163391113, - "learning_rate": 4.051318031472386e-06, - "loss": 1.0242, + "epoch": 1.0, + "grad_norm": 8.467724800109863, + "learning_rate": 1.334309500899469e-05, + "loss": 1.0828, "step": 7957 }, { - "epoch": 2.39, - "grad_norm": 15.987489700317383, - "learning_rate": 4.0493134208679965e-06, - "loss": 2.4057, + "epoch": 1.0, + "grad_norm": 12.173320770263672, + "learning_rate": 1.3342258293938e-05, + "loss": 1.5829, "step": 7958 }, { - "epoch": 2.39, - "grad_norm": 14.070905685424805, - "learning_rate": 4.0473088102636066e-06, - "loss": 1.2005, + "epoch": 1.0, + "grad_norm": 7.051929950714111, + "learning_rate": 1.3341421578881313e-05, + "loss": 0.9812, "step": 7959 }, { - "epoch": 2.39, - "grad_norm": 26.776086807250977, - "learning_rate": 4.045304199659217e-06, - "loss": 1.1287, + "epoch": 1.0, + "grad_norm": 10.67720890045166, + "learning_rate": 1.3340584863824627e-05, + "loss": 1.0112, "step": 7960 }, { - "epoch": 2.39, - "grad_norm": 21.03402328491211, - "learning_rate": 4.043299589054827e-06, - "loss": 1.6414, + "epoch": 1.0, + "grad_norm": 11.94753360748291, + "learning_rate": 1.3339748148767939e-05, + "loss": 0.6997, "step": 7961 }, { - "epoch": 2.39, - "grad_norm": 16.39931297302246, - "learning_rate": 4.041294978450436e-06, - "loss": 1.7569, + "epoch": 1.0, + "grad_norm": 39.415924072265625, + "learning_rate": 1.333891143371125e-05, + "loss": 1.9105, "step": 7962 }, { - "epoch": 2.39, - "grad_norm": 22.343650817871094, - "learning_rate": 4.039290367846046e-06, - "loss": 1.0703, + "epoch": 1.0, + "grad_norm": 63.462162017822266, + "learning_rate": 1.3338074718654563e-05, + "loss": 2.275, "step": 7963 }, { - "epoch": 2.39, - "grad_norm": 21.395599365234375, - "learning_rate": 4.037285757241656e-06, - "loss": 2.1687, + "epoch": 1.0, + "grad_norm": 17.261051177978516, + "learning_rate": 1.3337238003597876e-05, + "loss": 1.5312, "step": 7964 }, { - "epoch": 2.39, - "grad_norm": 53.524776458740234, - "learning_rate": 4.035281146637266e-06, - "loss": 1.6589, + "epoch": 1.0, + "grad_norm": 23.740114212036133, + "learning_rate": 1.3336401288541188e-05, + "loss": 1.1335, "step": 7965 }, { - "epoch": 2.4, - "grad_norm": 12.918240547180176, - "learning_rate": 4.0332765360328755e-06, - "loss": 0.878, + "epoch": 1.0, + "grad_norm": 12.573298454284668, + "learning_rate": 1.33355645734845e-05, + "loss": 0.9672, "step": 7966 }, { - "epoch": 2.4, - "grad_norm": 10.968676567077637, - "learning_rate": 4.031271925428486e-06, - "loss": 2.1511, + "epoch": 1.0, + "grad_norm": 16.43240737915039, + "learning_rate": 1.3334727858427814e-05, + "loss": 1.1471, "step": 7967 }, { - "epoch": 2.4, - "grad_norm": 21.93174934387207, - "learning_rate": 4.029267314824096e-06, - "loss": 1.5569, + "epoch": 1.0, + "grad_norm": 25.22066879272461, + "learning_rate": 1.3333891143371128e-05, + "loss": 2.2558, "step": 7968 }, { - "epoch": 2.4, - "grad_norm": 18.74276351928711, - "learning_rate": 4.027262704219706e-06, - "loss": 0.7945, + "epoch": 1.0, + "grad_norm": 15.474662780761719, + "learning_rate": 1.3333054428314438e-05, + "loss": 1.2835, "step": 7969 }, { - "epoch": 2.4, - "grad_norm": 9.503168106079102, - "learning_rate": 4.025258093615316e-06, - "loss": 0.8866, + "epoch": 1.0, + "grad_norm": 7.968123435974121, + "learning_rate": 1.3332217713257751e-05, + "loss": 1.2493, "step": 7970 }, { - "epoch": 2.4, - "grad_norm": 39.47118377685547, - "learning_rate": 4.023253483010925e-06, - "loss": 1.4191, + "epoch": 1.0, + "grad_norm": 22.447425842285156, + "learning_rate": 1.3331380998201065e-05, + "loss": 3.0713, "step": 7971 }, { - "epoch": 2.4, - "grad_norm": 11.129521369934082, - "learning_rate": 4.021248872406535e-06, - "loss": 1.1143, + "epoch": 1.0, + "grad_norm": 23.46609878540039, + "learning_rate": 1.3330544283144375e-05, + "loss": 1.6728, "step": 7972 }, { - "epoch": 2.4, - "grad_norm": 13.493324279785156, - "learning_rate": 4.019244261802145e-06, - "loss": 1.5284, + "epoch": 1.0, + "grad_norm": 19.523942947387695, + "learning_rate": 1.3329707568087689e-05, + "loss": 2.5787, "step": 7973 }, { - "epoch": 2.4, - "grad_norm": 12.583949089050293, - "learning_rate": 4.017239651197755e-06, - "loss": 1.6311, + "epoch": 1.0, + "grad_norm": 18.46872329711914, + "learning_rate": 1.3328870853031001e-05, + "loss": 3.5041, "step": 7974 }, { - "epoch": 2.4, - "grad_norm": 7.225354194641113, - "learning_rate": 4.0152350405933645e-06, - "loss": 0.5856, + "epoch": 1.0, + "grad_norm": 30.07337188720703, + "learning_rate": 1.3328034137974315e-05, + "loss": 1.852, "step": 7975 }, { - "epoch": 2.4, - "grad_norm": 15.300594329833984, - "learning_rate": 4.0132304299889755e-06, - "loss": 1.0028, + "epoch": 1.0, + "grad_norm": 15.493572235107422, + "learning_rate": 1.3327197422917627e-05, + "loss": 2.3278, "step": 7976 }, { - "epoch": 2.4, - "grad_norm": 16.86861228942871, - "learning_rate": 4.011225819384585e-06, - "loss": 1.7745, + "epoch": 1.0, + "grad_norm": 13.22238540649414, + "learning_rate": 1.3326360707860939e-05, + "loss": 1.1271, "step": 7977 }, { - "epoch": 2.4, - "grad_norm": 20.883081436157227, - "learning_rate": 4.009221208780195e-06, - "loss": 1.6959, + "epoch": 1.0, + "grad_norm": 19.484376907348633, + "learning_rate": 1.3325523992804252e-05, + "loss": 0.7411, "step": 7978 }, { - "epoch": 2.4, - "grad_norm": 120.636962890625, - "learning_rate": 4.007216598175805e-06, - "loss": 2.2987, + "epoch": 1.0, + "grad_norm": 11.625712394714355, + "learning_rate": 1.3324687277747562e-05, + "loss": 1.6891, "step": 7979 }, { - "epoch": 2.4, - "grad_norm": 76.55606842041016, - "learning_rate": 4.005211987571415e-06, - "loss": 1.4258, + "epoch": 1.0, + "grad_norm": 28.404155731201172, + "learning_rate": 1.3323850562690876e-05, + "loss": 1.3849, "step": 7980 }, { - "epoch": 2.4, - "grad_norm": 15.42659854888916, - "learning_rate": 4.003207376967024e-06, - "loss": 1.5873, + "epoch": 1.0, + "grad_norm": 6.87016487121582, + "learning_rate": 1.332301384763419e-05, + "loss": 0.3202, "step": 7981 }, { - "epoch": 2.4, - "grad_norm": 9.378588676452637, - "learning_rate": 4.001202766362634e-06, - "loss": 0.9196, + "epoch": 1.0, + "grad_norm": 12.507184982299805, + "learning_rate": 1.3322177132577503e-05, + "loss": 1.6465, "step": 7982 }, { - "epoch": 2.4, - "grad_norm": 12.483980178833008, - "learning_rate": 3.999198155758244e-06, - "loss": 0.9071, + "epoch": 1.0, + "grad_norm": 36.98790740966797, + "learning_rate": 1.3321340417520814e-05, + "loss": 2.302, "step": 7983 }, { - "epoch": 2.4, - "grad_norm": 7.117259502410889, - "learning_rate": 3.9971935451538544e-06, - "loss": 0.6292, + "epoch": 1.0, + "grad_norm": 8.607172012329102, + "learning_rate": 1.3320503702464127e-05, + "loss": 0.8184, "step": 7984 }, { - "epoch": 2.4, - "grad_norm": 14.306236267089844, - "learning_rate": 3.9951889345494645e-06, - "loss": 1.0767, + "epoch": 1.0, + "grad_norm": 12.417741775512695, + "learning_rate": 1.3319666987407441e-05, + "loss": 1.9958, "step": 7985 }, { - "epoch": 2.4, - "grad_norm": 46.9137077331543, - "learning_rate": 3.993184323945074e-06, - "loss": 2.4379, + "epoch": 1.0, + "grad_norm": 17.457847595214844, + "learning_rate": 1.3318830272350751e-05, + "loss": 1.7666, "step": 7986 }, { - "epoch": 2.4, - "grad_norm": 17.20838737487793, - "learning_rate": 3.991179713340684e-06, - "loss": 1.2174, + "epoch": 1.0, + "grad_norm": 18.212696075439453, + "learning_rate": 1.3317993557294065e-05, + "loss": 1.4751, "step": 7987 }, { - "epoch": 2.4, - "grad_norm": 9.353306770324707, - "learning_rate": 3.989175102736294e-06, - "loss": 1.0121, + "epoch": 1.0, + "grad_norm": 9.338573455810547, + "learning_rate": 1.3317156842237377e-05, + "loss": 1.1111, "step": 7988 }, { - "epoch": 2.4, - "grad_norm": 17.213781356811523, - "learning_rate": 3.987170492131904e-06, - "loss": 1.4443, + "epoch": 1.0, + "grad_norm": 8.514516830444336, + "learning_rate": 1.331632012718069e-05, + "loss": 0.594, "step": 7989 }, { - "epoch": 2.4, - "grad_norm": 6.010331630706787, - "learning_rate": 3.985165881527513e-06, - "loss": 0.8314, + "epoch": 1.0, + "grad_norm": 61.38825225830078, + "learning_rate": 1.3315483412124002e-05, + "loss": 1.7547, "step": 7990 }, { - "epoch": 2.4, - "grad_norm": 12.470282554626465, - "learning_rate": 3.983161270923124e-06, - "loss": 1.9135, + "epoch": 1.0, + "grad_norm": 9.940217018127441, + "learning_rate": 1.3314646697067314e-05, + "loss": 1.8636, "step": 7991 }, { - "epoch": 2.4, - "grad_norm": 25.282251358032227, - "learning_rate": 3.9811566603187334e-06, - "loss": 1.9866, + "epoch": 1.0, + "grad_norm": 20.792577743530273, + "learning_rate": 1.3313809982010628e-05, + "loss": 1.5036, "step": 7992 }, { - "epoch": 2.4, - "grad_norm": 12.670585632324219, - "learning_rate": 3.9791520497143435e-06, - "loss": 0.8544, + "epoch": 1.0, + "grad_norm": 12.03057861328125, + "learning_rate": 1.3312973266953938e-05, + "loss": 1.1097, "step": 7993 }, { - "epoch": 2.4, - "grad_norm": 21.99867057800293, - "learning_rate": 3.977147439109954e-06, - "loss": 0.906, + "epoch": 1.0, + "grad_norm": 8.143668174743652, + "learning_rate": 1.3312136551897252e-05, + "loss": 1.62, "step": 7994 }, { - "epoch": 2.4, - "grad_norm": 26.105510711669922, - "learning_rate": 3.975142828505563e-06, - "loss": 1.1245, + "epoch": 1.0, + "grad_norm": 22.409191131591797, + "learning_rate": 1.3311299836840566e-05, + "loss": 2.4149, "step": 7995 }, { - "epoch": 2.4, - "grad_norm": 27.73430824279785, - "learning_rate": 3.973138217901173e-06, - "loss": 1.5627, + "epoch": 1.0, + "grad_norm": 20.784591674804688, + "learning_rate": 1.331046312178388e-05, + "loss": 1.3993, "step": 7996 }, { - "epoch": 2.4, - "grad_norm": 31.076269149780273, - "learning_rate": 3.971133607296783e-06, - "loss": 1.5272, + "epoch": 1.0, + "grad_norm": 3.964128017425537, + "learning_rate": 1.330962640672719e-05, + "loss": 0.265, "step": 7997 }, { - "epoch": 2.4, - "grad_norm": 20.252588272094727, - "learning_rate": 3.969128996692393e-06, - "loss": 1.3878, + "epoch": 1.0, + "grad_norm": 11.53171443939209, + "learning_rate": 1.3308789691670503e-05, + "loss": 0.9298, "step": 7998 }, { - "epoch": 2.4, - "grad_norm": 15.601667404174805, - "learning_rate": 3.967124386088002e-06, - "loss": 1.6348, + "epoch": 1.0, + "grad_norm": 20.267475128173828, + "learning_rate": 1.3307952976613817e-05, + "loss": 2.2056, "step": 7999 }, { - "epoch": 2.41, - "grad_norm": 15.273148536682129, - "learning_rate": 3.965119775483612e-06, - "loss": 2.6779, + "epoch": 1.0, + "grad_norm": 9.326493263244629, + "learning_rate": 1.3307116261557127e-05, + "loss": 1.0893, "step": 8000 }, { - "epoch": 2.41, - "grad_norm": 14.738374710083008, - "learning_rate": 3.9631151648792225e-06, - "loss": 1.3222, + "epoch": 1.0, + "eval_loss": 0.10294318199157715, + "eval_runtime": 191.9003, + "eval_samples_per_second": 18.458, + "eval_steps_per_second": 18.458, + "step": 8000 + }, + { + "epoch": 1.0, + "grad_norm": 10.06924057006836, + "learning_rate": 1.330627954650044e-05, + "loss": 2.4817, "step": 8001 }, { - "epoch": 2.41, - "grad_norm": 12.968079566955566, - "learning_rate": 3.9611105542748326e-06, - "loss": 0.8718, + "epoch": 1.0, + "grad_norm": 23.34333610534668, + "learning_rate": 1.3305442831443753e-05, + "loss": 4.3463, "step": 8002 }, { - "epoch": 2.41, - "grad_norm": 15.838987350463867, - "learning_rate": 3.959105943670442e-06, - "loss": 1.1175, + "epoch": 1.0, + "grad_norm": 42.41044235229492, + "learning_rate": 1.3304606116387066e-05, + "loss": 3.5817, "step": 8003 }, { - "epoch": 2.41, - "grad_norm": 11.581687927246094, - "learning_rate": 3.957101333066053e-06, - "loss": 2.2458, + "epoch": 1.0, + "grad_norm": 5.411513805389404, + "learning_rate": 1.3303769401330378e-05, + "loss": 0.5194, "step": 8004 }, { - "epoch": 2.41, - "grad_norm": 13.473747253417969, - "learning_rate": 3.955096722461662e-06, - "loss": 1.0507, + "epoch": 1.0, + "grad_norm": 7.918196678161621, + "learning_rate": 1.330293268627369e-05, + "loss": 1.5817, "step": 8005 }, { - "epoch": 2.41, - "grad_norm": 22.461040496826172, - "learning_rate": 3.953092111857272e-06, - "loss": 1.8377, + "epoch": 1.0, + "grad_norm": 38.20075225830078, + "learning_rate": 1.3302095971217004e-05, + "loss": 3.9696, "step": 8006 }, { - "epoch": 2.41, - "grad_norm": 9.827531814575195, - "learning_rate": 3.951087501252882e-06, - "loss": 1.3932, + "epoch": 1.0, + "grad_norm": 32.289485931396484, + "learning_rate": 1.3301259256160314e-05, + "loss": 0.9916, "step": 8007 }, { - "epoch": 2.41, - "grad_norm": 17.85820198059082, - "learning_rate": 3.949082890648492e-06, - "loss": 1.0887, + "epoch": 1.0, + "grad_norm": 44.22406005859375, + "learning_rate": 1.3300422541103628e-05, + "loss": 2.6136, "step": 8008 }, { - "epoch": 2.41, - "grad_norm": 10.893664360046387, - "learning_rate": 3.9470782800441015e-06, - "loss": 0.6649, + "epoch": 1.01, + "grad_norm": 23.457265853881836, + "learning_rate": 1.3299585826046941e-05, + "loss": 2.8706, "step": 8009 }, { - "epoch": 2.41, - "grad_norm": 27.924177169799805, - "learning_rate": 3.9450736694397116e-06, - "loss": 1.7343, + "epoch": 1.01, + "grad_norm": 26.69243049621582, + "learning_rate": 1.3298749110990255e-05, + "loss": 2.6025, "step": 8010 }, { - "epoch": 2.41, - "grad_norm": 7.996644973754883, - "learning_rate": 3.943069058835322e-06, - "loss": 1.1344, + "epoch": 1.01, + "grad_norm": 14.823225021362305, + "learning_rate": 1.3297912395933565e-05, + "loss": 0.8757, "step": 8011 }, { - "epoch": 2.41, - "grad_norm": 29.98558807373047, - "learning_rate": 3.941064448230931e-06, - "loss": 1.4769, + "epoch": 1.01, + "grad_norm": 7.193357467651367, + "learning_rate": 1.3297075680876879e-05, + "loss": 1.0568, "step": 8012 }, { - "epoch": 2.41, - "grad_norm": 27.507963180541992, - "learning_rate": 3.939059837626542e-06, - "loss": 0.83, + "epoch": 1.01, + "grad_norm": 29.412263870239258, + "learning_rate": 1.3296238965820191e-05, + "loss": 1.5548, "step": 8013 }, { - "epoch": 2.41, - "grad_norm": 7.844414710998535, - "learning_rate": 3.937055227022151e-06, - "loss": 0.9977, + "epoch": 1.01, + "grad_norm": 10.851529121398926, + "learning_rate": 1.3295402250763503e-05, + "loss": 2.2944, "step": 8014 }, { - "epoch": 2.41, - "grad_norm": 51.909061431884766, - "learning_rate": 3.935050616417761e-06, - "loss": 2.3273, + "epoch": 1.01, + "grad_norm": 18.598386764526367, + "learning_rate": 1.3294565535706817e-05, + "loss": 1.5626, "step": 8015 }, { - "epoch": 2.41, - "grad_norm": 10.7676420211792, - "learning_rate": 3.933046005813371e-06, - "loss": 0.986, + "epoch": 1.01, + "grad_norm": 64.27809143066406, + "learning_rate": 1.3293728820650129e-05, + "loss": 1.3406, "step": 8016 }, { - "epoch": 2.41, - "grad_norm": 14.182157516479492, - "learning_rate": 3.931041395208981e-06, - "loss": 1.3547, + "epoch": 1.01, + "grad_norm": 88.03241729736328, + "learning_rate": 1.3292892105593442e-05, + "loss": 1.6087, "step": 8017 }, { - "epoch": 2.41, - "grad_norm": 15.017866134643555, - "learning_rate": 3.9290367846045905e-06, - "loss": 0.7605, + "epoch": 1.01, + "grad_norm": 33.14046096801758, + "learning_rate": 1.3292055390536754e-05, + "loss": 2.5191, "step": 8018 }, { - "epoch": 2.41, - "grad_norm": 22.285579681396484, - "learning_rate": 3.927032174000201e-06, - "loss": 1.5245, + "epoch": 1.01, + "grad_norm": 10.153793334960938, + "learning_rate": 1.3291218675480066e-05, + "loss": 0.6017, "step": 8019 }, { - "epoch": 2.41, - "grad_norm": 69.15910339355469, - "learning_rate": 3.925027563395811e-06, - "loss": 2.0588, + "epoch": 1.01, + "grad_norm": 18.304969787597656, + "learning_rate": 1.329038196042338e-05, + "loss": 1.1005, "step": 8020 }, { - "epoch": 2.41, - "grad_norm": 57.589107513427734, - "learning_rate": 3.923022952791421e-06, - "loss": 2.0236, + "epoch": 1.01, + "grad_norm": 80.81324005126953, + "learning_rate": 1.328954524536669e-05, + "loss": 1.8401, "step": 8021 }, { - "epoch": 2.41, - "grad_norm": 38.282440185546875, - "learning_rate": 3.921018342187031e-06, - "loss": 1.923, + "epoch": 1.01, + "grad_norm": 10.775973320007324, + "learning_rate": 1.3288708530310004e-05, + "loss": 0.8631, "step": 8022 }, { - "epoch": 2.41, - "grad_norm": 16.257158279418945, - "learning_rate": 3.91901373158264e-06, - "loss": 1.2573, + "epoch": 1.01, + "grad_norm": 11.5303373336792, + "learning_rate": 1.3287871815253317e-05, + "loss": 1.0476, "step": 8023 }, { - "epoch": 2.41, - "grad_norm": 13.993424415588379, - "learning_rate": 3.91700912097825e-06, - "loss": 1.1249, + "epoch": 1.01, + "grad_norm": 9.755226135253906, + "learning_rate": 1.3287035100196631e-05, + "loss": 0.5219, "step": 8024 }, { - "epoch": 2.41, - "grad_norm": 18.90182876586914, - "learning_rate": 3.91500451037386e-06, - "loss": 1.4962, + "epoch": 1.01, + "grad_norm": 5.90783166885376, + "learning_rate": 1.3286198385139941e-05, + "loss": 0.7428, "step": 8025 }, { - "epoch": 2.41, - "grad_norm": 19.353958129882812, - "learning_rate": 3.91299989976947e-06, - "loss": 1.0571, + "epoch": 1.01, + "grad_norm": 11.44725513458252, + "learning_rate": 1.3285361670083255e-05, + "loss": 1.4417, "step": 8026 }, { - "epoch": 2.41, - "grad_norm": 31.27533531188965, - "learning_rate": 3.91099528916508e-06, - "loss": 1.6633, + "epoch": 1.01, + "grad_norm": 3.191082000732422, + "learning_rate": 1.3284524955026567e-05, + "loss": 0.0848, "step": 8027 }, { - "epoch": 2.41, - "grad_norm": 15.394329071044922, - "learning_rate": 3.9089906785606905e-06, - "loss": 1.4122, + "epoch": 1.01, + "grad_norm": 30.03740119934082, + "learning_rate": 1.3283688239969879e-05, + "loss": 1.6489, "step": 8028 }, { - "epoch": 2.41, - "grad_norm": 77.46128845214844, - "learning_rate": 3.9069860679563e-06, - "loss": 1.7462, + "epoch": 1.01, + "grad_norm": 8.293747901916504, + "learning_rate": 1.3282851524913192e-05, + "loss": 0.3589, "step": 8029 }, { - "epoch": 2.41, - "grad_norm": 33.42092514038086, - "learning_rate": 3.90498145735191e-06, - "loss": 1.3732, + "epoch": 1.01, + "grad_norm": 5.748569965362549, + "learning_rate": 1.3282014809856504e-05, + "loss": 0.539, "step": 8030 }, { - "epoch": 2.41, - "grad_norm": 9.593501091003418, - "learning_rate": 3.90297684674752e-06, - "loss": 0.9003, + "epoch": 1.01, + "grad_norm": 15.555586814880371, + "learning_rate": 1.3281178094799816e-05, + "loss": 0.4044, "step": 8031 }, { - "epoch": 2.41, - "grad_norm": 16.690101623535156, - "learning_rate": 3.900972236143129e-06, - "loss": 1.6157, + "epoch": 1.01, + "grad_norm": 32.48698425292969, + "learning_rate": 1.3280341379743128e-05, + "loss": 0.5353, "step": 8032 }, { - "epoch": 2.42, - "grad_norm": 14.943684577941895, - "learning_rate": 3.898967625538739e-06, - "loss": 1.5163, + "epoch": 1.01, + "grad_norm": 13.530704498291016, + "learning_rate": 1.3279504664686442e-05, + "loss": 1.6522, "step": 8033 }, { - "epoch": 2.42, - "grad_norm": 59.907310485839844, - "learning_rate": 3.896963014934349e-06, - "loss": 1.6031, + "epoch": 1.01, + "grad_norm": 13.967482566833496, + "learning_rate": 1.3278667949629756e-05, + "loss": 0.6138, "step": 8034 }, { - "epoch": 2.42, - "grad_norm": 32.62445068359375, - "learning_rate": 3.8949584043299594e-06, - "loss": 1.2945, + "epoch": 1.01, + "grad_norm": 8.310638427734375, + "learning_rate": 1.3277831234573066e-05, + "loss": 1.147, "step": 8035 }, { - "epoch": 2.42, - "grad_norm": 92.56492614746094, - "learning_rate": 3.892953793725569e-06, - "loss": 1.5538, + "epoch": 1.01, + "grad_norm": 22.596485137939453, + "learning_rate": 1.327699451951638e-05, + "loss": 1.4125, "step": 8036 }, { - "epoch": 2.42, - "grad_norm": 11.2645902633667, - "learning_rate": 3.890949183121179e-06, - "loss": 1.8437, + "epoch": 1.01, + "grad_norm": 6.3463873863220215, + "learning_rate": 1.3276157804459693e-05, + "loss": 1.7541, "step": 8037 }, { - "epoch": 2.42, - "grad_norm": 13.851442337036133, - "learning_rate": 3.888944572516789e-06, - "loss": 0.9473, + "epoch": 1.01, + "grad_norm": 12.85508918762207, + "learning_rate": 1.3275321089403003e-05, + "loss": 1.5066, "step": 8038 }, { - "epoch": 2.42, - "grad_norm": 17.675752639770508, - "learning_rate": 3.886939961912399e-06, - "loss": 1.4754, + "epoch": 1.01, + "grad_norm": 14.467947006225586, + "learning_rate": 1.3274484374346317e-05, + "loss": 2.1135, "step": 8039 }, { - "epoch": 2.42, - "grad_norm": 14.534573554992676, - "learning_rate": 3.884935351308008e-06, - "loss": 1.2699, - "step": 8040 - }, - { - "epoch": 2.42, - "eval_loss": 0.16647757589817047, - "eval_runtime": 43.4766, - "eval_samples_per_second": 34.018, - "eval_steps_per_second": 34.018, + "epoch": 1.01, + "grad_norm": 8.939553260803223, + "learning_rate": 1.327364765928963e-05, + "loss": 0.5817, "step": 8040 }, { - "epoch": 2.42, - "grad_norm": 10.053756713867188, - "learning_rate": 3.882930740703619e-06, - "loss": 1.1651, + "epoch": 1.01, + "grad_norm": 4.461472511291504, + "learning_rate": 1.3272810944232943e-05, + "loss": 0.3033, "step": 8041 }, { - "epoch": 2.42, - "grad_norm": 15.086660385131836, - "learning_rate": 3.880926130099228e-06, - "loss": 1.2307, + "epoch": 1.01, + "grad_norm": 10.413497924804688, + "learning_rate": 1.3271974229176255e-05, + "loss": 1.1546, "step": 8042 }, { - "epoch": 2.42, - "grad_norm": 45.48273468017578, - "learning_rate": 3.878921519494838e-06, - "loss": 1.7186, + "epoch": 1.01, + "grad_norm": 42.64353942871094, + "learning_rate": 1.3271137514119568e-05, + "loss": 2.6524, "step": 8043 }, { - "epoch": 2.42, - "grad_norm": 34.40294647216797, - "learning_rate": 3.8769169088904485e-06, - "loss": 2.7345, + "epoch": 1.01, + "grad_norm": 16.8046817779541, + "learning_rate": 1.327030079906288e-05, + "loss": 0.566, "step": 8044 }, { - "epoch": 2.42, - "grad_norm": 14.807568550109863, - "learning_rate": 3.8749122982860586e-06, - "loss": 0.8994, + "epoch": 1.01, + "grad_norm": 27.37327003479004, + "learning_rate": 1.3269464084006192e-05, + "loss": 1.344, "step": 8045 }, { - "epoch": 2.42, - "grad_norm": 10.855109214782715, - "learning_rate": 3.872907687681668e-06, - "loss": 1.2174, + "epoch": 1.01, + "grad_norm": 13.533464431762695, + "learning_rate": 1.3268627368949504e-05, + "loss": 1.5702, "step": 8046 }, { - "epoch": 2.42, - "grad_norm": 36.307777404785156, - "learning_rate": 3.870903077077278e-06, - "loss": 2.1917, + "epoch": 1.01, + "grad_norm": 9.839691162109375, + "learning_rate": 1.3267790653892818e-05, + "loss": 0.8166, "step": 8047 }, { - "epoch": 2.42, - "grad_norm": 22.572141647338867, - "learning_rate": 3.868898466472888e-06, - "loss": 0.8395, + "epoch": 1.01, + "grad_norm": 17.93497085571289, + "learning_rate": 1.3266953938836131e-05, + "loss": 0.8191, "step": 8048 }, { - "epoch": 2.42, - "grad_norm": 17.936315536499023, - "learning_rate": 3.866893855868497e-06, - "loss": 2.4869, + "epoch": 1.01, + "grad_norm": 12.403947830200195, + "learning_rate": 1.3266117223779442e-05, + "loss": 2.1308, "step": 8049 }, { - "epoch": 2.42, - "grad_norm": 12.054944038391113, - "learning_rate": 3.864889245264108e-06, - "loss": 1.4992, + "epoch": 1.01, + "grad_norm": 11.181997299194336, + "learning_rate": 1.3265280508722755e-05, + "loss": 1.4637, "step": 8050 }, { - "epoch": 2.42, - "grad_norm": 10.359464645385742, - "learning_rate": 3.862884634659717e-06, - "loss": 1.1123, + "epoch": 1.01, + "grad_norm": 52.8959846496582, + "learning_rate": 1.3264443793666069e-05, + "loss": 4.37, "step": 8051 }, { - "epoch": 2.42, - "grad_norm": 21.69719886779785, - "learning_rate": 3.8608800240553275e-06, - "loss": 1.9015, + "epoch": 1.01, + "grad_norm": 15.241668701171875, + "learning_rate": 1.326360707860938e-05, + "loss": 1.8832, "step": 8052 }, { - "epoch": 2.42, - "grad_norm": 9.15548038482666, - "learning_rate": 3.8588754134509376e-06, - "loss": 1.1113, + "epoch": 1.01, + "grad_norm": 12.627178192138672, + "learning_rate": 1.3262770363552693e-05, + "loss": 0.7974, "step": 8053 }, { - "epoch": 2.42, - "grad_norm": 7.168182373046875, - "learning_rate": 3.856870802846548e-06, - "loss": 0.7675, + "epoch": 1.01, + "grad_norm": 33.42836380004883, + "learning_rate": 1.3261933648496007e-05, + "loss": 1.09, "step": 8054 }, { - "epoch": 2.42, - "grad_norm": 23.34624481201172, - "learning_rate": 3.854866192242157e-06, - "loss": 1.1852, + "epoch": 1.01, + "grad_norm": 32.938899993896484, + "learning_rate": 1.3261096933439318e-05, + "loss": 1.1665, "step": 8055 }, { - "epoch": 2.42, - "grad_norm": 8.906826972961426, - "learning_rate": 3.852861581637767e-06, - "loss": 0.7492, + "epoch": 1.01, + "grad_norm": 3.821079730987549, + "learning_rate": 1.326026021838263e-05, + "loss": 0.2259, "step": 8056 }, { - "epoch": 2.42, - "grad_norm": 17.47923469543457, - "learning_rate": 3.850856971033377e-06, - "loss": 1.1435, + "epoch": 1.01, + "grad_norm": 12.121703147888184, + "learning_rate": 1.3259423503325944e-05, + "loss": 1.0859, "step": 8057 }, { - "epoch": 2.42, - "grad_norm": 8.960250854492188, - "learning_rate": 3.848852360428987e-06, - "loss": 0.9197, + "epoch": 1.01, + "grad_norm": 16.58420753479004, + "learning_rate": 1.3258586788269256e-05, + "loss": 0.6811, "step": 8058 }, { - "epoch": 2.42, - "grad_norm": 17.509536743164062, - "learning_rate": 3.846847749824597e-06, - "loss": 1.5889, + "epoch": 1.01, + "grad_norm": 19.60185432434082, + "learning_rate": 1.3257750073212568e-05, + "loss": 1.2272, "step": 8059 }, { - "epoch": 2.42, - "grad_norm": 14.029973030090332, - "learning_rate": 3.8448431392202065e-06, - "loss": 1.5197, + "epoch": 1.01, + "grad_norm": 17.212610244750977, + "learning_rate": 1.325691335815588e-05, + "loss": 1.0265, "step": 8060 }, { - "epoch": 2.42, - "grad_norm": 13.85709285736084, - "learning_rate": 3.8428385286158165e-06, - "loss": 1.1813, + "epoch": 1.01, + "grad_norm": 26.601634979248047, + "learning_rate": 1.3256076643099194e-05, + "loss": 1.6516, "step": 8061 }, { - "epoch": 2.42, - "grad_norm": 11.46907901763916, - "learning_rate": 3.840833918011427e-06, - "loss": 1.1435, + "epoch": 1.01, + "grad_norm": 9.53249454498291, + "learning_rate": 1.3255239928042507e-05, + "loss": 0.9406, "step": 8062 }, { - "epoch": 2.42, - "grad_norm": 11.111536026000977, - "learning_rate": 3.838829307407037e-06, - "loss": 1.6378, + "epoch": 1.01, + "grad_norm": 19.139719009399414, + "learning_rate": 1.3254403212985817e-05, + "loss": 0.7048, "step": 8063 }, { - "epoch": 2.42, - "grad_norm": 13.92691421508789, - "learning_rate": 3.836824696802646e-06, - "loss": 1.0666, + "epoch": 1.01, + "grad_norm": 24.55410385131836, + "learning_rate": 1.3253566497929131e-05, + "loss": 2.0259, "step": 8064 }, { - "epoch": 2.42, - "grad_norm": 11.448002815246582, - "learning_rate": 3.834820086198257e-06, - "loss": 1.2318, + "epoch": 1.01, + "grad_norm": 7.9865803718566895, + "learning_rate": 1.3252729782872445e-05, + "loss": 0.8302, "step": 8065 }, { - "epoch": 2.43, - "grad_norm": 13.583879470825195, - "learning_rate": 3.832815475593866e-06, - "loss": 0.7262, + "epoch": 1.01, + "grad_norm": 12.298362731933594, + "learning_rate": 1.3251893067815755e-05, + "loss": 0.9395, "step": 8066 }, { - "epoch": 2.43, - "grad_norm": 17.306129455566406, - "learning_rate": 3.830810864989476e-06, - "loss": 1.5805, + "epoch": 1.01, + "grad_norm": 12.418360710144043, + "learning_rate": 1.3251056352759069e-05, + "loss": 3.1056, "step": 8067 }, { - "epoch": 2.43, - "grad_norm": 17.71708106994629, - "learning_rate": 3.828806254385086e-06, - "loss": 1.5627, + "epoch": 1.01, + "grad_norm": 16.77568244934082, + "learning_rate": 1.3250219637702382e-05, + "loss": 1.1351, "step": 8068 }, { - "epoch": 2.43, - "grad_norm": 13.716277122497559, - "learning_rate": 3.826801643780696e-06, - "loss": 1.223, + "epoch": 1.01, + "grad_norm": 20.997798919677734, + "learning_rate": 1.3249382922645694e-05, + "loss": 1.6183, "step": 8069 }, { - "epoch": 2.43, - "grad_norm": 9.647749900817871, - "learning_rate": 3.824797033176306e-06, - "loss": 2.1291, + "epoch": 1.01, + "grad_norm": 34.50931167602539, + "learning_rate": 1.3248546207589006e-05, + "loss": 2.1002, "step": 8070 }, { - "epoch": 2.43, - "grad_norm": 15.30841064453125, - "learning_rate": 3.822792422571916e-06, - "loss": 1.1331, + "epoch": 1.01, + "grad_norm": 24.325166702270508, + "learning_rate": 1.324770949253232e-05, + "loss": 1.509, "step": 8071 }, { - "epoch": 2.43, - "grad_norm": 18.855783462524414, - "learning_rate": 3.820787811967526e-06, - "loss": 1.7297, + "epoch": 1.01, + "grad_norm": 11.502999305725098, + "learning_rate": 1.3246872777475632e-05, + "loss": 1.1475, "step": 8072 }, { - "epoch": 2.43, - "grad_norm": 35.978248596191406, - "learning_rate": 3.818783201363135e-06, - "loss": 0.7591, + "epoch": 1.01, + "grad_norm": 6.817028045654297, + "learning_rate": 1.3246036062418944e-05, + "loss": 0.7322, "step": 8073 }, { - "epoch": 2.43, - "grad_norm": 19.79545021057129, - "learning_rate": 3.816778590758746e-06, - "loss": 1.878, + "epoch": 1.01, + "grad_norm": 23.855792999267578, + "learning_rate": 1.3245199347362256e-05, + "loss": 1.0183, "step": 8074 }, { - "epoch": 2.43, - "grad_norm": 33.37739562988281, - "learning_rate": 3.814773980154355e-06, - "loss": 1.5872, + "epoch": 1.01, + "grad_norm": 36.64528274536133, + "learning_rate": 1.324436263230557e-05, + "loss": 2.4726, "step": 8075 }, { - "epoch": 2.43, - "grad_norm": 19.29977035522461, - "learning_rate": 3.8127693695499653e-06, - "loss": 0.9943, + "epoch": 1.01, + "grad_norm": 15.954262733459473, + "learning_rate": 1.3243525917248883e-05, + "loss": 0.7668, "step": 8076 }, { - "epoch": 2.43, - "grad_norm": 22.01044273376465, - "learning_rate": 3.8107647589455753e-06, - "loss": 1.9346, + "epoch": 1.01, + "grad_norm": 19.742395401000977, + "learning_rate": 1.3242689202192193e-05, + "loss": 1.0538, "step": 8077 }, { - "epoch": 2.43, - "grad_norm": 24.812864303588867, - "learning_rate": 3.808760148341185e-06, - "loss": 1.4286, + "epoch": 1.01, + "grad_norm": 11.046028137207031, + "learning_rate": 1.3241852487135507e-05, + "loss": 1.2064, "step": 8078 }, { - "epoch": 2.43, - "grad_norm": 16.758087158203125, - "learning_rate": 3.8067555377367947e-06, - "loss": 0.6716, + "epoch": 1.01, + "grad_norm": 16.42095947265625, + "learning_rate": 1.324101577207882e-05, + "loss": 1.3735, "step": 8079 }, { - "epoch": 2.43, - "grad_norm": 10.366033554077148, - "learning_rate": 3.8047509271324047e-06, - "loss": 1.3467, + "epoch": 1.01, + "grad_norm": 8.204858779907227, + "learning_rate": 1.3240179057022131e-05, + "loss": 0.8107, "step": 8080 }, { - "epoch": 2.43, - "grad_norm": 20.55314064025879, - "learning_rate": 3.802746316528015e-06, - "loss": 1.4071, + "epoch": 1.01, + "grad_norm": 19.455652236938477, + "learning_rate": 1.3239342341965445e-05, + "loss": 2.0798, "step": 8081 }, { - "epoch": 2.43, - "grad_norm": 31.28022003173828, - "learning_rate": 3.8007417059236245e-06, - "loss": 1.7199, + "epoch": 1.01, + "grad_norm": 8.957785606384277, + "learning_rate": 1.3238505626908758e-05, + "loss": 0.5589, "step": 8082 }, { - "epoch": 2.43, - "grad_norm": 6.907041072845459, - "learning_rate": 3.798737095319234e-06, - "loss": 0.3853, + "epoch": 1.01, + "grad_norm": 17.459102630615234, + "learning_rate": 1.323766891185207e-05, + "loss": 1.6631, "step": 8083 }, { - "epoch": 2.43, - "grad_norm": 24.64528465270996, - "learning_rate": 3.7967324847148447e-06, - "loss": 1.4183, + "epoch": 1.01, + "grad_norm": 19.128694534301758, + "learning_rate": 1.3236832196795382e-05, + "loss": 0.8569, "step": 8084 }, { - "epoch": 2.43, - "grad_norm": 10.540509223937988, - "learning_rate": 3.7947278741104543e-06, - "loss": 1.0129, + "epoch": 1.01, + "grad_norm": 3.326540470123291, + "learning_rate": 1.3235995481738694e-05, + "loss": 0.1619, "step": 8085 }, { - "epoch": 2.43, - "grad_norm": 13.344951629638672, - "learning_rate": 3.792723263506064e-06, - "loss": 0.989, + "epoch": 1.01, + "grad_norm": 16.52320098876953, + "learning_rate": 1.3235158766682008e-05, + "loss": 1.2735, "step": 8086 }, { - "epoch": 2.43, - "grad_norm": 8.0833158493042, - "learning_rate": 3.7907186529016745e-06, - "loss": 0.8597, + "epoch": 1.01, + "grad_norm": 11.69139289855957, + "learning_rate": 1.323432205162532e-05, + "loss": 0.8281, "step": 8087 }, { - "epoch": 2.43, - "grad_norm": 19.457054138183594, - "learning_rate": 3.788714042297284e-06, - "loss": 1.3676, + "epoch": 1.02, + "grad_norm": 13.96041202545166, + "learning_rate": 1.3233485336568632e-05, + "loss": 1.0855, "step": 8088 }, { - "epoch": 2.43, - "grad_norm": 16.89392852783203, - "learning_rate": 3.786709431692894e-06, - "loss": 1.1124, + "epoch": 1.02, + "grad_norm": 30.238624572753906, + "learning_rate": 1.3232648621511945e-05, + "loss": 2.0699, "step": 8089 }, { - "epoch": 2.43, - "grad_norm": 9.098607063293457, - "learning_rate": 3.784704821088504e-06, - "loss": 0.5794, + "epoch": 1.02, + "grad_norm": 11.058302879333496, + "learning_rate": 1.3231811906455259e-05, + "loss": 2.3913, "step": 8090 }, { - "epoch": 2.43, - "grad_norm": 41.21145248413086, - "learning_rate": 3.7827002104841136e-06, - "loss": 1.9724, + "epoch": 1.02, + "grad_norm": 7.368780136108398, + "learning_rate": 1.323097519139857e-05, + "loss": 1.1791, "step": 8091 }, { - "epoch": 2.43, - "grad_norm": 5.302923679351807, - "learning_rate": 3.7806955998797236e-06, - "loss": 0.579, + "epoch": 1.02, + "grad_norm": 15.291146278381348, + "learning_rate": 1.3230138476341883e-05, + "loss": 1.3794, "step": 8092 }, { - "epoch": 2.43, - "grad_norm": 14.981736183166504, - "learning_rate": 3.7786909892753337e-06, - "loss": 2.1375, + "epoch": 1.02, + "grad_norm": 47.3812141418457, + "learning_rate": 1.3229301761285196e-05, + "loss": 1.7557, "step": 8093 }, { - "epoch": 2.43, - "grad_norm": 20.563865661621094, - "learning_rate": 3.7766863786709434e-06, - "loss": 1.137, + "epoch": 1.02, + "grad_norm": 24.50340461730957, + "learning_rate": 1.3228465046228507e-05, + "loss": 3.2379, "step": 8094 }, { - "epoch": 2.43, - "grad_norm": 10.908076286315918, - "learning_rate": 3.774681768066553e-06, - "loss": 1.2199, + "epoch": 1.02, + "grad_norm": 14.086163520812988, + "learning_rate": 1.322762833117182e-05, + "loss": 0.9016, "step": 8095 }, { - "epoch": 2.43, - "grad_norm": 79.2155532836914, - "learning_rate": 3.7726771574621636e-06, - "loss": 2.0151, + "epoch": 1.02, + "grad_norm": 14.826435089111328, + "learning_rate": 1.3226791616115134e-05, + "loss": 2.2264, "step": 8096 }, { - "epoch": 2.43, - "grad_norm": 187.99632263183594, - "learning_rate": 3.7706725468577732e-06, - "loss": 2.0555, + "epoch": 1.02, + "grad_norm": 16.74859619140625, + "learning_rate": 1.3225954901058446e-05, + "loss": 1.3957, "step": 8097 }, { - "epoch": 2.43, - "grad_norm": 38.596275329589844, - "learning_rate": 3.768667936253383e-06, - "loss": 2.0002, + "epoch": 1.02, + "grad_norm": 12.007782936096191, + "learning_rate": 1.3225118186001758e-05, + "loss": 1.0195, "step": 8098 }, { - "epoch": 2.44, - "grad_norm": 8.95095157623291, - "learning_rate": 3.766663325648993e-06, - "loss": 1.0331, + "epoch": 1.02, + "grad_norm": 34.34865188598633, + "learning_rate": 1.322428147094507e-05, + "loss": 2.1014, "step": 8099 }, { - "epoch": 2.44, - "grad_norm": 17.133882522583008, - "learning_rate": 3.764658715044603e-06, - "loss": 1.5345, + "epoch": 1.02, + "grad_norm": 10.553949356079102, + "learning_rate": 1.3223444755888384e-05, + "loss": 1.3867, "step": 8100 }, { - "epoch": 2.44, - "grad_norm": 8.687376976013184, - "learning_rate": 3.7626541044402127e-06, - "loss": 0.7787, + "epoch": 1.02, + "grad_norm": 7.074171543121338, + "learning_rate": 1.3222608040831695e-05, + "loss": 1.277, "step": 8101 }, { - "epoch": 2.44, - "grad_norm": 69.27308654785156, - "learning_rate": 3.760649493835823e-06, - "loss": 1.252, + "epoch": 1.02, + "grad_norm": 4.532754898071289, + "learning_rate": 1.3221771325775007e-05, + "loss": 0.3835, "step": 8102 }, { - "epoch": 2.44, - "grad_norm": 23.108619689941406, - "learning_rate": 3.7586448832314325e-06, - "loss": 1.3771, + "epoch": 1.02, + "grad_norm": 9.71251106262207, + "learning_rate": 1.3220934610718321e-05, + "loss": 2.2243, "step": 8103 }, { - "epoch": 2.44, - "grad_norm": 33.36818313598633, - "learning_rate": 3.7566402726270425e-06, - "loss": 1.6731, + "epoch": 1.02, + "grad_norm": 12.226500511169434, + "learning_rate": 1.3220097895661635e-05, + "loss": 0.8613, "step": 8104 }, { - "epoch": 2.44, - "grad_norm": 13.890486717224121, - "learning_rate": 3.7546356620226526e-06, - "loss": 1.39, + "epoch": 1.02, + "grad_norm": 12.565596580505371, + "learning_rate": 1.3219261180604945e-05, + "loss": 1.8522, "step": 8105 }, { - "epoch": 2.44, - "grad_norm": 9.70341682434082, - "learning_rate": 3.7526310514182623e-06, - "loss": 0.7873, + "epoch": 1.02, + "grad_norm": 17.8659725189209, + "learning_rate": 1.3218424465548259e-05, + "loss": 0.466, "step": 8106 }, { - "epoch": 2.44, - "grad_norm": 15.94449234008789, - "learning_rate": 3.750626440813872e-06, - "loss": 0.9446, + "epoch": 1.02, + "grad_norm": 22.846317291259766, + "learning_rate": 1.3217587750491572e-05, + "loss": 1.5893, "step": 8107 }, { - "epoch": 2.44, - "grad_norm": 19.849061965942383, - "learning_rate": 3.7486218302094825e-06, - "loss": 1.6883, + "epoch": 1.02, + "grad_norm": 15.297237396240234, + "learning_rate": 1.3216751035434883e-05, + "loss": 0.8397, "step": 8108 }, { - "epoch": 2.44, - "grad_norm": 19.96930503845215, - "learning_rate": 3.746617219605092e-06, - "loss": 1.2279, + "epoch": 1.02, + "grad_norm": 12.62805461883545, + "learning_rate": 1.3215914320378196e-05, + "loss": 1.6367, "step": 8109 }, { - "epoch": 2.44, - "grad_norm": 12.898889541625977, - "learning_rate": 3.7446126090007018e-06, - "loss": 1.6165, + "epoch": 1.02, + "grad_norm": 10.7589111328125, + "learning_rate": 1.321507760532151e-05, + "loss": 0.7109, "step": 8110 }, { - "epoch": 2.44, - "grad_norm": 12.426019668579102, - "learning_rate": 3.742607998396312e-06, - "loss": 1.4589, + "epoch": 1.02, + "grad_norm": 12.487503051757812, + "learning_rate": 1.3214240890264822e-05, + "loss": 0.972, "step": 8111 }, { - "epoch": 2.44, - "grad_norm": 20.18886375427246, - "learning_rate": 3.740603387791922e-06, - "loss": 1.7677, + "epoch": 1.02, + "grad_norm": 13.478726387023926, + "learning_rate": 1.3213404175208134e-05, + "loss": 0.8637, "step": 8112 }, { - "epoch": 2.44, - "grad_norm": 19.88523292541504, - "learning_rate": 3.7385987771875316e-06, - "loss": 2.5334, + "epoch": 1.02, + "grad_norm": 16.753326416015625, + "learning_rate": 1.3212567460151446e-05, + "loss": 0.9855, "step": 8113 }, { - "epoch": 2.44, - "grad_norm": 39.80482482910156, - "learning_rate": 3.7365941665831417e-06, - "loss": 1.2999, + "epoch": 1.02, + "grad_norm": 11.019522666931152, + "learning_rate": 1.321173074509476e-05, + "loss": 1.2104, "step": 8114 }, { - "epoch": 2.44, - "grad_norm": 20.459688186645508, - "learning_rate": 3.7345895559787513e-06, - "loss": 1.1679, + "epoch": 1.02, + "grad_norm": 9.230637550354004, + "learning_rate": 1.3210894030038071e-05, + "loss": 0.93, "step": 8115 }, { - "epoch": 2.44, - "grad_norm": 6.700068950653076, - "learning_rate": 3.7325849453743614e-06, - "loss": 0.7833, + "epoch": 1.02, + "grad_norm": 10.129105567932129, + "learning_rate": 1.3210057314981383e-05, + "loss": 1.3537, "step": 8116 }, { - "epoch": 2.44, - "grad_norm": 10.873579978942871, - "learning_rate": 3.7305803347699715e-06, - "loss": 1.1187, + "epoch": 1.02, + "grad_norm": 12.891251564025879, + "learning_rate": 1.3209220599924697e-05, + "loss": 0.541, "step": 8117 }, { - "epoch": 2.44, - "grad_norm": 11.668147087097168, - "learning_rate": 3.728575724165581e-06, - "loss": 1.436, + "epoch": 1.02, + "grad_norm": 12.813945770263672, + "learning_rate": 1.320838388486801e-05, + "loss": 1.5574, "step": 8118 }, { - "epoch": 2.44, - "grad_norm": 24.042890548706055, - "learning_rate": 3.726571113561191e-06, - "loss": 1.6814, + "epoch": 1.02, + "grad_norm": 23.467496871948242, + "learning_rate": 1.320754716981132e-05, + "loss": 2.6069, "step": 8119 }, { - "epoch": 2.44, - "grad_norm": 7.250650882720947, - "learning_rate": 3.7245665029568013e-06, - "loss": 1.062, + "epoch": 1.02, + "grad_norm": 11.790117263793945, + "learning_rate": 1.3206710454754634e-05, + "loss": 1.5657, "step": 8120 }, { - "epoch": 2.44, - "grad_norm": 8.547741889953613, - "learning_rate": 3.722561892352411e-06, - "loss": 1.0638, + "epoch": 1.02, + "grad_norm": 5.574734210968018, + "learning_rate": 1.3205873739697948e-05, + "loss": 1.0195, "step": 8121 }, { - "epoch": 2.44, - "grad_norm": 9.865437507629395, - "learning_rate": 3.7205572817480207e-06, - "loss": 0.8298, + "epoch": 1.02, + "grad_norm": 12.647422790527344, + "learning_rate": 1.3205037024641258e-05, + "loss": 0.7309, "step": 8122 }, { - "epoch": 2.44, - "grad_norm": 11.00238037109375, - "learning_rate": 3.7185526711436303e-06, - "loss": 0.7298, + "epoch": 1.02, + "grad_norm": 13.510000228881836, + "learning_rate": 1.3204200309584572e-05, + "loss": 1.8041, "step": 8123 }, { - "epoch": 2.44, - "grad_norm": 10.733601570129395, - "learning_rate": 3.716548060539241e-06, - "loss": 0.9721, + "epoch": 1.02, + "grad_norm": 34.436222076416016, + "learning_rate": 1.3203363594527884e-05, + "loss": 1.7884, "step": 8124 }, { - "epoch": 2.44, - "grad_norm": 32.854774475097656, - "learning_rate": 3.7145434499348505e-06, - "loss": 1.4885, + "epoch": 1.02, + "grad_norm": 39.42829895019531, + "learning_rate": 1.3202526879471198e-05, + "loss": 3.7134, "step": 8125 }, { - "epoch": 2.44, - "grad_norm": 11.409563064575195, - "learning_rate": 3.71253883933046e-06, - "loss": 0.8726, + "epoch": 1.02, + "grad_norm": 7.951882839202881, + "learning_rate": 1.320169016441451e-05, + "loss": 0.605, "step": 8126 }, { - "epoch": 2.44, - "grad_norm": 37.438106536865234, - "learning_rate": 3.7105342287260702e-06, - "loss": 1.3614, + "epoch": 1.02, + "grad_norm": 5.677186965942383, + "learning_rate": 1.3200853449357822e-05, + "loss": 0.1397, "step": 8127 }, { - "epoch": 2.44, - "grad_norm": 28.72161865234375, - "learning_rate": 3.70852961812168e-06, - "loss": 1.0996, + "epoch": 1.02, + "grad_norm": 23.42045021057129, + "learning_rate": 1.3200016734301135e-05, + "loss": 1.3299, "step": 8128 }, { - "epoch": 2.44, - "grad_norm": 10.785248756408691, - "learning_rate": 3.70652500751729e-06, - "loss": 0.7049, + "epoch": 1.02, + "grad_norm": 89.72753143310547, + "learning_rate": 1.3199180019244447e-05, + "loss": 0.9728, "step": 8129 }, { - "epoch": 2.44, - "grad_norm": 9.802728652954102, - "learning_rate": 3.7045203969129e-06, - "loss": 1.0367, + "epoch": 1.02, + "grad_norm": 18.427221298217773, + "learning_rate": 1.3198343304187759e-05, + "loss": 1.103, "step": 8130 }, { - "epoch": 2.44, - "grad_norm": 14.082221984863281, - "learning_rate": 3.7025157863085097e-06, - "loss": 1.0125, + "epoch": 1.02, + "grad_norm": 19.781017303466797, + "learning_rate": 1.3197506589131073e-05, + "loss": 2.0529, "step": 8131 }, { - "epoch": 2.44, - "grad_norm": 63.74949645996094, - "learning_rate": 3.7005111757041194e-06, - "loss": 2.3996, + "epoch": 1.02, + "grad_norm": 15.670022964477539, + "learning_rate": 1.3196669874074386e-05, + "loss": 0.8893, "step": 8132 }, { - "epoch": 2.45, - "grad_norm": 9.598031044006348, - "learning_rate": 3.69850656509973e-06, - "loss": 1.0888, + "epoch": 1.02, + "grad_norm": 57.735897064208984, + "learning_rate": 1.3195833159017697e-05, + "loss": 1.2899, "step": 8133 }, { - "epoch": 2.45, - "grad_norm": 15.858400344848633, - "learning_rate": 3.6965019544953396e-06, - "loss": 1.055, + "epoch": 1.02, + "grad_norm": 16.303756713867188, + "learning_rate": 1.319499644396101e-05, + "loss": 1.8557, "step": 8134 }, { - "epoch": 2.45, - "grad_norm": 10.83476734161377, - "learning_rate": 3.6944973438909492e-06, - "loss": 1.9326, + "epoch": 1.02, + "grad_norm": 12.056116104125977, + "learning_rate": 1.3194159728904324e-05, + "loss": 0.8946, "step": 8135 }, { - "epoch": 2.45, - "grad_norm": 10.07654857635498, - "learning_rate": 3.6924927332865597e-06, - "loss": 1.5472, + "epoch": 1.02, + "grad_norm": 36.674530029296875, + "learning_rate": 1.3193323013847634e-05, + "loss": 1.7452, "step": 8136 }, { - "epoch": 2.45, - "grad_norm": 16.537139892578125, - "learning_rate": 3.6904881226821694e-06, - "loss": 1.2108, + "epoch": 1.02, + "grad_norm": 8.627217292785645, + "learning_rate": 1.3192486298790948e-05, + "loss": 0.6619, "step": 8137 }, { - "epoch": 2.45, - "grad_norm": 15.194998741149902, - "learning_rate": 3.688483512077779e-06, - "loss": 1.0623, + "epoch": 1.02, + "grad_norm": 14.593555450439453, + "learning_rate": 1.319164958373426e-05, + "loss": 1.1673, "step": 8138 }, { - "epoch": 2.45, - "grad_norm": 43.36076736450195, - "learning_rate": 3.686478901473389e-06, - "loss": 1.2945, + "epoch": 1.02, + "grad_norm": 4.4681549072265625, + "learning_rate": 1.3190812868677573e-05, + "loss": 2.2662, "step": 8139 }, { - "epoch": 2.45, - "grad_norm": 9.480472564697266, - "learning_rate": 3.684474290868999e-06, - "loss": 0.5474, + "epoch": 1.02, + "grad_norm": 14.621169090270996, + "learning_rate": 1.3189976153620885e-05, + "loss": 1.2689, "step": 8140 }, { - "epoch": 2.45, - "grad_norm": 36.88868713378906, - "learning_rate": 3.682469680264609e-06, - "loss": 1.0721, + "epoch": 1.02, + "grad_norm": 9.740835189819336, + "learning_rate": 1.3189139438564197e-05, + "loss": 1.1295, "step": 8141 }, { - "epoch": 2.45, - "grad_norm": 42.60389709472656, - "learning_rate": 3.680465069660219e-06, - "loss": 1.4549, + "epoch": 1.02, + "grad_norm": 21.940420150756836, + "learning_rate": 1.3188302723507511e-05, + "loss": 0.8043, "step": 8142 }, { - "epoch": 2.45, - "grad_norm": 22.67746353149414, - "learning_rate": 3.6784604590558286e-06, - "loss": 2.1135, + "epoch": 1.02, + "grad_norm": 12.638836860656738, + "learning_rate": 1.3187466008450821e-05, + "loss": 1.6467, "step": 8143 }, { - "epoch": 2.45, - "grad_norm": 8.67175006866455, - "learning_rate": 3.6764558484514383e-06, - "loss": 1.0506, + "epoch": 1.02, + "grad_norm": 13.792120933532715, + "learning_rate": 1.3186629293394135e-05, + "loss": 2.3832, "step": 8144 }, { - "epoch": 2.45, - "grad_norm": 18.72547149658203, - "learning_rate": 3.674451237847049e-06, - "loss": 0.891, + "epoch": 1.02, + "grad_norm": 18.201194763183594, + "learning_rate": 1.3185792578337449e-05, + "loss": 0.1339, "step": 8145 }, { - "epoch": 2.45, - "grad_norm": 32.578861236572266, - "learning_rate": 3.6724466272426585e-06, - "loss": 1.6625, + "epoch": 1.02, + "grad_norm": 24.331432342529297, + "learning_rate": 1.3184955863280762e-05, + "loss": 1.2141, "step": 8146 }, { - "epoch": 2.45, - "grad_norm": 9.158719062805176, - "learning_rate": 3.670442016638268e-06, - "loss": 0.8982, + "epoch": 1.02, + "grad_norm": 16.624353408813477, + "learning_rate": 1.3184119148224073e-05, + "loss": 0.5703, "step": 8147 }, { - "epoch": 2.45, - "grad_norm": 15.746097564697266, - "learning_rate": 3.6684374060338786e-06, - "loss": 1.349, + "epoch": 1.02, + "grad_norm": 8.646965980529785, + "learning_rate": 1.3183282433167386e-05, + "loss": 0.8436, "step": 8148 }, { - "epoch": 2.45, - "grad_norm": 12.427379608154297, - "learning_rate": 3.6664327954294883e-06, - "loss": 1.283, + "epoch": 1.02, + "grad_norm": 9.734932899475098, + "learning_rate": 1.31824457181107e-05, + "loss": 1.2568, "step": 8149 }, { - "epoch": 2.45, - "grad_norm": 30.601055145263672, - "learning_rate": 3.664428184825098e-06, - "loss": 0.9753, + "epoch": 1.02, + "grad_norm": 26.99791717529297, + "learning_rate": 1.318160900305401e-05, + "loss": 1.2669, "step": 8150 }, { - "epoch": 2.45, - "grad_norm": 15.714460372924805, - "learning_rate": 3.662423574220708e-06, - "loss": 1.1643, + "epoch": 1.02, + "grad_norm": 5.206856727600098, + "learning_rate": 1.3180772287997324e-05, + "loss": 1.4424, "step": 8151 }, { - "epoch": 2.45, - "grad_norm": 27.35734748840332, - "learning_rate": 3.6604189636163177e-06, - "loss": 1.7191, + "epoch": 1.02, + "grad_norm": 9.2411527633667, + "learning_rate": 1.3179935572940636e-05, + "loss": 0.7221, "step": 8152 }, { - "epoch": 2.45, - "grad_norm": 16.40424919128418, - "learning_rate": 3.6584143530119278e-06, - "loss": 1.4443, + "epoch": 1.02, + "grad_norm": 28.84791374206543, + "learning_rate": 1.317909885788395e-05, + "loss": 1.5638, "step": 8153 }, { - "epoch": 2.45, - "grad_norm": 9.086562156677246, - "learning_rate": 3.656409742407538e-06, - "loss": 1.0836, + "epoch": 1.02, + "grad_norm": 8.334434509277344, + "learning_rate": 1.3178262142827261e-05, + "loss": 0.9285, "step": 8154 }, { - "epoch": 2.45, - "grad_norm": 16.159156799316406, - "learning_rate": 3.6544051318031475e-06, - "loss": 1.0811, + "epoch": 1.02, + "grad_norm": 6.804058074951172, + "learning_rate": 1.3177425427770573e-05, + "loss": 1.3012, "step": 8155 }, { - "epoch": 2.45, - "grad_norm": 32.5626220703125, - "learning_rate": 3.652400521198757e-06, - "loss": 1.5753, + "epoch": 1.02, + "grad_norm": 25.955961227416992, + "learning_rate": 1.3176588712713887e-05, + "loss": 1.9186, "step": 8156 }, { - "epoch": 2.45, - "grad_norm": 10.6674165725708, - "learning_rate": 3.6503959105943677e-06, - "loss": 0.9974, + "epoch": 1.02, + "grad_norm": 15.244351387023926, + "learning_rate": 1.3175751997657197e-05, + "loss": 1.8402, "step": 8157 }, { - "epoch": 2.45, - "grad_norm": 11.11431884765625, - "learning_rate": 3.6483912999899773e-06, - "loss": 1.3574, + "epoch": 1.02, + "grad_norm": 13.831007957458496, + "learning_rate": 1.317491528260051e-05, + "loss": 2.3185, "step": 8158 }, { - "epoch": 2.45, - "grad_norm": 24.535797119140625, - "learning_rate": 3.646386689385587e-06, - "loss": 1.4364, + "epoch": 1.02, + "grad_norm": 25.491474151611328, + "learning_rate": 1.3174078567543824e-05, + "loss": 1.2211, "step": 8159 }, { - "epoch": 2.45, - "grad_norm": 49.98130416870117, - "learning_rate": 3.6443820787811975e-06, - "loss": 1.0015, - "step": 8160 - }, - { - "epoch": 2.45, - "eval_loss": 0.16650927066802979, - "eval_runtime": 44.1127, - "eval_samples_per_second": 33.528, - "eval_steps_per_second": 33.528, + "epoch": 1.02, + "grad_norm": 14.899972915649414, + "learning_rate": 1.3173241852487138e-05, + "loss": 0.8519, "step": 8160 }, { - "epoch": 2.45, - "grad_norm": 9.75783634185791, - "learning_rate": 3.642377468176807e-06, - "loss": 1.1418, + "epoch": 1.02, + "grad_norm": 19.21306610107422, + "learning_rate": 1.3172405137430448e-05, + "loss": 1.4239, "step": 8161 }, { - "epoch": 2.45, - "grad_norm": 15.587701797485352, - "learning_rate": 3.640372857572417e-06, - "loss": 0.7225, + "epoch": 1.02, + "grad_norm": 15.32010555267334, + "learning_rate": 1.3171568422373762e-05, + "loss": 1.9368, "step": 8162 }, { - "epoch": 2.45, - "grad_norm": 17.10492706298828, - "learning_rate": 3.6383682469680265e-06, - "loss": 1.1976, + "epoch": 1.02, + "grad_norm": 35.73579788208008, + "learning_rate": 1.3170731707317076e-05, + "loss": 3.0578, "step": 8163 }, { - "epoch": 2.45, - "grad_norm": 23.952566146850586, - "learning_rate": 3.6363636363636366e-06, - "loss": 1.2036, + "epoch": 1.02, + "grad_norm": 9.026825904846191, + "learning_rate": 1.3169894992260386e-05, + "loss": 0.4943, "step": 8164 }, { - "epoch": 2.45, - "grad_norm": 17.270137786865234, - "learning_rate": 3.6343590257592467e-06, - "loss": 1.6207, + "epoch": 1.02, + "grad_norm": 17.9897518157959, + "learning_rate": 1.31690582772037e-05, + "loss": 2.4651, "step": 8165 }, { - "epoch": 2.46, - "grad_norm": 16.140460968017578, - "learning_rate": 3.6323544151548563e-06, - "loss": 0.9803, + "epoch": 1.02, + "grad_norm": 9.883686065673828, + "learning_rate": 1.3168221562147012e-05, + "loss": 1.4876, "step": 8166 }, { - "epoch": 2.46, - "grad_norm": 42.045997619628906, - "learning_rate": 3.6303498045504664e-06, - "loss": 2.4721, + "epoch": 1.02, + "grad_norm": 17.007078170776367, + "learning_rate": 1.3167384847090325e-05, + "loss": 1.8312, "step": 8167 }, { - "epoch": 2.46, - "grad_norm": 13.348779678344727, - "learning_rate": 3.628345193946076e-06, - "loss": 1.1113, + "epoch": 1.03, + "grad_norm": 26.36995506286621, + "learning_rate": 1.3166548132033637e-05, + "loss": 1.2385, "step": 8168 }, { - "epoch": 2.46, - "grad_norm": 36.904014587402344, - "learning_rate": 3.6263405833416857e-06, - "loss": 2.5081, + "epoch": 1.03, + "grad_norm": 10.366000175476074, + "learning_rate": 1.3165711416976949e-05, + "loss": 1.5701, "step": 8169 }, { - "epoch": 2.46, - "grad_norm": 23.20426368713379, - "learning_rate": 3.6243359727372962e-06, - "loss": 1.0172, + "epoch": 1.03, + "grad_norm": 5.277139663696289, + "learning_rate": 1.3164874701920263e-05, + "loss": 0.8169, "step": 8170 }, { - "epoch": 2.46, - "grad_norm": 16.895856857299805, - "learning_rate": 3.622331362132906e-06, - "loss": 1.0231, + "epoch": 1.03, + "grad_norm": 59.65888977050781, + "learning_rate": 1.3164037986863573e-05, + "loss": 2.6797, "step": 8171 }, { - "epoch": 2.46, - "grad_norm": 37.31813049316406, - "learning_rate": 3.6203267515285156e-06, - "loss": 1.3518, + "epoch": 1.03, + "grad_norm": 4.414580821990967, + "learning_rate": 1.3163201271806887e-05, + "loss": 0.1261, "step": 8172 }, { - "epoch": 2.46, - "grad_norm": 19.724748611450195, - "learning_rate": 3.618322140924126e-06, - "loss": 1.2338, + "epoch": 1.03, + "grad_norm": 55.824485778808594, + "learning_rate": 1.31623645567502e-05, + "loss": 1.2241, "step": 8173 }, { - "epoch": 2.46, - "grad_norm": 19.960813522338867, - "learning_rate": 3.6163175303197357e-06, - "loss": 1.4858, + "epoch": 1.03, + "grad_norm": 23.670427322387695, + "learning_rate": 1.3161527841693514e-05, + "loss": 1.7405, "step": 8174 }, { - "epoch": 2.46, - "grad_norm": 11.546113967895508, - "learning_rate": 3.6143129197153454e-06, - "loss": 1.2553, + "epoch": 1.03, + "grad_norm": 30.104541778564453, + "learning_rate": 1.3160691126636824e-05, + "loss": 2.3148, "step": 8175 }, { - "epoch": 2.46, - "grad_norm": 18.865678787231445, - "learning_rate": 3.6123083091109555e-06, - "loss": 1.2506, + "epoch": 1.03, + "grad_norm": 23.93840217590332, + "learning_rate": 1.3159854411580138e-05, + "loss": 1.0324, "step": 8176 }, { - "epoch": 2.46, - "grad_norm": 22.531909942626953, - "learning_rate": 3.6103036985065656e-06, - "loss": 1.6922, + "epoch": 1.03, + "grad_norm": 7.747591972351074, + "learning_rate": 1.315901769652345e-05, + "loss": 0.6227, "step": 8177 }, { - "epoch": 2.46, - "grad_norm": 20.21176528930664, - "learning_rate": 3.6082990879021752e-06, - "loss": 0.8544, + "epoch": 1.03, + "grad_norm": 13.66466236114502, + "learning_rate": 1.3158180981466762e-05, + "loss": 2.1594, "step": 8178 }, { - "epoch": 2.46, - "grad_norm": 15.008464813232422, - "learning_rate": 3.6062944772977853e-06, - "loss": 1.3876, + "epoch": 1.03, + "grad_norm": 33.47099304199219, + "learning_rate": 1.3157344266410075e-05, + "loss": 1.5768, "step": 8179 }, { - "epoch": 2.46, - "grad_norm": 24.066457748413086, - "learning_rate": 3.604289866693395e-06, - "loss": 1.4522, + "epoch": 1.03, + "grad_norm": 9.036125183105469, + "learning_rate": 1.3156507551353387e-05, + "loss": 0.7407, "step": 8180 }, { - "epoch": 2.46, - "grad_norm": 7.842005729675293, - "learning_rate": 3.6022852560890046e-06, - "loss": 0.5896, + "epoch": 1.03, + "grad_norm": 10.413735389709473, + "learning_rate": 1.3155670836296701e-05, + "loss": 1.5001, "step": 8181 }, { - "epoch": 2.46, - "grad_norm": 13.021004676818848, - "learning_rate": 3.600280645484615e-06, - "loss": 0.7843, + "epoch": 1.03, + "grad_norm": 10.219388008117676, + "learning_rate": 1.3154834121240013e-05, + "loss": 0.7861, "step": 8182 }, { - "epoch": 2.46, - "grad_norm": 21.92922592163086, - "learning_rate": 3.598276034880225e-06, - "loss": 2.2506, + "epoch": 1.03, + "grad_norm": 17.917850494384766, + "learning_rate": 1.3153997406183325e-05, + "loss": 1.4693, "step": 8183 }, { - "epoch": 2.46, - "grad_norm": 11.565603256225586, - "learning_rate": 3.5962714242758345e-06, - "loss": 1.0158, + "epoch": 1.03, + "grad_norm": 35.04172134399414, + "learning_rate": 1.3153160691126639e-05, + "loss": 0.9711, "step": 8184 }, { - "epoch": 2.46, - "grad_norm": 18.74056053161621, - "learning_rate": 3.594266813671445e-06, - "loss": 0.9444, + "epoch": 1.03, + "grad_norm": 7.332533359527588, + "learning_rate": 1.3152323976069949e-05, + "loss": 0.7555, "step": 8185 }, { - "epoch": 2.46, - "grad_norm": 14.562174797058105, - "learning_rate": 3.5922622030670546e-06, - "loss": 0.7885, + "epoch": 1.03, + "grad_norm": 10.041555404663086, + "learning_rate": 1.3151487261013262e-05, + "loss": 0.8399, "step": 8186 }, { - "epoch": 2.46, - "grad_norm": 37.87458038330078, - "learning_rate": 3.5902575924626643e-06, - "loss": 1.1931, + "epoch": 1.03, + "grad_norm": 7.564031600952148, + "learning_rate": 1.3150650545956576e-05, + "loss": 0.7352, "step": 8187 }, { - "epoch": 2.46, - "grad_norm": 14.50774097442627, - "learning_rate": 3.5882529818582744e-06, - "loss": 1.1908, + "epoch": 1.03, + "grad_norm": 12.851036071777344, + "learning_rate": 1.314981383089989e-05, + "loss": 0.5656, "step": 8188 }, { - "epoch": 2.46, - "grad_norm": 37.05324172973633, - "learning_rate": 3.5862483712538845e-06, - "loss": 2.294, + "epoch": 1.03, + "grad_norm": 5.794101715087891, + "learning_rate": 1.31489771158432e-05, + "loss": 0.7497, "step": 8189 }, { - "epoch": 2.46, - "grad_norm": 7.687615394592285, - "learning_rate": 3.584243760649494e-06, - "loss": 0.7946, + "epoch": 1.03, + "grad_norm": 5.937995910644531, + "learning_rate": 1.3148140400786514e-05, + "loss": 0.5418, "step": 8190 }, { - "epoch": 2.46, - "grad_norm": 10.446063995361328, - "learning_rate": 3.582239150045104e-06, - "loss": 1.2933, + "epoch": 1.03, + "grad_norm": 3.5626814365386963, + "learning_rate": 1.3147303685729826e-05, + "loss": 0.1635, "step": 8191 }, { - "epoch": 2.46, - "grad_norm": 25.05036735534668, - "learning_rate": 3.580234539440714e-06, - "loss": 1.8224, + "epoch": 1.03, + "grad_norm": 9.711762428283691, + "learning_rate": 1.3146466970673138e-05, + "loss": 1.463, "step": 8192 }, { - "epoch": 2.46, - "grad_norm": 12.898480415344238, - "learning_rate": 3.5782299288363235e-06, - "loss": 1.3031, + "epoch": 1.03, + "grad_norm": 16.928871154785156, + "learning_rate": 1.3145630255616451e-05, + "loss": 2.0754, "step": 8193 }, { - "epoch": 2.46, - "grad_norm": 23.968446731567383, - "learning_rate": 3.576225318231934e-06, - "loss": 2.3017, + "epoch": 1.03, + "grad_norm": 14.005651473999023, + "learning_rate": 1.3144793540559763e-05, + "loss": 0.7658, "step": 8194 }, { - "epoch": 2.46, - "grad_norm": 16.672903060913086, - "learning_rate": 3.5742207076275437e-06, - "loss": 0.9025, + "epoch": 1.03, + "grad_norm": 13.096718788146973, + "learning_rate": 1.3143956825503077e-05, + "loss": 2.0003, "step": 8195 }, { - "epoch": 2.46, - "grad_norm": 10.29025936126709, - "learning_rate": 3.5722160970231534e-06, - "loss": 1.0269, + "epoch": 1.03, + "grad_norm": 10.545746803283691, + "learning_rate": 1.3143120110446387e-05, + "loss": 1.2345, "step": 8196 }, { - "epoch": 2.46, - "grad_norm": 11.360820770263672, - "learning_rate": 3.570211486418764e-06, - "loss": 1.7671, + "epoch": 1.03, + "grad_norm": 15.581537246704102, + "learning_rate": 1.31422833953897e-05, + "loss": 0.686, "step": 8197 }, { - "epoch": 2.46, - "grad_norm": 28.387712478637695, - "learning_rate": 3.5682068758143735e-06, - "loss": 0.9867, + "epoch": 1.03, + "grad_norm": 14.40272045135498, + "learning_rate": 1.3141446680333014e-05, + "loss": 1.2653, "step": 8198 }, { - "epoch": 2.47, - "grad_norm": 37.20726776123047, - "learning_rate": 3.566202265209983e-06, - "loss": 2.4591, + "epoch": 1.03, + "grad_norm": 30.775413513183594, + "learning_rate": 1.3140609965276325e-05, + "loss": 2.5021, "step": 8199 }, { - "epoch": 2.47, - "grad_norm": 8.600688934326172, - "learning_rate": 3.5641976546055933e-06, - "loss": 0.8608, + "epoch": 1.03, + "grad_norm": 9.351306915283203, + "learning_rate": 1.3139773250219638e-05, + "loss": 0.517, "step": 8200 }, { - "epoch": 2.47, - "grad_norm": 42.92110824584961, - "learning_rate": 3.562193044001203e-06, - "loss": 1.5033, + "epoch": 1.03, + "grad_norm": 15.242072105407715, + "learning_rate": 1.3138936535162952e-05, + "loss": 0.8643, "step": 8201 }, { - "epoch": 2.47, - "grad_norm": 9.241715431213379, - "learning_rate": 3.560188433396813e-06, - "loss": 0.8824, + "epoch": 1.03, + "grad_norm": 9.428893089294434, + "learning_rate": 1.3138099820106266e-05, + "loss": 3.8307, "step": 8202 }, { - "epoch": 2.47, - "grad_norm": 18.05348777770996, - "learning_rate": 3.558183822792423e-06, - "loss": 0.8519, + "epoch": 1.03, + "grad_norm": 13.981907844543457, + "learning_rate": 1.3137263105049576e-05, + "loss": 1.1886, "step": 8203 }, { - "epoch": 2.47, - "grad_norm": 13.041126251220703, - "learning_rate": 3.5561792121880328e-06, - "loss": 0.9648, + "epoch": 1.03, + "grad_norm": 12.839777946472168, + "learning_rate": 1.313642638999289e-05, + "loss": 0.8264, "step": 8204 }, { - "epoch": 2.47, - "grad_norm": 15.101016998291016, - "learning_rate": 3.5541746015836424e-06, - "loss": 1.7081, + "epoch": 1.03, + "grad_norm": 8.989615440368652, + "learning_rate": 1.3135589674936201e-05, + "loss": 0.6943, "step": 8205 }, { - "epoch": 2.47, - "grad_norm": 21.521345138549805, - "learning_rate": 3.5521699909792525e-06, - "loss": 2.7192, + "epoch": 1.03, + "grad_norm": 12.968503952026367, + "learning_rate": 1.3134752959879513e-05, + "loss": 0.7676, "step": 8206 }, { - "epoch": 2.47, - "grad_norm": 27.694257736206055, - "learning_rate": 3.5501653803748626e-06, - "loss": 1.5167, + "epoch": 1.03, + "grad_norm": 10.712398529052734, + "learning_rate": 1.3133916244822827e-05, + "loss": 0.555, "step": 8207 }, { - "epoch": 2.47, - "grad_norm": 10.351465225219727, - "learning_rate": 3.5481607697704722e-06, - "loss": 1.3463, + "epoch": 1.03, + "grad_norm": 13.51182746887207, + "learning_rate": 1.3133079529766139e-05, + "loss": 1.0422, "step": 8208 }, { - "epoch": 2.47, - "grad_norm": 29.087156295776367, - "learning_rate": 3.546156159166082e-06, - "loss": 1.397, + "epoch": 1.03, + "grad_norm": 60.49845504760742, + "learning_rate": 1.3132242814709453e-05, + "loss": 2.5942, "step": 8209 }, { - "epoch": 2.47, - "grad_norm": 14.024823188781738, - "learning_rate": 3.5441515485616924e-06, - "loss": 0.8097, + "epoch": 1.03, + "grad_norm": 9.75202465057373, + "learning_rate": 1.3131406099652763e-05, + "loss": 1.0263, "step": 8210 }, { - "epoch": 2.47, - "grad_norm": 11.973261833190918, - "learning_rate": 3.542146937957302e-06, - "loss": 0.7829, + "epoch": 1.03, + "grad_norm": 7.670017242431641, + "learning_rate": 1.3130569384596077e-05, + "loss": 0.427, "step": 8211 }, { - "epoch": 2.47, - "grad_norm": 8.828713417053223, - "learning_rate": 3.5401423273529117e-06, - "loss": 0.7569, + "epoch": 1.03, + "grad_norm": 10.970623016357422, + "learning_rate": 1.312973266953939e-05, + "loss": 0.9992, "step": 8212 }, { - "epoch": 2.47, - "grad_norm": 32.27656173706055, - "learning_rate": 3.538137716748522e-06, - "loss": 0.9883, + "epoch": 1.03, + "grad_norm": 43.433349609375, + "learning_rate": 1.31288959544827e-05, + "loss": 2.8945, "step": 8213 }, { - "epoch": 2.47, - "grad_norm": 54.36296463012695, - "learning_rate": 3.536133106144132e-06, - "loss": 2.0888, + "epoch": 1.03, + "grad_norm": 12.724357604980469, + "learning_rate": 1.3128059239426014e-05, + "loss": 1.0343, "step": 8214 }, { - "epoch": 2.47, - "grad_norm": 30.008255004882812, - "learning_rate": 3.5341284955397416e-06, - "loss": 1.1675, + "epoch": 1.03, + "grad_norm": 10.736432075500488, + "learning_rate": 1.3127222524369328e-05, + "loss": 1.8907, "step": 8215 }, { - "epoch": 2.47, - "grad_norm": 20.43587875366211, - "learning_rate": 3.5321238849353517e-06, - "loss": 1.0271, + "epoch": 1.03, + "grad_norm": 25.897153854370117, + "learning_rate": 1.3126385809312641e-05, + "loss": 1.8143, "step": 8216 }, { - "epoch": 2.47, - "grad_norm": 9.914734840393066, - "learning_rate": 3.5301192743309613e-06, - "loss": 1.4436, + "epoch": 1.03, + "grad_norm": 10.944652557373047, + "learning_rate": 1.3125549094255952e-05, + "loss": 1.6987, "step": 8217 }, { - "epoch": 2.47, - "grad_norm": 10.097274780273438, - "learning_rate": 3.5281146637265714e-06, - "loss": 1.2491, + "epoch": 1.03, + "grad_norm": 15.509830474853516, + "learning_rate": 1.3124712379199265e-05, + "loss": 1.0449, "step": 8218 }, { - "epoch": 2.47, - "grad_norm": 13.336429595947266, - "learning_rate": 3.5261100531221815e-06, - "loss": 1.1802, + "epoch": 1.03, + "grad_norm": 25.927719116210938, + "learning_rate": 1.3123875664142577e-05, + "loss": 1.7837, "step": 8219 }, { - "epoch": 2.47, - "grad_norm": 14.391281127929688, - "learning_rate": 3.524105442517791e-06, - "loss": 1.0317, + "epoch": 1.03, + "grad_norm": 16.40271759033203, + "learning_rate": 1.312303894908589e-05, + "loss": 1.5452, "step": 8220 }, { - "epoch": 2.47, - "grad_norm": 21.83466339111328, - "learning_rate": 3.522100831913401e-06, - "loss": 2.0406, + "epoch": 1.03, + "grad_norm": 8.034306526184082, + "learning_rate": 1.3122202234029203e-05, + "loss": 0.4517, "step": 8221 }, { - "epoch": 2.47, - "grad_norm": 13.69311809539795, - "learning_rate": 3.5200962213090113e-06, - "loss": 1.0287, + "epoch": 1.03, + "grad_norm": 18.109060287475586, + "learning_rate": 1.3121365518972515e-05, + "loss": 1.3607, "step": 8222 }, { - "epoch": 2.47, - "grad_norm": 14.4343900680542, - "learning_rate": 3.518091610704621e-06, - "loss": 1.1169, + "epoch": 1.03, + "grad_norm": 13.58246898651123, + "learning_rate": 1.3120528803915829e-05, + "loss": 1.6415, "step": 8223 }, { - "epoch": 2.47, - "grad_norm": 19.60431671142578, - "learning_rate": 3.5160870001002306e-06, - "loss": 1.2294, + "epoch": 1.03, + "grad_norm": 20.36124038696289, + "learning_rate": 1.3119692088859139e-05, + "loss": 1.2454, "step": 8224 }, { - "epoch": 2.47, - "grad_norm": 8.042855262756348, - "learning_rate": 3.5140823894958407e-06, - "loss": 0.8052, + "epoch": 1.03, + "grad_norm": 32.146080017089844, + "learning_rate": 1.3118855373802452e-05, + "loss": 1.7477, "step": 8225 }, { - "epoch": 2.47, - "grad_norm": 13.775824546813965, - "learning_rate": 3.512077778891451e-06, - "loss": 1.0722, + "epoch": 1.03, + "grad_norm": 12.584585189819336, + "learning_rate": 1.3118018658745766e-05, + "loss": 1.3979, "step": 8226 }, { - "epoch": 2.47, - "grad_norm": 15.00515079498291, - "learning_rate": 3.5100731682870605e-06, - "loss": 1.0372, + "epoch": 1.03, + "grad_norm": 10.174505233764648, + "learning_rate": 1.3117181943689076e-05, + "loss": 0.8317, "step": 8227 }, { - "epoch": 2.47, - "grad_norm": 52.58286666870117, - "learning_rate": 3.5080685576826705e-06, - "loss": 4.7955, + "epoch": 1.03, + "grad_norm": 39.13493728637695, + "learning_rate": 1.311634522863239e-05, + "loss": 3.3734, "step": 8228 }, { - "epoch": 2.47, - "grad_norm": 24.154563903808594, - "learning_rate": 3.50606394707828e-06, - "loss": 1.4256, + "epoch": 1.03, + "grad_norm": 9.438246726989746, + "learning_rate": 1.3115508513575704e-05, + "loss": 0.4985, "step": 8229 }, { - "epoch": 2.47, - "grad_norm": 13.502425193786621, - "learning_rate": 3.50405933647389e-06, - "loss": 1.381, + "epoch": 1.03, + "grad_norm": 22.061195373535156, + "learning_rate": 1.3114671798519016e-05, + "loss": 1.6837, "step": 8230 }, { - "epoch": 2.47, - "grad_norm": 9.207113265991211, - "learning_rate": 3.5020547258695004e-06, - "loss": 0.4176, + "epoch": 1.03, + "grad_norm": 11.457298278808594, + "learning_rate": 1.3113835083462328e-05, + "loss": 0.9882, "step": 8231 }, { - "epoch": 2.48, - "grad_norm": 21.608287811279297, - "learning_rate": 3.50005011526511e-06, - "loss": 0.7372, + "epoch": 1.03, + "grad_norm": 27.55769920349121, + "learning_rate": 1.3112998368405641e-05, + "loss": 1.9697, "step": 8232 }, { - "epoch": 2.48, - "grad_norm": 13.105839729309082, - "learning_rate": 3.4980455046607197e-06, - "loss": 0.719, + "epoch": 1.03, + "grad_norm": 61.551185607910156, + "learning_rate": 1.3112161653348953e-05, + "loss": 1.7753, "step": 8233 }, { - "epoch": 2.48, - "grad_norm": 17.119613647460938, - "learning_rate": 3.49604089405633e-06, - "loss": 1.2089, + "epoch": 1.03, + "grad_norm": 9.996932983398438, + "learning_rate": 1.3111324938292265e-05, + "loss": 1.5993, "step": 8234 }, { - "epoch": 2.48, - "grad_norm": 129.32228088378906, - "learning_rate": 3.49403628345194e-06, - "loss": 2.5109, + "epoch": 1.03, + "grad_norm": 10.689933776855469, + "learning_rate": 1.3110488223235579e-05, + "loss": 1.6766, "step": 8235 }, { - "epoch": 2.48, - "grad_norm": 11.509007453918457, - "learning_rate": 3.4920316728475495e-06, - "loss": 1.1199, + "epoch": 1.03, + "grad_norm": 26.191883087158203, + "learning_rate": 1.310965150817889e-05, + "loss": 2.6339, "step": 8236 }, { - "epoch": 2.48, - "grad_norm": 17.33429718017578, - "learning_rate": 3.4900270622431596e-06, - "loss": 1.3015, + "epoch": 1.03, + "grad_norm": 31.052392959594727, + "learning_rate": 1.3108814793122204e-05, + "loss": 4.1182, "step": 8237 }, { - "epoch": 2.48, - "grad_norm": 28.055233001708984, - "learning_rate": 3.4880224516387697e-06, - "loss": 2.121, + "epoch": 1.03, + "grad_norm": 6.11808967590332, + "learning_rate": 1.3107978078065515e-05, + "loss": 0.3464, "step": 8238 }, { - "epoch": 2.48, - "grad_norm": 22.95208740234375, - "learning_rate": 3.4860178410343794e-06, - "loss": 1.7331, + "epoch": 1.03, + "grad_norm": 8.085333824157715, + "learning_rate": 1.3107141363008828e-05, + "loss": 1.1866, "step": 8239 }, { - "epoch": 2.48, - "grad_norm": 12.567434310913086, - "learning_rate": 3.4840132304299894e-06, - "loss": 1.5196, + "epoch": 1.03, + "grad_norm": 16.53407859802246, + "learning_rate": 1.3106304647952142e-05, + "loss": 3.1467, "step": 8240 }, { - "epoch": 2.48, - "grad_norm": 40.08758544921875, - "learning_rate": 3.482008619825599e-06, - "loss": 1.1594, + "epoch": 1.03, + "grad_norm": 10.062830924987793, + "learning_rate": 1.3105467932895452e-05, + "loss": 1.7672, "step": 8241 }, { - "epoch": 2.48, - "grad_norm": 17.78892707824707, - "learning_rate": 3.4800040092212088e-06, - "loss": 1.0566, + "epoch": 1.03, + "grad_norm": 20.216842651367188, + "learning_rate": 1.3104631217838766e-05, + "loss": 2.2723, "step": 8242 }, { - "epoch": 2.48, - "grad_norm": 28.924680709838867, - "learning_rate": 3.4779993986168193e-06, - "loss": 1.722, + "epoch": 1.03, + "grad_norm": 17.774564743041992, + "learning_rate": 1.310379450278208e-05, + "loss": 0.7638, "step": 8243 }, { - "epoch": 2.48, - "grad_norm": 13.244800567626953, - "learning_rate": 3.475994788012429e-06, - "loss": 1.5066, + "epoch": 1.03, + "grad_norm": 7.087859153747559, + "learning_rate": 1.3102957787725391e-05, + "loss": 0.5927, "step": 8244 }, { - "epoch": 2.48, - "grad_norm": 46.74547576904297, - "learning_rate": 3.4739901774080386e-06, - "loss": 1.802, + "epoch": 1.03, + "grad_norm": 6.852178573608398, + "learning_rate": 1.3102121072668703e-05, + "loss": 0.5539, "step": 8245 }, { - "epoch": 2.48, - "grad_norm": 19.008142471313477, - "learning_rate": 3.4719855668036482e-06, - "loss": 1.4982, + "epoch": 1.03, + "grad_norm": 25.83189582824707, + "learning_rate": 1.3101284357612017e-05, + "loss": 2.1372, "step": 8246 }, { - "epoch": 2.48, - "grad_norm": 16.53547477722168, - "learning_rate": 3.4699809561992588e-06, - "loss": 1.4899, + "epoch": 1.03, + "grad_norm": 14.124688148498535, + "learning_rate": 1.3100447642555329e-05, + "loss": 1.074, "step": 8247 }, { - "epoch": 2.48, - "grad_norm": 18.363264083862305, - "learning_rate": 3.4679763455948684e-06, - "loss": 1.022, + "epoch": 1.04, + "grad_norm": 7.1842360496521, + "learning_rate": 1.3099610927498641e-05, + "loss": 3.1602, "step": 8248 }, { - "epoch": 2.48, - "grad_norm": 7.702117443084717, - "learning_rate": 3.465971734990478e-06, - "loss": 1.3944, + "epoch": 1.04, + "grad_norm": 16.31489372253418, + "learning_rate": 1.3098774212441953e-05, + "loss": 1.529, "step": 8249 }, { - "epoch": 2.48, - "grad_norm": 8.90109634399414, - "learning_rate": 3.4639671243860886e-06, - "loss": 0.8569, + "epoch": 1.04, + "grad_norm": 18.872175216674805, + "learning_rate": 1.3097937497385267e-05, + "loss": 1.4403, "step": 8250 }, { - "epoch": 2.48, - "grad_norm": 34.664485931396484, - "learning_rate": 3.4619625137816982e-06, - "loss": 1.817, + "epoch": 1.04, + "grad_norm": 24.27296257019043, + "learning_rate": 1.309710078232858e-05, + "loss": 1.354, "step": 8251 }, { - "epoch": 2.48, - "grad_norm": 12.355849266052246, - "learning_rate": 3.459957903177308e-06, - "loss": 0.9937, + "epoch": 1.04, + "grad_norm": 19.99920082092285, + "learning_rate": 1.309626406727189e-05, + "loss": 0.6707, "step": 8252 }, { - "epoch": 2.48, - "grad_norm": 12.493422508239746, - "learning_rate": 3.457953292572918e-06, - "loss": 1.1074, + "epoch": 1.04, + "grad_norm": 18.022979736328125, + "learning_rate": 1.3095427352215204e-05, + "loss": 0.9206, "step": 8253 }, { - "epoch": 2.48, - "grad_norm": 20.661190032958984, - "learning_rate": 3.4559486819685277e-06, - "loss": 2.2912, + "epoch": 1.04, + "grad_norm": 8.533984184265137, + "learning_rate": 1.3094590637158518e-05, + "loss": 1.351, "step": 8254 }, { - "epoch": 2.48, - "grad_norm": 19.867286682128906, - "learning_rate": 3.4539440713641377e-06, - "loss": 1.6991, + "epoch": 1.04, + "grad_norm": 9.833261489868164, + "learning_rate": 1.3093753922101828e-05, + "loss": 1.4907, "step": 8255 }, { - "epoch": 2.48, - "grad_norm": 13.034092903137207, - "learning_rate": 3.451939460759748e-06, - "loss": 0.8973, + "epoch": 1.04, + "grad_norm": 21.648242950439453, + "learning_rate": 1.3092917207045142e-05, + "loss": 0.5904, "step": 8256 }, { - "epoch": 2.48, - "grad_norm": 14.20175552368164, - "learning_rate": 3.4499348501553575e-06, - "loss": 1.1049, + "epoch": 1.04, + "grad_norm": 41.22957992553711, + "learning_rate": 1.3092080491988455e-05, + "loss": 0.6205, "step": 8257 }, { - "epoch": 2.48, - "grad_norm": 12.550244331359863, - "learning_rate": 3.447930239550967e-06, - "loss": 1.1492, + "epoch": 1.04, + "grad_norm": 5.088689804077148, + "learning_rate": 1.3091243776931767e-05, + "loss": 0.337, "step": 8258 }, { - "epoch": 2.48, - "grad_norm": 13.553984642028809, - "learning_rate": 3.4459256289465777e-06, - "loss": 1.1702, + "epoch": 1.04, + "grad_norm": 14.02363109588623, + "learning_rate": 1.309040706187508e-05, + "loss": 3.9714, "step": 8259 }, { - "epoch": 2.48, - "grad_norm": 17.68621826171875, - "learning_rate": 3.4439210183421873e-06, - "loss": 1.4533, + "epoch": 1.04, + "grad_norm": 15.000255584716797, + "learning_rate": 1.3089570346818393e-05, + "loss": 1.4824, "step": 8260 }, { - "epoch": 2.48, - "grad_norm": 19.83187484741211, - "learning_rate": 3.441916407737797e-06, - "loss": 1.4019, + "epoch": 1.04, + "grad_norm": 25.85439109802246, + "learning_rate": 1.3088733631761705e-05, + "loss": 2.2748, "step": 8261 }, { - "epoch": 2.48, - "grad_norm": 11.274413108825684, - "learning_rate": 3.4399117971334075e-06, - "loss": 1.7506, + "epoch": 1.04, + "grad_norm": 39.99784469604492, + "learning_rate": 1.3087896916705017e-05, + "loss": 0.5131, "step": 8262 }, { - "epoch": 2.48, - "grad_norm": 27.54680061340332, - "learning_rate": 3.437907186529017e-06, - "loss": 1.5858, + "epoch": 1.04, + "grad_norm": 6.106348514556885, + "learning_rate": 1.3087060201648329e-05, + "loss": 0.4545, "step": 8263 }, { - "epoch": 2.48, - "grad_norm": 9.336912155151367, - "learning_rate": 3.435902575924627e-06, - "loss": 0.7806, + "epoch": 1.04, + "grad_norm": 17.503942489624023, + "learning_rate": 1.3086223486591642e-05, + "loss": 2.8439, "step": 8264 }, { - "epoch": 2.48, - "grad_norm": 53.21189498901367, - "learning_rate": 3.433897965320237e-06, - "loss": 1.7986, + "epoch": 1.04, + "grad_norm": 7.972207069396973, + "learning_rate": 1.3085386771534956e-05, + "loss": 0.4385, "step": 8265 }, { - "epoch": 2.49, - "grad_norm": 9.917101860046387, - "learning_rate": 3.4318933547158465e-06, - "loss": 0.8816, + "epoch": 1.04, + "grad_norm": 21.280460357666016, + "learning_rate": 1.3084550056478266e-05, + "loss": 1.4569, "step": 8266 }, { - "epoch": 2.49, - "grad_norm": 8.793371200561523, - "learning_rate": 3.4298887441114566e-06, - "loss": 0.8211, + "epoch": 1.04, + "grad_norm": 9.055754661560059, + "learning_rate": 1.308371334142158e-05, + "loss": 1.4348, "step": 8267 }, { - "epoch": 2.49, - "grad_norm": 6.4236626625061035, - "learning_rate": 3.4278841335070667e-06, - "loss": 0.7913, + "epoch": 1.04, + "grad_norm": 10.815393447875977, + "learning_rate": 1.3082876626364894e-05, + "loss": 0.6324, "step": 8268 }, { - "epoch": 2.49, - "grad_norm": 22.39311408996582, - "learning_rate": 3.4258795229026764e-06, - "loss": 1.1071, + "epoch": 1.04, + "grad_norm": 4.52388858795166, + "learning_rate": 1.3082039911308204e-05, + "loss": 0.0638, "step": 8269 }, { - "epoch": 2.49, - "grad_norm": 15.763972282409668, - "learning_rate": 3.423874912298286e-06, - "loss": 0.7142, + "epoch": 1.04, + "grad_norm": 19.21198081970215, + "learning_rate": 1.3081203196251517e-05, + "loss": 1.4016, "step": 8270 }, { - "epoch": 2.49, - "grad_norm": 16.8847713470459, - "learning_rate": 3.4218703016938965e-06, - "loss": 1.2469, + "epoch": 1.04, + "grad_norm": 16.03714370727539, + "learning_rate": 1.3080366481194831e-05, + "loss": 1.0052, "step": 8271 }, { - "epoch": 2.49, - "grad_norm": 9.058008193969727, - "learning_rate": 3.419865691089506e-06, - "loss": 1.3107, + "epoch": 1.04, + "grad_norm": 14.488385200500488, + "learning_rate": 1.3079529766138143e-05, + "loss": 1.3679, "step": 8272 }, { - "epoch": 2.49, - "grad_norm": 19.072185516357422, - "learning_rate": 3.417861080485116e-06, - "loss": 1.2581, + "epoch": 1.04, + "grad_norm": 19.894723892211914, + "learning_rate": 1.3078693051081455e-05, + "loss": 1.1507, "step": 8273 }, { - "epoch": 2.49, - "grad_norm": 30.158096313476562, - "learning_rate": 3.4158564698807264e-06, - "loss": 1.2928, + "epoch": 1.04, + "grad_norm": 5.145405292510986, + "learning_rate": 1.3077856336024769e-05, + "loss": 0.2995, "step": 8274 }, { - "epoch": 2.49, - "grad_norm": 11.686331748962402, - "learning_rate": 3.413851859276336e-06, - "loss": 1.0867, + "epoch": 1.04, + "grad_norm": 9.702325820922852, + "learning_rate": 1.307701962096808e-05, + "loss": 1.1392, "step": 8275 }, { - "epoch": 2.49, - "grad_norm": 13.757401466369629, - "learning_rate": 3.4118472486719457e-06, - "loss": 1.5826, + "epoch": 1.04, + "grad_norm": 14.014906883239746, + "learning_rate": 1.3076182905911393e-05, + "loss": 1.3747, "step": 8276 }, { - "epoch": 2.49, - "grad_norm": 18.740379333496094, - "learning_rate": 3.4098426380675558e-06, - "loss": 1.2056, + "epoch": 1.04, + "grad_norm": 20.981679916381836, + "learning_rate": 1.3075346190854705e-05, + "loss": 1.193, "step": 8277 }, { - "epoch": 2.49, - "grad_norm": 32.20988082885742, - "learning_rate": 3.4078380274631654e-06, - "loss": 1.3858, + "epoch": 1.04, + "grad_norm": 77.5040512084961, + "learning_rate": 1.3074509475798018e-05, + "loss": 2.1638, "step": 8278 }, { - "epoch": 2.49, - "grad_norm": 13.55139446258545, - "learning_rate": 3.4058334168587755e-06, - "loss": 1.0321, + "epoch": 1.04, + "grad_norm": 6.79390811920166, + "learning_rate": 1.3073672760741332e-05, + "loss": 0.7708, "step": 8279 }, { - "epoch": 2.49, - "grad_norm": 10.795275688171387, - "learning_rate": 3.4038288062543856e-06, - "loss": 1.2745, - "step": 8280 - }, - { - "epoch": 2.49, - "eval_loss": 0.17804668843746185, - "eval_runtime": 43.714, - "eval_samples_per_second": 33.834, - "eval_steps_per_second": 33.834, + "epoch": 1.04, + "grad_norm": 43.72199249267578, + "learning_rate": 1.3072836045684642e-05, + "loss": 2.7219, "step": 8280 }, { - "epoch": 2.49, - "grad_norm": 11.243696212768555, - "learning_rate": 3.4018241956499953e-06, - "loss": 0.8101, + "epoch": 1.04, + "grad_norm": 12.676359176635742, + "learning_rate": 1.3071999330627956e-05, + "loss": 0.864, "step": 8281 }, { - "epoch": 2.49, - "grad_norm": 10.823609352111816, - "learning_rate": 3.399819585045605e-06, - "loss": 2.0621, + "epoch": 1.04, + "grad_norm": 13.768181800842285, + "learning_rate": 1.307116261557127e-05, + "loss": 0.7752, "step": 8282 }, { - "epoch": 2.49, - "grad_norm": 52.87884521484375, - "learning_rate": 3.3978149744412154e-06, - "loss": 2.0929, + "epoch": 1.04, + "grad_norm": 13.410317420959473, + "learning_rate": 1.307032590051458e-05, + "loss": 1.1399, "step": 8283 }, { - "epoch": 2.49, - "grad_norm": 11.563464164733887, - "learning_rate": 3.395810363836825e-06, - "loss": 0.8609, + "epoch": 1.04, + "grad_norm": 55.05906677246094, + "learning_rate": 1.3069489185457893e-05, + "loss": 3.1471, "step": 8284 }, { - "epoch": 2.49, - "grad_norm": 39.08991241455078, - "learning_rate": 3.3938057532324348e-06, - "loss": 1.3453, + "epoch": 1.04, + "grad_norm": 11.868683815002441, + "learning_rate": 1.3068652470401207e-05, + "loss": 0.7911, "step": 8285 }, { - "epoch": 2.49, - "grad_norm": 72.77201843261719, - "learning_rate": 3.391801142628045e-06, - "loss": 1.5782, + "epoch": 1.04, + "grad_norm": 16.719898223876953, + "learning_rate": 1.3067815755344519e-05, + "loss": 2.0644, "step": 8286 }, { - "epoch": 2.49, - "grad_norm": 44.493431091308594, - "learning_rate": 3.389796532023655e-06, - "loss": 1.312, + "epoch": 1.04, + "grad_norm": 9.625706672668457, + "learning_rate": 1.3066979040287831e-05, + "loss": 1.6071, "step": 8287 }, { - "epoch": 2.49, - "grad_norm": 49.612998962402344, - "learning_rate": 3.3877919214192646e-06, - "loss": 2.2307, + "epoch": 1.04, + "grad_norm": 117.92691040039062, + "learning_rate": 1.3066142325231143e-05, + "loss": 2.3713, "step": 8288 }, { - "epoch": 2.49, - "grad_norm": 30.34926986694336, - "learning_rate": 3.3857873108148743e-06, - "loss": 1.1781, + "epoch": 1.04, + "grad_norm": 8.23699951171875, + "learning_rate": 1.3065305610174456e-05, + "loss": 0.5104, "step": 8289 }, { - "epoch": 2.49, - "grad_norm": 9.271419525146484, - "learning_rate": 3.3837827002104843e-06, - "loss": 0.9995, + "epoch": 1.04, + "grad_norm": 15.290740966796875, + "learning_rate": 1.3064468895117768e-05, + "loss": 0.795, "step": 8290 }, { - "epoch": 2.49, - "grad_norm": 16.43708610534668, - "learning_rate": 3.3817780896060944e-06, - "loss": 1.193, + "epoch": 1.04, + "grad_norm": 8.10437297821045, + "learning_rate": 1.306363218006108e-05, + "loss": 0.5317, "step": 8291 }, { - "epoch": 2.49, - "grad_norm": 15.72551441192627, - "learning_rate": 3.379773479001704e-06, - "loss": 1.3098, + "epoch": 1.04, + "grad_norm": 13.65834903717041, + "learning_rate": 1.3062795465004394e-05, + "loss": 1.32, "step": 8292 }, { - "epoch": 2.49, - "grad_norm": 7.720631122589111, - "learning_rate": 3.377768868397314e-06, - "loss": 0.8755, + "epoch": 1.04, + "grad_norm": 38.906044006347656, + "learning_rate": 1.3061958749947708e-05, + "loss": 0.8079, "step": 8293 }, { - "epoch": 2.49, - "grad_norm": 27.38102149963379, - "learning_rate": 3.375764257792924e-06, - "loss": 1.4355, + "epoch": 1.04, + "grad_norm": 25.86734962463379, + "learning_rate": 1.3061122034891018e-05, + "loss": 1.9084, "step": 8294 }, { - "epoch": 2.49, - "grad_norm": 12.947731018066406, - "learning_rate": 3.3737596471885335e-06, - "loss": 1.0127, + "epoch": 1.04, + "grad_norm": 13.268917083740234, + "learning_rate": 1.3060285319834332e-05, + "loss": 1.4258, "step": 8295 }, { - "epoch": 2.49, - "grad_norm": 18.314491271972656, - "learning_rate": 3.371755036584144e-06, - "loss": 1.1366, + "epoch": 1.04, + "grad_norm": 6.482137203216553, + "learning_rate": 1.3059448604777645e-05, + "loss": 0.7315, "step": 8296 }, { - "epoch": 2.49, - "grad_norm": 20.400217056274414, - "learning_rate": 3.3697504259797537e-06, - "loss": 1.2425, + "epoch": 1.04, + "grad_norm": 22.378463745117188, + "learning_rate": 1.3058611889720956e-05, + "loss": 1.3847, "step": 8297 }, { - "epoch": 2.49, - "grad_norm": 21.475561141967773, - "learning_rate": 3.3677458153753633e-06, - "loss": 0.9925, + "epoch": 1.04, + "grad_norm": 5.98436975479126, + "learning_rate": 1.305777517466427e-05, + "loss": 1.3013, "step": 8298 }, { - "epoch": 2.5, - "grad_norm": 21.971511840820312, - "learning_rate": 3.365741204770974e-06, - "loss": 1.3472, + "epoch": 1.04, + "grad_norm": 24.26239776611328, + "learning_rate": 1.3056938459607583e-05, + "loss": 2.889, "step": 8299 }, { - "epoch": 2.5, - "grad_norm": 11.167675018310547, - "learning_rate": 3.3637365941665835e-06, - "loss": 1.0643, + "epoch": 1.04, + "grad_norm": 20.896337509155273, + "learning_rate": 1.3056101744550895e-05, + "loss": 2.7163, "step": 8300 }, { - "epoch": 2.5, - "grad_norm": 55.813114166259766, - "learning_rate": 3.361731983562193e-06, - "loss": 2.2609, + "epoch": 1.04, + "grad_norm": 7.655635833740234, + "learning_rate": 1.3055265029494207e-05, + "loss": 0.6938, "step": 8301 }, { - "epoch": 2.5, - "grad_norm": 14.630870819091797, - "learning_rate": 3.3597273729578032e-06, - "loss": 0.6388, + "epoch": 1.04, + "grad_norm": 8.687627792358398, + "learning_rate": 1.3054428314437519e-05, + "loss": 1.4532, "step": 8302 }, { - "epoch": 2.5, - "grad_norm": 21.16858673095703, - "learning_rate": 3.357722762353413e-06, - "loss": 1.4621, + "epoch": 1.04, + "grad_norm": 27.534439086914062, + "learning_rate": 1.3053591599380832e-05, + "loss": 0.9914, "step": 8303 }, { - "epoch": 2.5, - "grad_norm": 25.93169593811035, - "learning_rate": 3.355718151749023e-06, - "loss": 1.6922, + "epoch": 1.04, + "grad_norm": 11.624176025390625, + "learning_rate": 1.3052754884324144e-05, + "loss": 1.4505, "step": 8304 }, { - "epoch": 2.5, - "grad_norm": 17.35527229309082, - "learning_rate": 3.353713541144633e-06, - "loss": 1.1722, + "epoch": 1.04, + "grad_norm": 35.57938766479492, + "learning_rate": 1.3051918169267456e-05, + "loss": 0.9582, "step": 8305 }, { - "epoch": 2.5, - "grad_norm": 33.93075942993164, - "learning_rate": 3.3517089305402427e-06, - "loss": 1.9865, + "epoch": 1.04, + "grad_norm": 17.190563201904297, + "learning_rate": 1.305108145421077e-05, + "loss": 1.7702, "step": 8306 }, { - "epoch": 2.5, - "grad_norm": 39.736629486083984, - "learning_rate": 3.3497043199358524e-06, - "loss": 2.1434, + "epoch": 1.04, + "grad_norm": 12.20495891571045, + "learning_rate": 1.3050244739154084e-05, + "loss": 0.6509, "step": 8307 }, { - "epoch": 2.5, - "grad_norm": 16.60782814025879, - "learning_rate": 3.347699709331463e-06, - "loss": 1.4811, + "epoch": 1.04, + "grad_norm": 3.5402653217315674, + "learning_rate": 1.3049408024097394e-05, + "loss": 0.1798, "step": 8308 }, { - "epoch": 2.5, - "grad_norm": 23.1959285736084, - "learning_rate": 3.3456950987270725e-06, - "loss": 1.8844, + "epoch": 1.04, + "grad_norm": 37.674285888671875, + "learning_rate": 1.3048571309040707e-05, + "loss": 2.1001, "step": 8309 }, { - "epoch": 2.5, - "grad_norm": 15.47692584991455, - "learning_rate": 3.343690488122682e-06, - "loss": 1.1461, + "epoch": 1.04, + "grad_norm": 9.620979309082031, + "learning_rate": 1.3047734593984021e-05, + "loss": 1.6433, "step": 8310 }, { - "epoch": 2.5, - "grad_norm": 10.405765533447266, - "learning_rate": 3.3416858775182927e-06, - "loss": 1.0609, + "epoch": 1.04, + "grad_norm": 8.073790550231934, + "learning_rate": 1.3046897878927331e-05, + "loss": 0.5259, "step": 8311 }, { - "epoch": 2.5, - "grad_norm": 7.443643569946289, - "learning_rate": 3.3396812669139024e-06, - "loss": 0.5891, + "epoch": 1.04, + "grad_norm": 17.23143768310547, + "learning_rate": 1.3046061163870645e-05, + "loss": 0.92, "step": 8312 }, { - "epoch": 2.5, - "grad_norm": 40.757137298583984, - "learning_rate": 3.337676656309512e-06, - "loss": 1.1556, + "epoch": 1.04, + "grad_norm": 15.197664260864258, + "learning_rate": 1.3045224448813959e-05, + "loss": 0.8573, "step": 8313 }, { - "epoch": 2.5, - "grad_norm": 23.71717071533203, - "learning_rate": 3.335672045705122e-06, - "loss": 1.3438, + "epoch": 1.04, + "grad_norm": 40.136722564697266, + "learning_rate": 1.304438773375727e-05, + "loss": 1.027, "step": 8314 }, { - "epoch": 2.5, - "grad_norm": 9.829071998596191, - "learning_rate": 3.3336674351007318e-06, - "loss": 1.1778, + "epoch": 1.04, + "grad_norm": 13.674126625061035, + "learning_rate": 1.3043551018700583e-05, + "loss": 1.138, "step": 8315 }, { - "epoch": 2.5, - "grad_norm": 36.342872619628906, - "learning_rate": 3.331662824496342e-06, - "loss": 1.5935, + "epoch": 1.04, + "grad_norm": 15.043445587158203, + "learning_rate": 1.3042714303643895e-05, + "loss": 1.1383, "step": 8316 }, { - "epoch": 2.5, - "grad_norm": 34.38883590698242, - "learning_rate": 3.329658213891952e-06, - "loss": 1.1699, + "epoch": 1.04, + "grad_norm": 17.719526290893555, + "learning_rate": 1.3041877588587208e-05, + "loss": 0.6344, "step": 8317 }, { - "epoch": 2.5, - "grad_norm": 13.522592544555664, - "learning_rate": 3.3276536032875616e-06, - "loss": 1.3218, + "epoch": 1.04, + "grad_norm": 16.272741317749023, + "learning_rate": 1.304104087353052e-05, + "loss": 0.9304, "step": 8318 }, { - "epoch": 2.5, - "grad_norm": 18.872207641601562, - "learning_rate": 3.3256489926831713e-06, - "loss": 1.0661, + "epoch": 1.04, + "grad_norm": 13.373222351074219, + "learning_rate": 1.3040204158473832e-05, + "loss": 1.0971, "step": 8319 }, { - "epoch": 2.5, - "grad_norm": 48.556331634521484, - "learning_rate": 3.3236443820787818e-06, - "loss": 2.3702, + "epoch": 1.04, + "grad_norm": 15.191892623901367, + "learning_rate": 1.3039367443417146e-05, + "loss": 2.0868, "step": 8320 }, { - "epoch": 2.5, - "grad_norm": 26.391420364379883, - "learning_rate": 3.3216397714743914e-06, - "loss": 1.0047, + "epoch": 1.04, + "grad_norm": 8.404037475585938, + "learning_rate": 1.303853072836046e-05, + "loss": 1.9416, "step": 8321 }, { - "epoch": 2.5, - "grad_norm": 10.691487312316895, - "learning_rate": 3.319635160870001e-06, - "loss": 0.6147, + "epoch": 1.04, + "grad_norm": 26.0845890045166, + "learning_rate": 1.303769401330377e-05, + "loss": 2.1982, "step": 8322 }, { - "epoch": 2.5, - "grad_norm": 11.963746070861816, - "learning_rate": 3.3176305502656116e-06, - "loss": 1.4303, + "epoch": 1.04, + "grad_norm": 24.468116760253906, + "learning_rate": 1.3036857298247083e-05, + "loss": 0.9575, "step": 8323 }, { - "epoch": 2.5, - "grad_norm": 14.355230331420898, - "learning_rate": 3.3156259396612213e-06, - "loss": 1.6003, + "epoch": 1.04, + "grad_norm": 6.69379186630249, + "learning_rate": 1.3036020583190397e-05, + "loss": 0.4925, "step": 8324 }, { - "epoch": 2.5, - "grad_norm": 14.952115058898926, - "learning_rate": 3.313621329056831e-06, - "loss": 1.2936, + "epoch": 1.04, + "grad_norm": 7.881843090057373, + "learning_rate": 1.3035183868133707e-05, + "loss": 0.6911, "step": 8325 }, { - "epoch": 2.5, - "grad_norm": 11.36012077331543, - "learning_rate": 3.311616718452441e-06, - "loss": 1.3233, + "epoch": 1.04, + "grad_norm": 7.593258857727051, + "learning_rate": 1.303434715307702e-05, + "loss": 1.0592, "step": 8326 }, { - "epoch": 2.5, - "grad_norm": 73.88568115234375, - "learning_rate": 3.3096121078480507e-06, - "loss": 2.0371, + "epoch": 1.05, + "grad_norm": 45.25659942626953, + "learning_rate": 1.3033510438020334e-05, + "loss": 1.0704, "step": 8327 }, { - "epoch": 2.5, - "grad_norm": 7.392289638519287, - "learning_rate": 3.3076074972436608e-06, - "loss": 0.817, + "epoch": 1.05, + "grad_norm": 8.04725170135498, + "learning_rate": 1.3032673722963646e-05, + "loss": 0.7889, "step": 8328 }, { - "epoch": 2.5, - "grad_norm": 26.323123931884766, - "learning_rate": 3.305602886639271e-06, - "loss": 1.2377, + "epoch": 1.05, + "grad_norm": 14.932684898376465, + "learning_rate": 1.3031837007906958e-05, + "loss": 0.8947, "step": 8329 }, { - "epoch": 2.5, - "grad_norm": 6.049129486083984, - "learning_rate": 3.3035982760348805e-06, - "loss": 0.725, + "epoch": 1.05, + "grad_norm": 21.940521240234375, + "learning_rate": 1.303100029285027e-05, + "loss": 2.4528, "step": 8330 }, { - "epoch": 2.5, - "grad_norm": 18.688451766967773, - "learning_rate": 3.30159366543049e-06, - "loss": 1.0841, + "epoch": 1.05, + "grad_norm": 10.622476577758789, + "learning_rate": 1.3030163577793584e-05, + "loss": 1.2091, "step": 8331 }, { - "epoch": 2.51, - "grad_norm": 19.779504776000977, - "learning_rate": 3.2995890548261e-06, - "loss": 1.6004, + "epoch": 1.05, + "grad_norm": 5.0412774085998535, + "learning_rate": 1.3029326862736896e-05, + "loss": 0.4054, "step": 8332 }, { - "epoch": 2.51, - "grad_norm": 37.673744201660156, - "learning_rate": 3.2975844442217103e-06, - "loss": 1.7093, + "epoch": 1.05, + "grad_norm": 4.07755708694458, + "learning_rate": 1.3028490147680208e-05, + "loss": 0.287, "step": 8333 }, { - "epoch": 2.51, - "grad_norm": 15.679644584655762, - "learning_rate": 3.29557983361732e-06, - "loss": 1.4718, + "epoch": 1.05, + "grad_norm": 12.299552917480469, + "learning_rate": 1.3027653432623522e-05, + "loss": 1.523, "step": 8334 }, { - "epoch": 2.51, - "grad_norm": 17.381481170654297, - "learning_rate": 3.2935752230129297e-06, - "loss": 2.5925, + "epoch": 1.05, + "grad_norm": 30.969959259033203, + "learning_rate": 1.3026816717566835e-05, + "loss": 1.1815, "step": 8335 }, { - "epoch": 2.51, - "grad_norm": 9.106160163879395, - "learning_rate": 3.29157061240854e-06, - "loss": 0.8652, + "epoch": 1.05, + "grad_norm": 10.52627944946289, + "learning_rate": 1.3025980002510145e-05, + "loss": 0.4909, "step": 8336 }, { - "epoch": 2.51, - "grad_norm": 25.941999435424805, - "learning_rate": 3.28956600180415e-06, - "loss": 0.8842, + "epoch": 1.05, + "grad_norm": 8.907776832580566, + "learning_rate": 1.3025143287453459e-05, + "loss": 0.4805, "step": 8337 }, { - "epoch": 2.51, - "grad_norm": 9.563976287841797, - "learning_rate": 3.2875613911997595e-06, - "loss": 1.6659, + "epoch": 1.05, + "grad_norm": 14.624920845031738, + "learning_rate": 1.3024306572396773e-05, + "loss": 0.9164, "step": 8338 }, { - "epoch": 2.51, - "grad_norm": 46.76715087890625, - "learning_rate": 3.2855567805953696e-06, - "loss": 1.8769, + "epoch": 1.05, + "grad_norm": 9.58926010131836, + "learning_rate": 1.3023469857340083e-05, + "loss": 1.1276, "step": 8339 }, { - "epoch": 2.51, - "grad_norm": 20.058713912963867, - "learning_rate": 3.2835521699909797e-06, - "loss": 1.3938, + "epoch": 1.05, + "grad_norm": 2.0990917682647705, + "learning_rate": 1.3022633142283397e-05, + "loss": 0.0459, "step": 8340 }, { - "epoch": 2.51, - "grad_norm": 41.1780891418457, - "learning_rate": 3.2815475593865893e-06, - "loss": 1.9891, + "epoch": 1.05, + "grad_norm": 13.683082580566406, + "learning_rate": 1.3021796427226709e-05, + "loss": 1.0213, "step": 8341 }, { - "epoch": 2.51, - "grad_norm": 14.344354629516602, - "learning_rate": 3.2795429487821994e-06, - "loss": 1.4836, + "epoch": 1.05, + "grad_norm": 5.27555513381958, + "learning_rate": 1.3020959712170022e-05, + "loss": 1.4565, "step": 8342 }, { - "epoch": 2.51, - "grad_norm": 12.271919250488281, - "learning_rate": 3.277538338177809e-06, - "loss": 1.1799, + "epoch": 1.05, + "grad_norm": 9.36648178100586, + "learning_rate": 1.3020122997113334e-05, + "loss": 1.678, "step": 8343 }, { - "epoch": 2.51, - "grad_norm": 10.539395332336426, - "learning_rate": 3.2755337275734187e-06, - "loss": 1.0998, + "epoch": 1.05, + "grad_norm": 15.057022094726562, + "learning_rate": 1.3019286282056646e-05, + "loss": 0.7218, "step": 8344 }, { - "epoch": 2.51, - "grad_norm": 18.392601013183594, - "learning_rate": 3.2735291169690292e-06, - "loss": 1.2702, + "epoch": 1.05, + "grad_norm": 6.689286708831787, + "learning_rate": 1.301844956699996e-05, + "loss": 0.9842, "step": 8345 }, { - "epoch": 2.51, - "grad_norm": 9.823795318603516, - "learning_rate": 3.271524506364639e-06, - "loss": 1.4675, + "epoch": 1.05, + "grad_norm": 4.9068708419799805, + "learning_rate": 1.3017612851943272e-05, + "loss": 0.3076, "step": 8346 }, { - "epoch": 2.51, - "grad_norm": 43.700782775878906, - "learning_rate": 3.2695198957602486e-06, - "loss": 2.5785, + "epoch": 1.05, + "grad_norm": 22.96696662902832, + "learning_rate": 1.3016776136886584e-05, + "loss": 1.6388, "step": 8347 }, { - "epoch": 2.51, - "grad_norm": 18.977764129638672, - "learning_rate": 3.267515285155859e-06, - "loss": 2.1341, + "epoch": 1.05, + "grad_norm": 12.055654525756836, + "learning_rate": 1.3015939421829897e-05, + "loss": 3.3452, "step": 8348 }, { - "epoch": 2.51, - "grad_norm": 11.805031776428223, - "learning_rate": 3.2655106745514687e-06, - "loss": 1.0303, + "epoch": 1.05, + "grad_norm": 21.18698501586914, + "learning_rate": 1.3015102706773211e-05, + "loss": 2.8709, "step": 8349 }, { - "epoch": 2.51, - "grad_norm": 35.22747802734375, - "learning_rate": 3.2635060639470784e-06, - "loss": 1.0836, + "epoch": 1.05, + "grad_norm": 12.760493278503418, + "learning_rate": 1.3014265991716521e-05, + "loss": 0.861, "step": 8350 }, { - "epoch": 2.51, - "grad_norm": 48.45168685913086, - "learning_rate": 3.2615014533426885e-06, - "loss": 1.3489, + "epoch": 1.05, + "grad_norm": 22.128196716308594, + "learning_rate": 1.3013429276659835e-05, + "loss": 2.5848, "step": 8351 }, { - "epoch": 2.51, - "grad_norm": 22.869895935058594, - "learning_rate": 3.2594968427382986e-06, - "loss": 2.6811, + "epoch": 1.05, + "grad_norm": 91.83653259277344, + "learning_rate": 1.3012592561603149e-05, + "loss": 2.6359, "step": 8352 }, { - "epoch": 2.51, - "grad_norm": 18.353343963623047, - "learning_rate": 3.257492232133908e-06, - "loss": 0.976, + "epoch": 1.05, + "grad_norm": 38.465980529785156, + "learning_rate": 1.3011755846546459e-05, + "loss": 0.6478, "step": 8353 }, { - "epoch": 2.51, - "grad_norm": 29.93402862548828, - "learning_rate": 3.2554876215295183e-06, - "loss": 1.1663, + "epoch": 1.05, + "grad_norm": 11.067882537841797, + "learning_rate": 1.3010919131489773e-05, + "loss": 0.8612, "step": 8354 }, { - "epoch": 2.51, - "grad_norm": 13.383641242980957, - "learning_rate": 3.253483010925128e-06, - "loss": 0.7903, + "epoch": 1.05, + "grad_norm": 6.735037326812744, + "learning_rate": 1.3010082416433084e-05, + "loss": 0.5627, "step": 8355 }, { - "epoch": 2.51, - "grad_norm": 6.8472747802734375, - "learning_rate": 3.2514784003207376e-06, - "loss": 1.1989, + "epoch": 1.05, + "grad_norm": 12.847387313842773, + "learning_rate": 1.3009245701376398e-05, + "loss": 1.3777, "step": 8356 }, { - "epoch": 2.51, - "grad_norm": 32.73374938964844, - "learning_rate": 3.249473789716348e-06, - "loss": 0.9284, + "epoch": 1.05, + "grad_norm": 19.365373611450195, + "learning_rate": 1.300840898631971e-05, + "loss": 1.3549, "step": 8357 }, { - "epoch": 2.51, - "grad_norm": 11.246085166931152, - "learning_rate": 3.2474691791119578e-06, - "loss": 1.5323, + "epoch": 1.05, + "grad_norm": 43.83978271484375, + "learning_rate": 1.3007572271263022e-05, + "loss": 2.663, "step": 8358 }, { - "epoch": 2.51, - "grad_norm": 8.259552955627441, - "learning_rate": 3.2454645685075674e-06, - "loss": 0.8713, + "epoch": 1.05, + "grad_norm": 7.216566562652588, + "learning_rate": 1.3006735556206336e-05, + "loss": 0.6891, "step": 8359 }, { - "epoch": 2.51, - "grad_norm": 28.066495895385742, - "learning_rate": 3.243459957903178e-06, - "loss": 1.1289, + "epoch": 1.05, + "grad_norm": 12.197381019592285, + "learning_rate": 1.3005898841149646e-05, + "loss": 1.2583, "step": 8360 }, { - "epoch": 2.51, - "grad_norm": 16.770734786987305, - "learning_rate": 3.2414553472987876e-06, - "loss": 1.1323, + "epoch": 1.05, + "grad_norm": 11.418231964111328, + "learning_rate": 1.300506212609296e-05, + "loss": 0.7901, "step": 8361 }, { - "epoch": 2.51, - "grad_norm": 10.900749206542969, - "learning_rate": 3.2394507366943973e-06, - "loss": 0.7672, + "epoch": 1.05, + "grad_norm": 28.907398223876953, + "learning_rate": 1.3004225411036273e-05, + "loss": 1.1503, "step": 8362 }, { - "epoch": 2.51, - "grad_norm": 21.28752326965332, - "learning_rate": 3.2374461260900074e-06, - "loss": 1.4755, + "epoch": 1.05, + "grad_norm": 7.6004638671875, + "learning_rate": 1.3003388695979587e-05, + "loss": 2.7435, "step": 8363 }, { - "epoch": 2.51, - "grad_norm": 15.661283493041992, - "learning_rate": 3.2354415154856174e-06, - "loss": 1.3512, + "epoch": 1.05, + "grad_norm": 6.2184906005859375, + "learning_rate": 1.3002551980922897e-05, + "loss": 0.4766, "step": 8364 }, { - "epoch": 2.52, - "grad_norm": 16.886972427368164, - "learning_rate": 3.233436904881227e-06, - "loss": 1.4028, + "epoch": 1.05, + "grad_norm": 20.67194938659668, + "learning_rate": 1.300171526586621e-05, + "loss": 1.1035, "step": 8365 }, { - "epoch": 2.52, - "grad_norm": 17.953754425048828, - "learning_rate": 3.231432294276837e-06, - "loss": 1.2938, + "epoch": 1.05, + "grad_norm": 17.56820297241211, + "learning_rate": 1.3000878550809524e-05, + "loss": 1.4941, "step": 8366 }, { - "epoch": 2.52, - "grad_norm": 9.600143432617188, - "learning_rate": 3.229427683672447e-06, - "loss": 0.7052, + "epoch": 1.05, + "grad_norm": 9.857599258422852, + "learning_rate": 1.3000041835752835e-05, + "loss": 1.516, "step": 8367 }, { - "epoch": 2.52, - "grad_norm": 19.288686752319336, - "learning_rate": 3.2274230730680565e-06, - "loss": 1.588, + "epoch": 1.05, + "grad_norm": 15.678145408630371, + "learning_rate": 1.2999205120696148e-05, + "loss": 2.4484, "step": 8368 }, { - "epoch": 2.52, - "grad_norm": 12.002143859863281, - "learning_rate": 3.225418462463667e-06, - "loss": 0.5881, + "epoch": 1.05, + "grad_norm": 26.623945236206055, + "learning_rate": 1.299836840563946e-05, + "loss": 2.4554, "step": 8369 }, { - "epoch": 2.52, - "grad_norm": 18.510684967041016, - "learning_rate": 3.2234138518592767e-06, - "loss": 1.543, + "epoch": 1.05, + "grad_norm": 30.240673065185547, + "learning_rate": 1.2997531690582774e-05, + "loss": 1.1681, "step": 8370 }, { - "epoch": 2.52, - "grad_norm": 31.459373474121094, - "learning_rate": 3.2214092412548863e-06, - "loss": 1.0949, + "epoch": 1.05, + "grad_norm": 6.327847003936768, + "learning_rate": 1.2996694975526086e-05, + "loss": 1.967, "step": 8371 }, { - "epoch": 2.52, - "grad_norm": 7.364177703857422, - "learning_rate": 3.219404630650496e-06, - "loss": 0.7282, + "epoch": 1.05, + "grad_norm": 11.448563575744629, + "learning_rate": 1.2995858260469398e-05, + "loss": 1.3557, "step": 8372 }, { - "epoch": 2.52, - "grad_norm": 10.49648666381836, - "learning_rate": 3.2174000200461065e-06, - "loss": 0.6402, + "epoch": 1.05, + "grad_norm": 10.462690353393555, + "learning_rate": 1.2995021545412712e-05, + "loss": 0.842, "step": 8373 }, { - "epoch": 2.52, - "grad_norm": 25.187376022338867, - "learning_rate": 3.215395409441716e-06, - "loss": 1.4291, + "epoch": 1.05, + "grad_norm": 6.635849952697754, + "learning_rate": 1.2994184830356022e-05, + "loss": 2.6515, "step": 8374 }, { - "epoch": 2.52, - "grad_norm": 26.098617553710938, - "learning_rate": 3.213390798837326e-06, - "loss": 1.861, + "epoch": 1.05, + "grad_norm": 10.515811920166016, + "learning_rate": 1.2993348115299335e-05, + "loss": 1.0804, "step": 8375 }, { - "epoch": 2.52, - "grad_norm": 46.278839111328125, - "learning_rate": 3.2113861882329363e-06, - "loss": 1.8303, + "epoch": 1.05, + "grad_norm": 10.180510520935059, + "learning_rate": 1.2992511400242649e-05, + "loss": 0.8655, "step": 8376 }, { - "epoch": 2.52, - "grad_norm": 12.867042541503906, - "learning_rate": 3.209381577628546e-06, - "loss": 1.0696, + "epoch": 1.05, + "grad_norm": 12.36981201171875, + "learning_rate": 1.2991674685185963e-05, + "loss": 0.8491, "step": 8377 }, { - "epoch": 2.52, - "grad_norm": 20.583019256591797, - "learning_rate": 3.2073769670241557e-06, - "loss": 0.992, + "epoch": 1.05, + "grad_norm": 23.697532653808594, + "learning_rate": 1.2990837970129273e-05, + "loss": 1.1596, "step": 8378 }, { - "epoch": 2.52, - "grad_norm": 56.57286071777344, - "learning_rate": 3.2053723564197657e-06, - "loss": 1.8542, + "epoch": 1.05, + "grad_norm": 10.437012672424316, + "learning_rate": 1.2990001255072587e-05, + "loss": 1.5892, "step": 8379 }, { - "epoch": 2.52, - "grad_norm": 13.195068359375, - "learning_rate": 3.2033677458153754e-06, - "loss": 1.3004, + "epoch": 1.05, + "grad_norm": 40.67985534667969, + "learning_rate": 1.29891645400159e-05, + "loss": 0.9195, "step": 8380 }, { - "epoch": 2.52, - "grad_norm": 16.263944625854492, - "learning_rate": 3.2013631352109855e-06, - "loss": 1.7439, + "epoch": 1.05, + "grad_norm": 22.098230361938477, + "learning_rate": 1.298832782495921e-05, + "loss": 1.1321, "step": 8381 }, { - "epoch": 2.52, - "grad_norm": 11.452669143676758, - "learning_rate": 3.1993585246065956e-06, - "loss": 0.7889, + "epoch": 1.05, + "grad_norm": 8.74831771850586, + "learning_rate": 1.2987491109902524e-05, + "loss": 0.8672, "step": 8382 }, { - "epoch": 2.52, - "grad_norm": 17.5844669342041, - "learning_rate": 3.1973539140022052e-06, - "loss": 1.2866, + "epoch": 1.05, + "grad_norm": 10.82736587524414, + "learning_rate": 1.2986654394845836e-05, + "loss": 0.5641, "step": 8383 }, { - "epoch": 2.52, - "grad_norm": 12.583413124084473, - "learning_rate": 3.195349303397815e-06, - "loss": 0.9804, + "epoch": 1.05, + "grad_norm": 11.020607948303223, + "learning_rate": 1.298581767978915e-05, + "loss": 1.3105, "step": 8384 }, { - "epoch": 2.52, - "grad_norm": 13.724156379699707, - "learning_rate": 3.1933446927934254e-06, - "loss": 0.7936, + "epoch": 1.05, + "grad_norm": 15.373324394226074, + "learning_rate": 1.2984980964732462e-05, + "loss": 2.5396, "step": 8385 }, { - "epoch": 2.52, - "grad_norm": 11.498400688171387, - "learning_rate": 3.191340082189035e-06, - "loss": 1.1136, + "epoch": 1.05, + "grad_norm": 5.774839878082275, + "learning_rate": 1.2984144249675774e-05, + "loss": 0.4541, "step": 8386 }, { - "epoch": 2.52, - "grad_norm": 41.235206604003906, - "learning_rate": 3.1893354715846447e-06, - "loss": 1.1562, + "epoch": 1.05, + "grad_norm": 9.357996940612793, + "learning_rate": 1.2983307534619087e-05, + "loss": 1.65, "step": 8387 }, { - "epoch": 2.52, - "grad_norm": 26.88475799560547, - "learning_rate": 3.187330860980255e-06, - "loss": 2.1271, + "epoch": 1.05, + "grad_norm": 27.872713088989258, + "learning_rate": 1.2982470819562398e-05, + "loss": 3.2732, "step": 8388 }, { - "epoch": 2.52, - "grad_norm": 16.73887825012207, - "learning_rate": 3.185326250375865e-06, - "loss": 1.5196, + "epoch": 1.05, + "grad_norm": 3.760759115219116, + "learning_rate": 1.2981634104505711e-05, + "loss": 0.2349, "step": 8389 }, { - "epoch": 2.52, - "grad_norm": 20.1945858001709, - "learning_rate": 3.1833216397714746e-06, - "loss": 0.9805, + "epoch": 1.05, + "grad_norm": 15.883977890014648, + "learning_rate": 1.2980797389449025e-05, + "loss": 1.1503, "step": 8390 }, { - "epoch": 2.52, - "grad_norm": 34.976627349853516, - "learning_rate": 3.1813170291670846e-06, - "loss": 1.1922, + "epoch": 1.05, + "grad_norm": 4.68303918838501, + "learning_rate": 1.2979960674392335e-05, + "loss": 0.3075, "step": 8391 }, { - "epoch": 2.52, - "grad_norm": 36.179351806640625, - "learning_rate": 3.1793124185626943e-06, - "loss": 4.2282, + "epoch": 1.05, + "grad_norm": 23.391752243041992, + "learning_rate": 1.2979123959335649e-05, + "loss": 1.0449, "step": 8392 }, { - "epoch": 2.52, - "grad_norm": 9.375896453857422, - "learning_rate": 3.1773078079583044e-06, - "loss": 0.9379, + "epoch": 1.05, + "grad_norm": 20.98833465576172, + "learning_rate": 1.2978287244278962e-05, + "loss": 1.4492, "step": 8393 }, { - "epoch": 2.52, - "grad_norm": 132.8348846435547, - "learning_rate": 3.1753031973539145e-06, - "loss": 3.0371, + "epoch": 1.05, + "grad_norm": 14.180381774902344, + "learning_rate": 1.2977450529222274e-05, + "loss": 1.2787, "step": 8394 }, { - "epoch": 2.52, - "grad_norm": 30.87514877319336, - "learning_rate": 3.173298586749524e-06, - "loss": 2.5639, + "epoch": 1.05, + "grad_norm": 13.713095664978027, + "learning_rate": 1.2976613814165586e-05, + "loss": 0.6493, "step": 8395 }, { - "epoch": 2.52, - "grad_norm": 12.229470252990723, - "learning_rate": 3.171293976145134e-06, - "loss": 0.9845, + "epoch": 1.05, + "grad_norm": 35.20022201538086, + "learning_rate": 1.29757770991089e-05, + "loss": 2.4457, "step": 8396 }, { - "epoch": 2.52, - "grad_norm": 26.513559341430664, - "learning_rate": 3.1692893655407443e-06, - "loss": 1.1088, + "epoch": 1.05, + "grad_norm": 13.833316802978516, + "learning_rate": 1.2974940384052212e-05, + "loss": 1.1561, "step": 8397 }, { - "epoch": 2.52, - "grad_norm": 7.9161553382873535, - "learning_rate": 3.167284754936354e-06, - "loss": 0.8013, + "epoch": 1.05, + "grad_norm": 75.72393035888672, + "learning_rate": 1.2974103668995524e-05, + "loss": 2.1973, "step": 8398 }, { - "epoch": 2.53, - "grad_norm": 12.535621643066406, - "learning_rate": 3.1652801443319636e-06, - "loss": 1.0497, + "epoch": 1.05, + "grad_norm": 37.64180374145508, + "learning_rate": 1.2973266953938836e-05, + "loss": 1.6479, "step": 8399 }, { - "epoch": 2.53, - "grad_norm": 9.409601211547852, - "learning_rate": 3.1632755337275737e-06, - "loss": 1.132, + "epoch": 1.05, + "grad_norm": 10.788687705993652, + "learning_rate": 1.297243023888215e-05, + "loss": 0.7525, "step": 8400 }, { - "epoch": 2.53, - "eval_loss": 0.17680850625038147, - "eval_runtime": 44.0695, - "eval_samples_per_second": 33.561, - "eval_steps_per_second": 33.561, + "epoch": 1.05, + "eval_loss": 0.09782509505748749, + "eval_runtime": 95.178, + "eval_samples_per_second": 37.214, + "eval_steps_per_second": 37.214, "step": 8400 }, { - "epoch": 2.53, - "grad_norm": 38.185997009277344, - "learning_rate": 3.1612709231231838e-06, - "loss": 1.8298, + "epoch": 1.05, + "grad_norm": 2.3076012134552, + "learning_rate": 1.2971593523825463e-05, + "loss": 0.0305, "step": 8401 }, { - "epoch": 2.53, - "grad_norm": 103.94436645507812, - "learning_rate": 3.1592663125187934e-06, - "loss": 0.8306, + "epoch": 1.05, + "grad_norm": 17.84105682373047, + "learning_rate": 1.2970756808768773e-05, + "loss": 2.1674, "step": 8402 }, { - "epoch": 2.53, - "grad_norm": 105.07266998291016, - "learning_rate": 3.1572617019144035e-06, - "loss": 1.827, + "epoch": 1.05, + "grad_norm": 14.507989883422852, + "learning_rate": 1.2969920093712087e-05, + "loss": 1.7957, "step": 8403 }, { - "epoch": 2.53, - "grad_norm": 19.526813507080078, - "learning_rate": 3.155257091310013e-06, - "loss": 1.4366, + "epoch": 1.05, + "grad_norm": 34.563480377197266, + "learning_rate": 1.29690833786554e-05, + "loss": 2.2601, "step": 8404 }, { - "epoch": 2.53, - "grad_norm": 10.135507583618164, - "learning_rate": 3.153252480705623e-06, - "loss": 0.6992, + "epoch": 1.05, + "grad_norm": 16.744068145751953, + "learning_rate": 1.2968246663598711e-05, + "loss": 2.0247, "step": 8405 }, { - "epoch": 2.53, - "grad_norm": 35.69138717651367, - "learning_rate": 3.1512478701012334e-06, - "loss": 1.5329, + "epoch": 1.05, + "grad_norm": 14.041572570800781, + "learning_rate": 1.2967409948542025e-05, + "loss": 1.5232, "step": 8406 }, { - "epoch": 2.53, - "grad_norm": 74.143798828125, - "learning_rate": 3.149243259496843e-06, - "loss": 0.9668, + "epoch": 1.06, + "grad_norm": 19.138408660888672, + "learning_rate": 1.2966573233485338e-05, + "loss": 0.5881, "step": 8407 }, { - "epoch": 2.53, - "grad_norm": 15.071893692016602, - "learning_rate": 3.1472386488924527e-06, - "loss": 1.1024, + "epoch": 1.06, + "grad_norm": 44.88977813720703, + "learning_rate": 1.296573651842865e-05, + "loss": 1.8347, "step": 8408 }, { - "epoch": 2.53, - "grad_norm": 11.287890434265137, - "learning_rate": 3.145234038288063e-06, - "loss": 1.2887, + "epoch": 1.06, + "grad_norm": 22.761062622070312, + "learning_rate": 1.2964899803371962e-05, + "loss": 1.487, "step": 8409 }, { - "epoch": 2.53, - "grad_norm": 63.98174285888672, - "learning_rate": 3.143229427683673e-06, - "loss": 1.8663, + "epoch": 1.06, + "grad_norm": 8.038599014282227, + "learning_rate": 1.2964063088315276e-05, + "loss": 0.7646, "step": 8410 }, { - "epoch": 2.53, - "grad_norm": 15.564671516418457, - "learning_rate": 3.1412248170792825e-06, - "loss": 1.1972, + "epoch": 1.06, + "grad_norm": 100.69731903076172, + "learning_rate": 1.2963226373258588e-05, + "loss": 1.1985, "step": 8411 }, { - "epoch": 2.53, - "grad_norm": 26.284500122070312, - "learning_rate": 3.1392202064748926e-06, - "loss": 1.1945, + "epoch": 1.06, + "grad_norm": 22.61496925354004, + "learning_rate": 1.29623896582019e-05, + "loss": 2.8117, "step": 8412 }, { - "epoch": 2.53, - "grad_norm": 81.35211181640625, - "learning_rate": 3.1372155958705027e-06, - "loss": 1.7933, + "epoch": 1.06, + "grad_norm": 4.329559326171875, + "learning_rate": 1.2961552943145212e-05, + "loss": 0.4211, "step": 8413 }, { - "epoch": 2.53, - "grad_norm": 20.28238296508789, - "learning_rate": 3.1352109852661123e-06, - "loss": 1.6545, + "epoch": 1.06, + "grad_norm": 19.117862701416016, + "learning_rate": 1.2960716228088525e-05, + "loss": 2.1339, "step": 8414 }, { - "epoch": 2.53, - "grad_norm": 11.033011436462402, - "learning_rate": 3.133206374661722e-06, - "loss": 0.8211, + "epoch": 1.06, + "grad_norm": 11.0752534866333, + "learning_rate": 1.2959879513031839e-05, + "loss": 1.2909, "step": 8415 }, { - "epoch": 2.53, - "grad_norm": 17.557846069335938, - "learning_rate": 3.131201764057332e-06, - "loss": 1.2009, + "epoch": 1.06, + "grad_norm": 12.766861915588379, + "learning_rate": 1.295904279797515e-05, + "loss": 1.6993, "step": 8416 }, { - "epoch": 2.53, - "grad_norm": 17.877410888671875, - "learning_rate": 3.1291971534529417e-06, - "loss": 1.1617, + "epoch": 1.06, + "grad_norm": 15.978042602539062, + "learning_rate": 1.2958206082918463e-05, + "loss": 0.2342, "step": 8417 }, { - "epoch": 2.53, - "grad_norm": 17.008520126342773, - "learning_rate": 3.127192542848552e-06, - "loss": 1.2213, + "epoch": 1.06, + "grad_norm": 8.060977935791016, + "learning_rate": 1.2957369367861777e-05, + "loss": 0.2633, "step": 8418 }, { - "epoch": 2.53, - "grad_norm": 13.909770965576172, - "learning_rate": 3.125187932244162e-06, - "loss": 0.9544, + "epoch": 1.06, + "grad_norm": 5.748692035675049, + "learning_rate": 1.2956532652805087e-05, + "loss": 1.5113, "step": 8419 }, { - "epoch": 2.53, - "grad_norm": 12.314075469970703, - "learning_rate": 3.1231833216397716e-06, - "loss": 1.1267, + "epoch": 1.06, + "grad_norm": 194.9235076904297, + "learning_rate": 1.29556959377484e-05, + "loss": 1.4548, "step": 8420 }, { - "epoch": 2.53, - "grad_norm": 8.14324951171875, - "learning_rate": 3.1211787110353812e-06, - "loss": 1.052, + "epoch": 1.06, + "grad_norm": 26.03392219543457, + "learning_rate": 1.2954859222691714e-05, + "loss": 2.4875, "step": 8421 }, { - "epoch": 2.53, - "grad_norm": 9.060734748840332, - "learning_rate": 3.1191741004309917e-06, - "loss": 0.7107, + "epoch": 1.06, + "grad_norm": 13.564663887023926, + "learning_rate": 1.2954022507635026e-05, + "loss": 1.1039, "step": 8422 }, { - "epoch": 2.53, - "grad_norm": 12.2525634765625, - "learning_rate": 3.1171694898266014e-06, - "loss": 0.9515, + "epoch": 1.06, + "grad_norm": 13.226625442504883, + "learning_rate": 1.2953185792578338e-05, + "loss": 2.0588, "step": 8423 }, { - "epoch": 2.53, - "grad_norm": 18.487070083618164, - "learning_rate": 3.115164879222211e-06, - "loss": 0.9164, + "epoch": 1.06, + "grad_norm": 17.731657028198242, + "learning_rate": 1.2952349077521652e-05, + "loss": 2.594, "step": 8424 }, { - "epoch": 2.53, - "grad_norm": 21.11574363708496, - "learning_rate": 3.1131602686178216e-06, - "loss": 1.1415, + "epoch": 1.06, + "grad_norm": 32.340492248535156, + "learning_rate": 1.2951512362464964e-05, + "loss": 2.1051, "step": 8425 }, { - "epoch": 2.53, - "grad_norm": 10.710432052612305, - "learning_rate": 3.1111556580134312e-06, - "loss": 0.6223, + "epoch": 1.06, + "grad_norm": 4.714109897613525, + "learning_rate": 1.2950675647408276e-05, + "loss": 0.3578, "step": 8426 }, { - "epoch": 2.53, - "grad_norm": 12.434354782104492, - "learning_rate": 3.109151047409041e-06, - "loss": 0.7547, + "epoch": 1.06, + "grad_norm": 19.667007446289062, + "learning_rate": 1.2949838932351588e-05, + "loss": 1.5086, "step": 8427 }, { - "epoch": 2.53, - "grad_norm": 5.420670032501221, - "learning_rate": 3.107146436804651e-06, - "loss": 0.913, + "epoch": 1.06, + "grad_norm": 22.923402786254883, + "learning_rate": 1.2949002217294901e-05, + "loss": 2.1589, "step": 8428 }, { - "epoch": 2.53, - "grad_norm": 23.501205444335938, - "learning_rate": 3.1051418262002606e-06, - "loss": 1.7807, + "epoch": 1.06, + "grad_norm": 9.848310470581055, + "learning_rate": 1.2948165502238215e-05, + "loss": 0.8451, "step": 8429 }, { - "epoch": 2.53, - "grad_norm": 17.01691436767578, - "learning_rate": 3.1031372155958707e-06, - "loss": 1.2807, + "epoch": 1.06, + "grad_norm": 16.107831954956055, + "learning_rate": 1.2947328787181525e-05, + "loss": 0.7639, "step": 8430 }, { - "epoch": 2.53, - "grad_norm": 15.571081161499023, - "learning_rate": 3.101132604991481e-06, - "loss": 1.397, + "epoch": 1.06, + "grad_norm": 33.23837661743164, + "learning_rate": 1.2946492072124839e-05, + "loss": 2.7027, "step": 8431 }, { - "epoch": 2.54, - "grad_norm": 32.079612731933594, - "learning_rate": 3.0991279943870905e-06, - "loss": 1.4421, + "epoch": 1.06, + "grad_norm": 15.720340728759766, + "learning_rate": 1.2945655357068152e-05, + "loss": 1.4081, "step": 8432 }, { - "epoch": 2.54, - "grad_norm": 13.537861824035645, - "learning_rate": 3.0971233837827e-06, - "loss": 1.4324, + "epoch": 1.06, + "grad_norm": 19.901630401611328, + "learning_rate": 1.2944818642011463e-05, + "loss": 1.5531, "step": 8433 }, { - "epoch": 2.54, - "grad_norm": 72.99842834472656, - "learning_rate": 3.0951187731783106e-06, - "loss": 2.2397, + "epoch": 1.06, + "grad_norm": 7.82213830947876, + "learning_rate": 1.2943981926954776e-05, + "loss": 1.1551, "step": 8434 }, { - "epoch": 2.54, - "grad_norm": 32.24034118652344, - "learning_rate": 3.0931141625739203e-06, - "loss": 1.8425, + "epoch": 1.06, + "grad_norm": 6.144089698791504, + "learning_rate": 1.294314521189809e-05, + "loss": 0.5246, "step": 8435 }, { - "epoch": 2.54, - "grad_norm": 14.41550350189209, - "learning_rate": 3.09110955196953e-06, - "loss": 1.0483, + "epoch": 1.06, + "grad_norm": 8.631068229675293, + "learning_rate": 1.2942308496841402e-05, + "loss": 1.9636, "step": 8436 }, { - "epoch": 2.54, - "grad_norm": 47.05146789550781, - "learning_rate": 3.0891049413651405e-06, - "loss": 1.3004, + "epoch": 1.06, + "grad_norm": 14.672770500183105, + "learning_rate": 1.2941471781784714e-05, + "loss": 1.9898, "step": 8437 }, { - "epoch": 2.54, - "grad_norm": 25.591800689697266, - "learning_rate": 3.08710033076075e-06, - "loss": 1.1771, + "epoch": 1.06, + "grad_norm": 19.22916030883789, + "learning_rate": 1.2940635066728028e-05, + "loss": 2.5212, "step": 8438 }, { - "epoch": 2.54, - "grad_norm": 9.942156791687012, - "learning_rate": 3.08509572015636e-06, - "loss": 0.5432, + "epoch": 1.06, + "grad_norm": 12.619750022888184, + "learning_rate": 1.293979835167134e-05, + "loss": 1.0048, "step": 8439 }, { - "epoch": 2.54, - "grad_norm": 22.738941192626953, - "learning_rate": 3.08309110955197e-06, - "loss": 2.3535, + "epoch": 1.06, + "grad_norm": 10.999564170837402, + "learning_rate": 1.2938961636614651e-05, + "loss": 2.1711, "step": 8440 }, { - "epoch": 2.54, - "grad_norm": 14.746498107910156, - "learning_rate": 3.0810864989475795e-06, - "loss": 1.2366, + "epoch": 1.06, + "grad_norm": 6.046539306640625, + "learning_rate": 1.2938124921557963e-05, + "loss": 0.4652, "step": 8441 }, { - "epoch": 2.54, - "grad_norm": 98.85680389404297, - "learning_rate": 3.0790818883431896e-06, - "loss": 2.4007, + "epoch": 1.06, + "grad_norm": 31.383182525634766, + "learning_rate": 1.2937288206501277e-05, + "loss": 0.6297, "step": 8442 }, { - "epoch": 2.54, - "grad_norm": 23.468538284301758, - "learning_rate": 3.0770772777387997e-06, - "loss": 1.0986, + "epoch": 1.06, + "grad_norm": 13.119832992553711, + "learning_rate": 1.293645149144459e-05, + "loss": 2.0921, "step": 8443 }, { - "epoch": 2.54, - "grad_norm": 8.700865745544434, - "learning_rate": 3.0750726671344094e-06, - "loss": 1.1621, + "epoch": 1.06, + "grad_norm": 6.461184978485107, + "learning_rate": 1.2935614776387901e-05, + "loss": 0.593, "step": 8444 }, { - "epoch": 2.54, - "grad_norm": 22.677888870239258, - "learning_rate": 3.073068056530019e-06, - "loss": 1.2522, + "epoch": 1.06, + "grad_norm": 14.403892517089844, + "learning_rate": 1.2934778061331215e-05, + "loss": 1.0602, "step": 8445 }, { - "epoch": 2.54, - "grad_norm": 21.384296417236328, - "learning_rate": 3.0710634459256295e-06, - "loss": 1.6951, + "epoch": 1.06, + "grad_norm": 24.468730926513672, + "learning_rate": 1.2933941346274528e-05, + "loss": 1.0107, "step": 8446 }, { - "epoch": 2.54, - "grad_norm": 30.282161712646484, - "learning_rate": 3.069058835321239e-06, - "loss": 1.8486, + "epoch": 1.06, + "grad_norm": 60.255062103271484, + "learning_rate": 1.2933104631217839e-05, + "loss": 3.9681, "step": 8447 }, { - "epoch": 2.54, - "grad_norm": 11.835555076599121, - "learning_rate": 3.067054224716849e-06, - "loss": 1.0966, + "epoch": 1.06, + "grad_norm": 20.134521484375, + "learning_rate": 1.2932267916161152e-05, + "loss": 0.6754, "step": 8448 }, { - "epoch": 2.54, - "grad_norm": 16.40022850036621, - "learning_rate": 3.0650496141124594e-06, - "loss": 1.55, + "epoch": 1.06, + "grad_norm": 19.462905883789062, + "learning_rate": 1.2931431201104466e-05, + "loss": 1.1152, "step": 8449 }, { - "epoch": 2.54, - "grad_norm": 34.12657928466797, - "learning_rate": 3.063045003508069e-06, - "loss": 1.6477, + "epoch": 1.06, + "grad_norm": 9.095423698425293, + "learning_rate": 1.2930594486047778e-05, + "loss": 2.4232, "step": 8450 }, { - "epoch": 2.54, - "grad_norm": 43.224571228027344, - "learning_rate": 3.0610403929036787e-06, - "loss": 1.4323, + "epoch": 1.06, + "grad_norm": 26.529722213745117, + "learning_rate": 1.292975777099109e-05, + "loss": 1.9776, "step": 8451 }, { - "epoch": 2.54, - "grad_norm": 21.87476921081543, - "learning_rate": 3.0590357822992888e-06, - "loss": 1.117, + "epoch": 1.06, + "grad_norm": 9.931761741638184, + "learning_rate": 1.2928921055934402e-05, + "loss": 1.3352, "step": 8452 }, { - "epoch": 2.54, - "grad_norm": 12.986775398254395, - "learning_rate": 3.0570311716948984e-06, - "loss": 0.865, + "epoch": 1.06, + "grad_norm": 11.543974876403809, + "learning_rate": 1.2928084340877715e-05, + "loss": 1.5399, "step": 8453 }, { - "epoch": 2.54, - "grad_norm": 12.08977222442627, - "learning_rate": 3.0550265610905085e-06, - "loss": 0.7838, + "epoch": 1.06, + "grad_norm": 5.663609504699707, + "learning_rate": 1.2927247625821027e-05, + "loss": 0.2769, "step": 8454 }, { - "epoch": 2.54, - "grad_norm": 50.07052230834961, - "learning_rate": 3.053021950486118e-06, - "loss": 1.4847, + "epoch": 1.06, + "grad_norm": 17.997838973999023, + "learning_rate": 1.292641091076434e-05, + "loss": 1.8294, "step": 8455 }, { - "epoch": 2.54, - "grad_norm": 7.173743724822998, - "learning_rate": 3.0510173398817283e-06, - "loss": 0.6167, + "epoch": 1.06, + "grad_norm": 5.947362899780273, + "learning_rate": 1.2925574195707653e-05, + "loss": 1.9695, "step": 8456 }, { - "epoch": 2.54, - "grad_norm": 38.1131591796875, - "learning_rate": 3.049012729277338e-06, - "loss": 2.517, + "epoch": 1.06, + "grad_norm": 32.4327392578125, + "learning_rate": 1.2924737480650967e-05, + "loss": 2.0827, "step": 8457 }, { - "epoch": 2.54, - "grad_norm": 9.692411422729492, - "learning_rate": 3.0470081186729476e-06, - "loss": 1.0573, + "epoch": 1.06, + "grad_norm": 10.101252555847168, + "learning_rate": 1.2923900765594277e-05, + "loss": 1.8666, "step": 8458 }, { - "epoch": 2.54, - "grad_norm": 41.6327018737793, - "learning_rate": 3.045003508068558e-06, - "loss": 1.4349, + "epoch": 1.06, + "grad_norm": 22.610300064086914, + "learning_rate": 1.292306405053759e-05, + "loss": 1.9778, "step": 8459 }, { - "epoch": 2.54, - "grad_norm": 29.187610626220703, - "learning_rate": 3.0429988974641677e-06, - "loss": 1.5777, + "epoch": 1.06, + "grad_norm": 3.201634168624878, + "learning_rate": 1.2922227335480904e-05, + "loss": 0.132, "step": 8460 }, { - "epoch": 2.54, - "grad_norm": 29.35192108154297, - "learning_rate": 3.0409942868597774e-06, - "loss": 1.8016, + "epoch": 1.06, + "grad_norm": 6.934627056121826, + "learning_rate": 1.2921390620424214e-05, + "loss": 1.0572, "step": 8461 }, { - "epoch": 2.54, - "grad_norm": 20.445228576660156, - "learning_rate": 3.038989676255388e-06, - "loss": 0.7974, + "epoch": 1.06, + "grad_norm": 23.772220611572266, + "learning_rate": 1.2920553905367528e-05, + "loss": 1.8739, "step": 8462 }, { - "epoch": 2.54, - "grad_norm": 18.133628845214844, - "learning_rate": 3.0369850656509976e-06, - "loss": 1.1352, + "epoch": 1.06, + "grad_norm": 11.809895515441895, + "learning_rate": 1.2919717190310842e-05, + "loss": 1.0289, "step": 8463 }, { - "epoch": 2.54, - "grad_norm": 16.889158248901367, - "learning_rate": 3.0349804550466072e-06, - "loss": 1.4672, + "epoch": 1.06, + "grad_norm": 27.51059341430664, + "learning_rate": 1.2918880475254154e-05, + "loss": 2.3416, "step": 8464 }, { - "epoch": 2.55, - "grad_norm": 20.985919952392578, - "learning_rate": 3.0329758444422173e-06, - "loss": 1.192, + "epoch": 1.06, + "grad_norm": 56.7679328918457, + "learning_rate": 1.2918043760197466e-05, + "loss": 1.933, "step": 8465 }, { - "epoch": 2.55, - "grad_norm": 9.734439849853516, - "learning_rate": 3.0309712338378274e-06, - "loss": 0.7422, + "epoch": 1.06, + "grad_norm": 25.842777252197266, + "learning_rate": 1.2917207045140778e-05, + "loss": 1.3563, "step": 8466 }, { - "epoch": 2.55, - "grad_norm": 23.414813995361328, - "learning_rate": 3.028966623233437e-06, - "loss": 1.8114, + "epoch": 1.06, + "grad_norm": 16.23643684387207, + "learning_rate": 1.2916370330084091e-05, + "loss": 1.3781, "step": 8467 }, { - "epoch": 2.55, - "grad_norm": 12.681184768676758, - "learning_rate": 3.026962012629047e-06, - "loss": 1.1328, + "epoch": 1.06, + "grad_norm": 8.489397048950195, + "learning_rate": 1.2915533615027403e-05, + "loss": 0.4774, "step": 8468 }, { - "epoch": 2.55, - "grad_norm": 23.120763778686523, - "learning_rate": 3.024957402024657e-06, - "loss": 1.1875, + "epoch": 1.06, + "grad_norm": 98.74266052246094, + "learning_rate": 1.2914696899970715e-05, + "loss": 3.1134, "step": 8469 }, { - "epoch": 2.55, - "grad_norm": 19.008502960205078, - "learning_rate": 3.0229527914202665e-06, - "loss": 1.2166, + "epoch": 1.06, + "grad_norm": 16.9528865814209, + "learning_rate": 1.2913860184914029e-05, + "loss": 1.0636, "step": 8470 }, { - "epoch": 2.55, - "grad_norm": 22.20121192932129, - "learning_rate": 3.020948180815877e-06, - "loss": 1.4729, + "epoch": 1.06, + "grad_norm": 11.784737586975098, + "learning_rate": 1.2913023469857342e-05, + "loss": 0.9287, "step": 8471 }, { - "epoch": 2.55, - "grad_norm": 16.53184700012207, - "learning_rate": 3.0189435702114866e-06, - "loss": 1.8034, + "epoch": 1.06, + "grad_norm": 31.83979034423828, + "learning_rate": 1.2912186754800653e-05, + "loss": 1.2546, "step": 8472 }, { - "epoch": 2.55, - "grad_norm": 21.981178283691406, - "learning_rate": 3.0169389596070963e-06, - "loss": 2.2006, + "epoch": 1.06, + "grad_norm": 20.695783615112305, + "learning_rate": 1.2911350039743966e-05, + "loss": 0.9959, "step": 8473 }, { - "epoch": 2.55, - "grad_norm": 36.195106506347656, - "learning_rate": 3.014934349002707e-06, - "loss": 2.1031, + "epoch": 1.06, + "grad_norm": 32.69658279418945, + "learning_rate": 1.291051332468728e-05, + "loss": 2.0376, "step": 8474 }, { - "epoch": 2.55, - "grad_norm": 33.1042366027832, - "learning_rate": 3.0129297383983165e-06, - "loss": 1.5522, + "epoch": 1.06, + "grad_norm": 42.36573028564453, + "learning_rate": 1.290967660963059e-05, + "loss": 1.5448, "step": 8475 }, { - "epoch": 2.55, - "grad_norm": 14.721949577331543, - "learning_rate": 3.010925127793926e-06, - "loss": 1.5655, + "epoch": 1.06, + "grad_norm": 17.114213943481445, + "learning_rate": 1.2908839894573904e-05, + "loss": 1.0816, "step": 8476 }, { - "epoch": 2.55, - "grad_norm": 14.444605827331543, - "learning_rate": 3.0089205171895362e-06, - "loss": 1.2094, + "epoch": 1.06, + "grad_norm": 7.809509754180908, + "learning_rate": 1.2908003179517217e-05, + "loss": 1.6434, "step": 8477 }, { - "epoch": 2.55, - "grad_norm": 14.284456253051758, - "learning_rate": 3.0069159065851463e-06, - "loss": 0.986, + "epoch": 1.06, + "grad_norm": 10.35164737701416, + "learning_rate": 1.290716646446053e-05, + "loss": 0.9084, "step": 8478 }, { - "epoch": 2.55, - "grad_norm": 12.816495895385742, - "learning_rate": 3.004911295980756e-06, - "loss": 1.0418, + "epoch": 1.06, + "grad_norm": 12.399913787841797, + "learning_rate": 1.2906329749403841e-05, + "loss": 3.8119, "step": 8479 }, { - "epoch": 2.55, - "grad_norm": 67.11431884765625, - "learning_rate": 3.002906685376366e-06, - "loss": 1.5002, + "epoch": 1.06, + "grad_norm": 23.38853645324707, + "learning_rate": 1.2905493034347153e-05, + "loss": 1.6561, "step": 8480 }, { - "epoch": 2.55, - "grad_norm": 24.34355926513672, - "learning_rate": 3.0009020747719757e-06, - "loss": 1.329, + "epoch": 1.06, + "grad_norm": 8.756095886230469, + "learning_rate": 1.2904656319290467e-05, + "loss": 1.7878, "step": 8481 }, { - "epoch": 2.55, - "grad_norm": 11.59719467163086, - "learning_rate": 2.9988974641675854e-06, - "loss": 1.4363, + "epoch": 1.06, + "grad_norm": 14.100663185119629, + "learning_rate": 1.2903819604233779e-05, + "loss": 0.8435, "step": 8482 }, { - "epoch": 2.55, - "grad_norm": 32.02231216430664, - "learning_rate": 2.996892853563196e-06, - "loss": 1.2263, + "epoch": 1.06, + "grad_norm": 7.758046627044678, + "learning_rate": 1.2902982889177091e-05, + "loss": 1.9517, "step": 8483 }, { - "epoch": 2.55, - "grad_norm": 9.977734565734863, - "learning_rate": 2.9948882429588055e-06, - "loss": 0.6075, + "epoch": 1.06, + "grad_norm": 13.198431968688965, + "learning_rate": 1.2902146174120405e-05, + "loss": 2.3742, "step": 8484 }, { - "epoch": 2.55, - "grad_norm": 48.880313873291016, - "learning_rate": 2.992883632354415e-06, - "loss": 1.5793, + "epoch": 1.06, + "grad_norm": 18.462568283081055, + "learning_rate": 1.2901309459063718e-05, + "loss": 3.8024, "step": 8485 }, { - "epoch": 2.55, - "grad_norm": 7.916026592254639, - "learning_rate": 2.9908790217500257e-06, - "loss": 0.7116, + "epoch": 1.06, + "grad_norm": 11.379865646362305, + "learning_rate": 1.2900472744007028e-05, + "loss": 0.5839, "step": 8486 }, { - "epoch": 2.55, - "grad_norm": 29.04728889465332, - "learning_rate": 2.9888744111456354e-06, - "loss": 1.2325, + "epoch": 1.07, + "grad_norm": 52.05200958251953, + "learning_rate": 1.2899636028950342e-05, + "loss": 1.2563, "step": 8487 }, { - "epoch": 2.55, - "grad_norm": 13.980238914489746, - "learning_rate": 2.986869800541245e-06, - "loss": 1.496, + "epoch": 1.07, + "grad_norm": 6.643377304077148, + "learning_rate": 1.2898799313893656e-05, + "loss": 1.4281, "step": 8488 }, { - "epoch": 2.55, - "grad_norm": 15.269109725952148, - "learning_rate": 2.984865189936855e-06, - "loss": 1.4313, + "epoch": 1.07, + "grad_norm": 5.62060022354126, + "learning_rate": 1.2897962598836966e-05, + "loss": 0.6797, "step": 8489 }, { - "epoch": 2.55, - "grad_norm": 20.857440948486328, - "learning_rate": 2.9828605793324648e-06, - "loss": 1.5363, + "epoch": 1.07, + "grad_norm": 16.942380905151367, + "learning_rate": 1.289712588378028e-05, + "loss": 1.6484, "step": 8490 }, { - "epoch": 2.55, - "grad_norm": 14.627253532409668, - "learning_rate": 2.980855968728075e-06, - "loss": 1.3487, + "epoch": 1.07, + "grad_norm": 20.18743133544922, + "learning_rate": 1.2896289168723593e-05, + "loss": 2.1858, "step": 8491 }, { - "epoch": 2.55, - "grad_norm": 7.229565620422363, - "learning_rate": 2.978851358123685e-06, - "loss": 0.8008, + "epoch": 1.07, + "grad_norm": 27.09478759765625, + "learning_rate": 1.2895452453666905e-05, + "loss": 1.9353, "step": 8492 }, { - "epoch": 2.55, - "grad_norm": 29.895063400268555, - "learning_rate": 2.9768467475192946e-06, - "loss": 1.7014, + "epoch": 1.07, + "grad_norm": 10.600691795349121, + "learning_rate": 1.2894615738610217e-05, + "loss": 1.1805, "step": 8493 }, { - "epoch": 2.55, - "grad_norm": 26.894834518432617, - "learning_rate": 2.9748421369149043e-06, - "loss": 1.8372, + "epoch": 1.07, + "grad_norm": 13.021747589111328, + "learning_rate": 1.289377902355353e-05, + "loss": 1.6513, "step": 8494 }, { - "epoch": 2.55, - "grad_norm": 50.57379150390625, - "learning_rate": 2.9728375263105148e-06, - "loss": 1.3365, + "epoch": 1.07, + "grad_norm": 13.426149368286133, + "learning_rate": 1.2892942308496843e-05, + "loss": 0.9426, "step": 8495 }, { - "epoch": 2.55, - "grad_norm": 12.997958183288574, - "learning_rate": 2.9708329157061244e-06, - "loss": 1.9539, + "epoch": 1.07, + "grad_norm": 8.105659484863281, + "learning_rate": 1.2892105593440155e-05, + "loss": 0.8835, "step": 8496 }, { - "epoch": 2.55, - "grad_norm": 11.23823356628418, - "learning_rate": 2.968828305101734e-06, - "loss": 1.2944, + "epoch": 1.07, + "grad_norm": 11.75523853302002, + "learning_rate": 1.2891268878383467e-05, + "loss": 0.6368, "step": 8497 }, { - "epoch": 2.56, - "grad_norm": 21.072771072387695, - "learning_rate": 2.9668236944973438e-06, - "loss": 1.4985, + "epoch": 1.07, + "grad_norm": 15.636470794677734, + "learning_rate": 1.289043216332678e-05, + "loss": 1.5947, "step": 8498 }, { - "epoch": 2.56, - "grad_norm": 61.82595443725586, - "learning_rate": 2.9648190838929543e-06, - "loss": 2.2333, + "epoch": 1.07, + "grad_norm": 17.106548309326172, + "learning_rate": 1.2889595448270094e-05, + "loss": 1.625, "step": 8499 }, { - "epoch": 2.56, - "grad_norm": 26.898548126220703, - "learning_rate": 2.962814473288564e-06, - "loss": 1.2997, + "epoch": 1.07, + "grad_norm": 17.792394638061523, + "learning_rate": 1.2888758733213404e-05, + "loss": 2.9065, "step": 8500 }, { - "epoch": 2.56, - "grad_norm": 14.761646270751953, - "learning_rate": 2.9608098626841736e-06, - "loss": 1.4087, + "epoch": 1.07, + "grad_norm": 13.237042427062988, + "learning_rate": 1.2887922018156718e-05, + "loss": 1.746, "step": 8501 }, { - "epoch": 2.56, - "grad_norm": 26.489471435546875, - "learning_rate": 2.9588052520797837e-06, - "loss": 1.6874, + "epoch": 1.07, + "grad_norm": 72.85885620117188, + "learning_rate": 1.2887085303100032e-05, + "loss": 4.4072, "step": 8502 }, { - "epoch": 2.56, - "grad_norm": 25.354101181030273, - "learning_rate": 2.9568006414753938e-06, - "loss": 0.7809, + "epoch": 1.07, + "grad_norm": 5.319523811340332, + "learning_rate": 1.2886248588043342e-05, + "loss": 0.8691, "step": 8503 }, { - "epoch": 2.56, - "grad_norm": 8.844145774841309, - "learning_rate": 2.9547960308710034e-06, - "loss": 0.9255, + "epoch": 1.07, + "grad_norm": 4.781764984130859, + "learning_rate": 1.2885411872986656e-05, + "loss": 0.1701, "step": 8504 }, { - "epoch": 2.56, - "grad_norm": 18.32469940185547, - "learning_rate": 2.9527914202666135e-06, - "loss": 1.0907, + "epoch": 1.07, + "grad_norm": 41.50909423828125, + "learning_rate": 1.2884575157929967e-05, + "loss": 1.0517, "step": 8505 }, { - "epoch": 2.56, - "grad_norm": 14.326888084411621, - "learning_rate": 2.950786809662223e-06, - "loss": 1.1902, + "epoch": 1.07, + "grad_norm": 20.49448585510254, + "learning_rate": 1.2883738442873281e-05, + "loss": 1.4417, "step": 8506 }, { - "epoch": 2.56, - "grad_norm": 19.863540649414062, - "learning_rate": 2.948782199057833e-06, - "loss": 1.5195, + "epoch": 1.07, + "grad_norm": 16.194725036621094, + "learning_rate": 1.2882901727816593e-05, + "loss": 2.1433, "step": 8507 }, { - "epoch": 2.56, - "grad_norm": 9.05129623413086, - "learning_rate": 2.9467775884534433e-06, - "loss": 0.7649, + "epoch": 1.07, + "grad_norm": 10.775847434997559, + "learning_rate": 1.2882065012759905e-05, + "loss": 1.8694, "step": 8508 }, { - "epoch": 2.56, - "grad_norm": 11.893647193908691, - "learning_rate": 2.944772977849053e-06, - "loss": 1.4486, + "epoch": 1.07, + "grad_norm": 12.136926651000977, + "learning_rate": 1.2881228297703219e-05, + "loss": 1.9675, "step": 8509 }, { - "epoch": 2.56, - "grad_norm": 18.236352920532227, - "learning_rate": 2.9427683672446626e-06, - "loss": 1.0206, + "epoch": 1.07, + "grad_norm": 9.653098106384277, + "learning_rate": 1.2880391582646529e-05, + "loss": 1.2999, "step": 8510 }, { - "epoch": 2.56, - "grad_norm": 32.056034088134766, - "learning_rate": 2.940763756640273e-06, - "loss": 2.3092, + "epoch": 1.07, + "grad_norm": 23.440593719482422, + "learning_rate": 1.2879554867589843e-05, + "loss": 2.8655, "step": 8511 }, { - "epoch": 2.56, - "grad_norm": 72.33465576171875, - "learning_rate": 2.938759146035883e-06, - "loss": 2.1425, + "epoch": 1.07, + "grad_norm": 7.745482921600342, + "learning_rate": 1.2878718152533156e-05, + "loss": 0.3792, "step": 8512 }, { - "epoch": 2.56, - "grad_norm": 56.66347122192383, - "learning_rate": 2.9367545354314925e-06, - "loss": 1.1673, + "epoch": 1.07, + "grad_norm": 12.097042083740234, + "learning_rate": 1.287788143747647e-05, + "loss": 0.9682, "step": 8513 }, { - "epoch": 2.56, - "grad_norm": 14.660852432250977, - "learning_rate": 2.9347499248271026e-06, - "loss": 1.1077, + "epoch": 1.07, + "grad_norm": 13.924994468688965, + "learning_rate": 1.287704472241978e-05, + "loss": 1.8219, "step": 8514 }, { - "epoch": 2.56, - "grad_norm": 45.072975158691406, - "learning_rate": 2.9327453142227126e-06, - "loss": 1.7196, + "epoch": 1.07, + "grad_norm": 5.2052693367004395, + "learning_rate": 1.2876208007363094e-05, + "loss": 0.1782, "step": 8515 }, { - "epoch": 2.56, - "grad_norm": 15.052632331848145, - "learning_rate": 2.9307407036183223e-06, - "loss": 1.245, + "epoch": 1.07, + "grad_norm": 9.861640930175781, + "learning_rate": 1.2875371292306407e-05, + "loss": 1.4817, "step": 8516 }, { - "epoch": 2.56, - "grad_norm": 17.311857223510742, - "learning_rate": 2.9287360930139324e-06, - "loss": 1.241, + "epoch": 1.07, + "grad_norm": 11.262444496154785, + "learning_rate": 1.2874534577249718e-05, + "loss": 1.083, "step": 8517 }, { - "epoch": 2.56, - "grad_norm": 71.18388366699219, - "learning_rate": 2.926731482409542e-06, - "loss": 1.4936, + "epoch": 1.07, + "grad_norm": 14.59859848022461, + "learning_rate": 1.2873697862193031e-05, + "loss": 2.0179, "step": 8518 }, { - "epoch": 2.56, - "grad_norm": 14.947158813476562, - "learning_rate": 2.9247268718051517e-06, - "loss": 1.164, + "epoch": 1.07, + "grad_norm": 25.852855682373047, + "learning_rate": 1.2872861147136343e-05, + "loss": 1.4012, "step": 8519 }, { - "epoch": 2.56, - "grad_norm": 39.9619140625, - "learning_rate": 2.9227222612007622e-06, - "loss": 2.3687, - "step": 8520 - }, - { - "epoch": 2.56, - "eval_loss": 0.17769810557365417, - "eval_runtime": 44.5571, - "eval_samples_per_second": 33.193, - "eval_steps_per_second": 33.193, + "epoch": 1.07, + "grad_norm": 47.70848083496094, + "learning_rate": 1.2872024432079657e-05, + "loss": 1.2517, "step": 8520 }, { - "epoch": 2.56, - "grad_norm": 17.896486282348633, - "learning_rate": 2.920717650596372e-06, - "loss": 1.4077, + "epoch": 1.07, + "grad_norm": 12.465068817138672, + "learning_rate": 1.2871187717022969e-05, + "loss": 2.0167, "step": 8521 }, { - "epoch": 2.56, - "grad_norm": 32.224422454833984, - "learning_rate": 2.9187130399919815e-06, - "loss": 1.8564, + "epoch": 1.07, + "grad_norm": 5.884963035583496, + "learning_rate": 1.2870351001966281e-05, + "loss": 1.7394, "step": 8522 }, { - "epoch": 2.56, - "grad_norm": 20.0201358795166, - "learning_rate": 2.916708429387592e-06, - "loss": 0.6378, + "epoch": 1.07, + "grad_norm": 24.14913558959961, + "learning_rate": 1.2869514286909595e-05, + "loss": 0.7824, "step": 8523 }, { - "epoch": 2.56, - "grad_norm": 8.714916229248047, - "learning_rate": 2.9147038187832017e-06, - "loss": 0.8691, + "epoch": 1.07, + "grad_norm": 10.188133239746094, + "learning_rate": 1.2868677571852905e-05, + "loss": 1.2877, "step": 8524 }, { - "epoch": 2.56, - "grad_norm": 13.311516761779785, - "learning_rate": 2.9126992081788114e-06, - "loss": 1.0312, + "epoch": 1.07, + "grad_norm": 48.296077728271484, + "learning_rate": 1.2867840856796218e-05, + "loss": 3.0401, "step": 8525 }, { - "epoch": 2.56, - "grad_norm": 23.491960525512695, - "learning_rate": 2.9106945975744215e-06, - "loss": 1.2836, + "epoch": 1.07, + "grad_norm": 18.35318374633789, + "learning_rate": 1.2867004141739532e-05, + "loss": 2.7819, "step": 8526 }, { - "epoch": 2.56, - "grad_norm": 9.72035026550293, - "learning_rate": 2.9086899869700315e-06, - "loss": 1.1779, + "epoch": 1.07, + "grad_norm": 6.806789875030518, + "learning_rate": 1.2866167426682846e-05, + "loss": 1.1726, "step": 8527 }, { - "epoch": 2.56, - "grad_norm": 14.758877754211426, - "learning_rate": 2.906685376365641e-06, - "loss": 1.3364, + "epoch": 1.07, + "grad_norm": 2.1864826679229736, + "learning_rate": 1.2865330711626156e-05, + "loss": 0.0783, "step": 8528 }, { - "epoch": 2.56, - "grad_norm": 9.038101196289062, - "learning_rate": 2.9046807657612513e-06, - "loss": 0.7359, + "epoch": 1.07, + "grad_norm": 10.152175903320312, + "learning_rate": 1.286449399656947e-05, + "loss": 2.3536, "step": 8529 }, { - "epoch": 2.56, - "grad_norm": 9.687514305114746, - "learning_rate": 2.902676155156861e-06, - "loss": 0.6275, + "epoch": 1.07, + "grad_norm": 3.9797370433807373, + "learning_rate": 1.2863657281512783e-05, + "loss": 0.3895, "step": 8530 }, { - "epoch": 2.56, - "grad_norm": 21.076562881469727, - "learning_rate": 2.9006715445524706e-06, - "loss": 1.3599, + "epoch": 1.07, + "grad_norm": 21.074848175048828, + "learning_rate": 1.2862820566456094e-05, + "loss": 1.5691, "step": 8531 }, { - "epoch": 2.57, - "grad_norm": 8.555130004882812, - "learning_rate": 2.898666933948081e-06, - "loss": 0.9138, + "epoch": 1.07, + "grad_norm": 12.177873611450195, + "learning_rate": 1.2861983851399407e-05, + "loss": 1.2464, "step": 8532 }, { - "epoch": 2.57, - "grad_norm": 30.38118553161621, - "learning_rate": 2.8966623233436908e-06, - "loss": 1.4925, + "epoch": 1.07, + "grad_norm": 15.976359367370605, + "learning_rate": 1.2861147136342719e-05, + "loss": 1.0228, "step": 8533 }, { - "epoch": 2.57, - "grad_norm": 34.965641021728516, - "learning_rate": 2.8946577127393004e-06, - "loss": 1.3331, + "epoch": 1.07, + "grad_norm": 23.05260467529297, + "learning_rate": 1.2860310421286033e-05, + "loss": 0.9748, "step": 8534 }, { - "epoch": 2.57, - "grad_norm": 10.746747016906738, - "learning_rate": 2.892653102134911e-06, - "loss": 1.2565, + "epoch": 1.07, + "grad_norm": 18.960296630859375, + "learning_rate": 1.2859473706229345e-05, + "loss": 2.3059, "step": 8535 }, { - "epoch": 2.57, - "grad_norm": 62.186431884765625, - "learning_rate": 2.8906484915305206e-06, - "loss": 2.5688, + "epoch": 1.07, + "grad_norm": 14.666804313659668, + "learning_rate": 1.2858636991172657e-05, + "loss": 1.1556, "step": 8536 }, { - "epoch": 2.57, - "grad_norm": 9.051580429077148, - "learning_rate": 2.8886438809261303e-06, - "loss": 0.7643, + "epoch": 1.07, + "grad_norm": 39.453224182128906, + "learning_rate": 1.285780027611597e-05, + "loss": 1.7697, "step": 8537 }, { - "epoch": 2.57, - "grad_norm": 22.37142562866211, - "learning_rate": 2.88663927032174e-06, - "loss": 1.8968, + "epoch": 1.07, + "grad_norm": 5.983280181884766, + "learning_rate": 1.285696356105928e-05, + "loss": 0.5179, "step": 8538 }, { - "epoch": 2.57, - "grad_norm": 15.524712562561035, - "learning_rate": 2.8846346597173504e-06, - "loss": 0.9424, + "epoch": 1.07, + "grad_norm": 16.707027435302734, + "learning_rate": 1.2856126846002594e-05, + "loss": 2.7405, "step": 8539 }, { - "epoch": 2.57, - "grad_norm": 14.384515762329102, - "learning_rate": 2.88263004911296e-06, - "loss": 1.0429, + "epoch": 1.07, + "grad_norm": 9.118749618530273, + "learning_rate": 1.2855290130945908e-05, + "loss": 0.7396, "step": 8540 }, { - "epoch": 2.57, - "grad_norm": 55.4445686340332, - "learning_rate": 2.8806254385085698e-06, - "loss": 2.7088, + "epoch": 1.07, + "grad_norm": 12.518699645996094, + "learning_rate": 1.2854453415889222e-05, + "loss": 0.8971, "step": 8541 }, { - "epoch": 2.57, - "grad_norm": 11.720520973205566, - "learning_rate": 2.87862082790418e-06, - "loss": 0.7067, + "epoch": 1.07, + "grad_norm": 5.475255489349365, + "learning_rate": 1.2853616700832532e-05, + "loss": 0.307, "step": 8542 }, { - "epoch": 2.57, - "grad_norm": 18.14213752746582, - "learning_rate": 2.8766162172997895e-06, - "loss": 1.0006, + "epoch": 1.07, + "grad_norm": 5.851897716522217, + "learning_rate": 1.2852779985775845e-05, + "loss": 1.6283, "step": 8543 }, { - "epoch": 2.57, - "grad_norm": 9.360933303833008, - "learning_rate": 2.8746116066953996e-06, - "loss": 1.1814, + "epoch": 1.07, + "grad_norm": 25.698322296142578, + "learning_rate": 1.2851943270719157e-05, + "loss": 2.1417, "step": 8544 }, { - "epoch": 2.57, - "grad_norm": 54.820091247558594, - "learning_rate": 2.8726069960910097e-06, - "loss": 1.0313, + "epoch": 1.07, + "grad_norm": 13.419102668762207, + "learning_rate": 1.285110655566247e-05, + "loss": 2.0774, "step": 8545 }, { - "epoch": 2.57, - "grad_norm": 17.7171688079834, - "learning_rate": 2.8706023854866193e-06, - "loss": 0.5955, + "epoch": 1.07, + "grad_norm": 7.896689414978027, + "learning_rate": 1.2850269840605783e-05, + "loss": 1.5321, "step": 8546 }, { - "epoch": 2.57, - "grad_norm": 26.265403747558594, - "learning_rate": 2.868597774882229e-06, - "loss": 1.2873, + "epoch": 1.07, + "grad_norm": 6.227831840515137, + "learning_rate": 1.2849433125549095e-05, + "loss": 1.4229, "step": 8547 }, { - "epoch": 2.57, - "grad_norm": 51.058250427246094, - "learning_rate": 2.8665931642778395e-06, - "loss": 1.3523, + "epoch": 1.07, + "grad_norm": 12.581622123718262, + "learning_rate": 1.2848596410492409e-05, + "loss": 0.6904, "step": 8548 }, { - "epoch": 2.57, - "grad_norm": 14.056093215942383, - "learning_rate": 2.864588553673449e-06, - "loss": 0.9431, + "epoch": 1.07, + "grad_norm": 13.476349830627441, + "learning_rate": 1.284775969543572e-05, + "loss": 1.8013, "step": 8549 }, { - "epoch": 2.57, - "grad_norm": 55.49951171875, - "learning_rate": 2.862583943069059e-06, - "loss": 1.6338, + "epoch": 1.07, + "grad_norm": 11.182458877563477, + "learning_rate": 1.2846922980379033e-05, + "loss": 2.6402, "step": 8550 }, { - "epoch": 2.57, - "grad_norm": 12.471390724182129, - "learning_rate": 2.8605793324646693e-06, - "loss": 1.1413, + "epoch": 1.07, + "grad_norm": 12.608802795410156, + "learning_rate": 1.2846086265322346e-05, + "loss": 1.9193, "step": 8551 }, { - "epoch": 2.57, - "grad_norm": 19.35071563720703, - "learning_rate": 2.858574721860279e-06, - "loss": 0.9613, + "epoch": 1.07, + "grad_norm": 14.025070190429688, + "learning_rate": 1.2845249550265656e-05, + "loss": 2.8147, "step": 8552 }, { - "epoch": 2.57, - "grad_norm": 16.208166122436523, - "learning_rate": 2.8565701112558886e-06, - "loss": 1.5786, + "epoch": 1.07, + "grad_norm": 2.8733303546905518, + "learning_rate": 1.284441283520897e-05, + "loss": 0.0776, "step": 8553 }, { - "epoch": 2.57, - "grad_norm": 24.832500457763672, - "learning_rate": 2.8545655006514987e-06, - "loss": 1.6098, + "epoch": 1.07, + "grad_norm": 7.129153251647949, + "learning_rate": 1.2843576120152284e-05, + "loss": 3.1068, "step": 8554 }, { - "epoch": 2.57, - "grad_norm": 51.225799560546875, - "learning_rate": 2.8525608900471084e-06, - "loss": 1.4513, + "epoch": 1.07, + "grad_norm": 6.150088787078857, + "learning_rate": 1.2842739405095597e-05, + "loss": 1.1727, "step": 8555 }, { - "epoch": 2.57, - "grad_norm": 13.274884223937988, - "learning_rate": 2.8505562794427185e-06, - "loss": 0.8535, + "epoch": 1.07, + "grad_norm": 37.756011962890625, + "learning_rate": 1.2841902690038908e-05, + "loss": 1.289, "step": 8556 }, { - "epoch": 2.57, - "grad_norm": 31.125516891479492, - "learning_rate": 2.8485516688383286e-06, - "loss": 1.388, + "epoch": 1.07, + "grad_norm": 19.50652313232422, + "learning_rate": 1.2841065974982221e-05, + "loss": 1.7728, "step": 8557 }, { - "epoch": 2.57, - "grad_norm": 17.432456970214844, - "learning_rate": 2.8465470582339382e-06, - "loss": 1.5913, + "epoch": 1.07, + "grad_norm": 8.960627555847168, + "learning_rate": 1.2840229259925533e-05, + "loss": 0.5113, "step": 8558 }, { - "epoch": 2.57, - "grad_norm": 24.500856399536133, - "learning_rate": 2.844542447629548e-06, - "loss": 2.2924, + "epoch": 1.07, + "grad_norm": 11.609451293945312, + "learning_rate": 1.2839392544868845e-05, + "loss": 1.1409, "step": 8559 }, { - "epoch": 2.57, - "grad_norm": 14.529585838317871, - "learning_rate": 2.8425378370251584e-06, - "loss": 0.5999, + "epoch": 1.07, + "grad_norm": 83.95161437988281, + "learning_rate": 1.2838555829812159e-05, + "loss": 2.2761, "step": 8560 }, { - "epoch": 2.57, - "grad_norm": 12.131428718566895, - "learning_rate": 2.840533226420768e-06, - "loss": 1.439, + "epoch": 1.07, + "grad_norm": 12.734169006347656, + "learning_rate": 1.283771911475547e-05, + "loss": 0.7802, "step": 8561 }, { - "epoch": 2.57, - "grad_norm": 8.231968879699707, - "learning_rate": 2.8385286158163777e-06, - "loss": 0.9683, + "epoch": 1.07, + "grad_norm": 13.65036392211914, + "learning_rate": 1.2836882399698784e-05, + "loss": 1.8548, "step": 8562 }, { - "epoch": 2.57, - "grad_norm": 7.67575740814209, - "learning_rate": 2.836524005211988e-06, - "loss": 0.6245, + "epoch": 1.07, + "grad_norm": 43.75178146362305, + "learning_rate": 1.2836045684642095e-05, + "loss": 1.3983, "step": 8563 }, { - "epoch": 2.57, - "grad_norm": 12.068075180053711, - "learning_rate": 2.834519394607598e-06, - "loss": 0.7527, + "epoch": 1.07, + "grad_norm": 7.9612274169921875, + "learning_rate": 1.2835208969585408e-05, + "loss": 0.3605, "step": 8564 }, { - "epoch": 2.58, - "grad_norm": 14.110102653503418, - "learning_rate": 2.8325147840032075e-06, - "loss": 1.7017, + "epoch": 1.07, + "grad_norm": 12.827942848205566, + "learning_rate": 1.2834372254528722e-05, + "loss": 2.2101, "step": 8565 }, { - "epoch": 2.58, - "grad_norm": 38.348731994628906, - "learning_rate": 2.8305101733988176e-06, - "loss": 2.0444, + "epoch": 1.08, + "grad_norm": 41.54198455810547, + "learning_rate": 1.2833535539472032e-05, + "loss": 1.6254, "step": 8566 }, { - "epoch": 2.58, - "grad_norm": 9.310153007507324, - "learning_rate": 2.8285055627944273e-06, - "loss": 1.0862, + "epoch": 1.08, + "grad_norm": 16.435697555541992, + "learning_rate": 1.2832698824415346e-05, + "loss": 2.8402, "step": 8567 }, { - "epoch": 2.58, - "grad_norm": 18.77411460876465, - "learning_rate": 2.8265009521900374e-06, - "loss": 1.0017, + "epoch": 1.08, + "grad_norm": 10.593535423278809, + "learning_rate": 1.283186210935866e-05, + "loss": 2.2697, "step": 8568 }, { - "epoch": 2.58, - "grad_norm": 9.130143165588379, - "learning_rate": 2.8244963415856475e-06, - "loss": 1.1588, + "epoch": 1.08, + "grad_norm": 38.62551498413086, + "learning_rate": 1.2831025394301973e-05, + "loss": 2.4518, "step": 8569 }, { - "epoch": 2.58, - "grad_norm": 37.241661071777344, - "learning_rate": 2.822491730981257e-06, - "loss": 1.1469, + "epoch": 1.08, + "grad_norm": 7.9425201416015625, + "learning_rate": 1.2830188679245283e-05, + "loss": 0.6873, "step": 8570 }, { - "epoch": 2.58, - "grad_norm": 19.131694793701172, - "learning_rate": 2.8204871203768668e-06, - "loss": 1.8664, + "epoch": 1.08, + "grad_norm": 24.836193084716797, + "learning_rate": 1.2829351964188597e-05, + "loss": 2.4507, "step": 8571 }, { - "epoch": 2.58, - "grad_norm": 16.91837501525879, - "learning_rate": 2.8184825097724773e-06, - "loss": 1.2044, + "epoch": 1.08, + "grad_norm": 19.25704002380371, + "learning_rate": 1.2828515249131909e-05, + "loss": 1.3152, "step": 8572 }, { - "epoch": 2.58, - "grad_norm": 12.339491844177246, - "learning_rate": 2.816477899168087e-06, - "loss": 0.8794, + "epoch": 1.08, + "grad_norm": 38.70884323120117, + "learning_rate": 1.2827678534075221e-05, + "loss": 3.0298, "step": 8573 }, { - "epoch": 2.58, - "grad_norm": 41.453243255615234, - "learning_rate": 2.8144732885636966e-06, - "loss": 1.4408, + "epoch": 1.08, + "grad_norm": 6.464087009429932, + "learning_rate": 1.2826841819018535e-05, + "loss": 1.7264, "step": 8574 }, { - "epoch": 2.58, - "grad_norm": 36.32587814331055, - "learning_rate": 2.8124686779593067e-06, - "loss": 0.8383, + "epoch": 1.08, + "grad_norm": 14.138628959655762, + "learning_rate": 1.2826005103961847e-05, + "loss": 1.5077, "step": 8575 }, { - "epoch": 2.58, - "grad_norm": 19.15566635131836, - "learning_rate": 2.8104640673549168e-06, - "loss": 1.0883, + "epoch": 1.08, + "grad_norm": 30.603025436401367, + "learning_rate": 1.282516838890516e-05, + "loss": 2.0811, "step": 8576 }, { - "epoch": 2.58, - "grad_norm": 18.432010650634766, - "learning_rate": 2.8084594567505264e-06, - "loss": 1.314, + "epoch": 1.08, + "grad_norm": 8.994444847106934, + "learning_rate": 1.282433167384847e-05, + "loss": 0.3496, "step": 8577 }, { - "epoch": 2.58, - "grad_norm": 51.9474983215332, - "learning_rate": 2.8064548461461365e-06, - "loss": 1.7958, + "epoch": 1.08, + "grad_norm": 7.73020601272583, + "learning_rate": 1.2823494958791784e-05, + "loss": 0.8861, "step": 8578 }, { - "epoch": 2.58, - "grad_norm": 21.26700210571289, - "learning_rate": 2.804450235541746e-06, - "loss": 1.878, + "epoch": 1.08, + "grad_norm": 11.562694549560547, + "learning_rate": 1.2822658243735098e-05, + "loss": 0.9408, "step": 8579 }, { - "epoch": 2.58, - "grad_norm": 23.973962783813477, - "learning_rate": 2.8024456249373563e-06, - "loss": 1.2876, + "epoch": 1.08, + "grad_norm": 19.258441925048828, + "learning_rate": 1.2821821528678408e-05, + "loss": 1.2217, "step": 8580 }, { - "epoch": 2.58, - "grad_norm": 11.906970024108887, - "learning_rate": 2.800441014332966e-06, - "loss": 0.5872, + "epoch": 1.08, + "grad_norm": 12.606703758239746, + "learning_rate": 1.2820984813621722e-05, + "loss": 1.2354, "step": 8581 }, { - "epoch": 2.58, - "grad_norm": 14.538961410522461, - "learning_rate": 2.798436403728576e-06, - "loss": 0.9155, + "epoch": 1.08, + "grad_norm": 8.044978141784668, + "learning_rate": 1.2820148098565035e-05, + "loss": 0.4183, "step": 8582 }, { - "epoch": 2.58, - "grad_norm": 19.3804988861084, - "learning_rate": 2.7964317931241857e-06, - "loss": 1.2437, + "epoch": 1.08, + "grad_norm": 33.10894012451172, + "learning_rate": 1.2819311383508349e-05, + "loss": 1.6051, "step": 8583 }, { - "epoch": 2.58, - "grad_norm": 21.384546279907227, - "learning_rate": 2.7944271825197953e-06, - "loss": 0.8738, + "epoch": 1.08, + "grad_norm": 10.770204544067383, + "learning_rate": 1.281847466845166e-05, + "loss": 0.8247, "step": 8584 }, { - "epoch": 2.58, - "grad_norm": 5.975277423858643, - "learning_rate": 2.792422571915406e-06, - "loss": 0.4514, + "epoch": 1.08, + "grad_norm": 12.140080451965332, + "learning_rate": 1.2817637953394973e-05, + "loss": 0.8705, "step": 8585 }, { - "epoch": 2.58, - "grad_norm": 13.506431579589844, - "learning_rate": 2.7904179613110155e-06, - "loss": 1.3215, + "epoch": 1.08, + "grad_norm": 12.246003150939941, + "learning_rate": 1.2816801238338285e-05, + "loss": 2.2221, "step": 8586 }, { - "epoch": 2.58, - "grad_norm": 16.181753158569336, - "learning_rate": 2.788413350706625e-06, - "loss": 1.9854, + "epoch": 1.08, + "grad_norm": 16.525575637817383, + "learning_rate": 1.2815964523281597e-05, + "loss": 1.4073, "step": 8587 }, { - "epoch": 2.58, - "grad_norm": 23.374975204467773, - "learning_rate": 2.7864087401022357e-06, - "loss": 1.9213, + "epoch": 1.08, + "grad_norm": 10.41401481628418, + "learning_rate": 1.281512780822491e-05, + "loss": 1.7817, "step": 8588 }, { - "epoch": 2.58, - "grad_norm": 29.04400634765625, - "learning_rate": 2.7844041294978453e-06, - "loss": 1.8084, + "epoch": 1.08, + "grad_norm": 74.99308776855469, + "learning_rate": 1.2814291093168222e-05, + "loss": 2.2155, "step": 8589 }, { - "epoch": 2.58, - "grad_norm": 9.371678352355957, - "learning_rate": 2.782399518893455e-06, - "loss": 1.0408, + "epoch": 1.08, + "grad_norm": 6.079598903656006, + "learning_rate": 1.2813454378111536e-05, + "loss": 0.2717, "step": 8590 }, { - "epoch": 2.58, - "grad_norm": 11.659550666809082, - "learning_rate": 2.780394908289065e-06, - "loss": 1.3009, + "epoch": 1.08, + "grad_norm": 76.10659790039062, + "learning_rate": 1.2812617663054846e-05, + "loss": 1.7313, "step": 8591 }, { - "epoch": 2.58, - "grad_norm": 27.285043716430664, - "learning_rate": 2.7783902976846747e-06, - "loss": 1.9198, + "epoch": 1.08, + "grad_norm": 31.880477905273438, + "learning_rate": 1.281178094799816e-05, + "loss": 0.3522, "step": 8592 }, { - "epoch": 2.58, - "grad_norm": 30.421232223510742, - "learning_rate": 2.776385687080285e-06, - "loss": 1.3893, + "epoch": 1.08, + "grad_norm": 13.708595275878906, + "learning_rate": 1.2810944232941474e-05, + "loss": 2.8344, "step": 8593 }, { - "epoch": 2.58, - "grad_norm": 14.901866912841797, - "learning_rate": 2.774381076475895e-06, - "loss": 1.0787, + "epoch": 1.08, + "grad_norm": 47.557804107666016, + "learning_rate": 1.2810107517884784e-05, + "loss": 2.2919, "step": 8594 }, { - "epoch": 2.58, - "grad_norm": 10.58189868927002, - "learning_rate": 2.7723764658715046e-06, - "loss": 1.3617, + "epoch": 1.08, + "grad_norm": 6.2081685066223145, + "learning_rate": 1.2809270802828098e-05, + "loss": 0.2591, "step": 8595 }, { - "epoch": 2.58, - "grad_norm": 83.2371826171875, - "learning_rate": 2.7703718552671142e-06, - "loss": 1.7557, + "epoch": 1.08, + "grad_norm": 17.9738826751709, + "learning_rate": 1.2808434087771411e-05, + "loss": 1.872, "step": 8596 }, { - "epoch": 2.58, - "grad_norm": 23.80765724182129, - "learning_rate": 2.7683672446627247e-06, - "loss": 2.3199, + "epoch": 1.08, + "grad_norm": 23.148326873779297, + "learning_rate": 1.2807597372714723e-05, + "loss": 1.6406, "step": 8597 }, { - "epoch": 2.59, - "grad_norm": 15.422316551208496, - "learning_rate": 2.7663626340583344e-06, - "loss": 1.1275, + "epoch": 1.08, + "grad_norm": 26.999752044677734, + "learning_rate": 1.2806760657658035e-05, + "loss": 2.2209, "step": 8598 }, { - "epoch": 2.59, - "grad_norm": 20.54278564453125, - "learning_rate": 2.764358023453944e-06, - "loss": 1.7601, + "epoch": 1.08, + "grad_norm": 8.44945240020752, + "learning_rate": 1.2805923942601349e-05, + "loss": 0.8549, "step": 8599 }, { - "epoch": 2.59, - "grad_norm": 14.661285400390625, - "learning_rate": 2.7623534128495546e-06, - "loss": 1.0741, + "epoch": 1.08, + "grad_norm": 8.50709056854248, + "learning_rate": 1.280508722754466e-05, + "loss": 0.533, "step": 8600 }, { - "epoch": 2.59, - "grad_norm": 33.70438766479492, - "learning_rate": 2.7603488022451642e-06, - "loss": 1.0387, + "epoch": 1.08, + "grad_norm": 11.294102668762207, + "learning_rate": 1.2804250512487973e-05, + "loss": 0.4275, "step": 8601 }, { - "epoch": 2.59, - "grad_norm": 15.016576766967773, - "learning_rate": 2.758344191640774e-06, - "loss": 1.7023, + "epoch": 1.08, + "grad_norm": 4.108835697174072, + "learning_rate": 1.2803413797431286e-05, + "loss": 0.0644, "step": 8602 }, { - "epoch": 2.59, - "grad_norm": 7.491636753082275, - "learning_rate": 2.756339581036384e-06, - "loss": 1.374, + "epoch": 1.08, + "grad_norm": 70.74079132080078, + "learning_rate": 1.2802577082374598e-05, + "loss": 1.4626, "step": 8603 }, { - "epoch": 2.59, - "grad_norm": 13.837325096130371, - "learning_rate": 2.7543349704319936e-06, - "loss": 0.8613, + "epoch": 1.08, + "grad_norm": 21.69393539428711, + "learning_rate": 1.2801740367317912e-05, + "loss": 1.4618, "step": 8604 }, { - "epoch": 2.59, - "grad_norm": 17.60556983947754, - "learning_rate": 2.7523303598276037e-06, - "loss": 1.1599, + "epoch": 1.08, + "grad_norm": 10.441219329833984, + "learning_rate": 1.2800903652261222e-05, + "loss": 1.2134, "step": 8605 }, { - "epoch": 2.59, - "grad_norm": 13.817652702331543, - "learning_rate": 2.750325749223214e-06, - "loss": 1.1847, + "epoch": 1.08, + "grad_norm": 8.55947208404541, + "learning_rate": 1.2800066937204536e-05, + "loss": 1.9702, "step": 8606 }, { - "epoch": 2.59, - "grad_norm": 39.440921783447266, - "learning_rate": 2.7483211386188235e-06, - "loss": 1.5714, + "epoch": 1.08, + "grad_norm": 18.24686050415039, + "learning_rate": 1.279923022214785e-05, + "loss": 1.4235, "step": 8607 }, { - "epoch": 2.59, - "grad_norm": 13.094484329223633, - "learning_rate": 2.746316528014433e-06, - "loss": 1.4254, + "epoch": 1.08, + "grad_norm": 5.580203533172607, + "learning_rate": 1.279839350709116e-05, + "loss": 0.1612, "step": 8608 }, { - "epoch": 2.59, - "grad_norm": 12.363558769226074, - "learning_rate": 2.7443119174100436e-06, - "loss": 1.4503, + "epoch": 1.08, + "grad_norm": 6.0034894943237305, + "learning_rate": 1.2797556792034473e-05, + "loss": 1.0321, "step": 8609 }, { - "epoch": 2.59, - "grad_norm": 81.3821792602539, - "learning_rate": 2.7423073068056533e-06, - "loss": 1.6868, + "epoch": 1.08, + "grad_norm": 14.018782615661621, + "learning_rate": 1.2796720076977787e-05, + "loss": 1.6453, "step": 8610 }, { - "epoch": 2.59, - "grad_norm": 28.759248733520508, - "learning_rate": 2.740302696201263e-06, - "loss": 1.2274, + "epoch": 1.08, + "grad_norm": 18.138858795166016, + "learning_rate": 1.2795883361921099e-05, + "loss": 1.3789, "step": 8611 }, { - "epoch": 2.59, - "grad_norm": 22.243206024169922, - "learning_rate": 2.7382980855968735e-06, - "loss": 1.5054, + "epoch": 1.08, + "grad_norm": 9.55051326751709, + "learning_rate": 1.2795046646864411e-05, + "loss": 1.4569, "step": 8612 }, { - "epoch": 2.59, - "grad_norm": 16.99969482421875, - "learning_rate": 2.736293474992483e-06, - "loss": 1.3755, + "epoch": 1.08, + "grad_norm": 8.357460975646973, + "learning_rate": 1.2794209931807725e-05, + "loss": 0.7194, "step": 8613 }, { - "epoch": 2.59, - "grad_norm": 8.332833290100098, - "learning_rate": 2.7342888643880928e-06, - "loss": 1.0886, + "epoch": 1.08, + "grad_norm": 6.152368545532227, + "learning_rate": 1.2793373216751037e-05, + "loss": 0.9195, "step": 8614 }, { - "epoch": 2.59, - "grad_norm": 17.875459671020508, - "learning_rate": 2.732284253783703e-06, - "loss": 1.2963, + "epoch": 1.08, + "grad_norm": 7.088659286499023, + "learning_rate": 1.2792536501694349e-05, + "loss": 2.3761, "step": 8615 }, { - "epoch": 2.59, - "grad_norm": 17.195663452148438, - "learning_rate": 2.7302796431793125e-06, - "loss": 0.9124, + "epoch": 1.08, + "grad_norm": 16.12812614440918, + "learning_rate": 1.279169978663766e-05, + "loss": 1.8399, "step": 8616 }, { - "epoch": 2.59, - "grad_norm": 45.631954193115234, - "learning_rate": 2.7282750325749226e-06, - "loss": 2.5502, + "epoch": 1.08, + "grad_norm": 12.287689208984375, + "learning_rate": 1.2790863071580974e-05, + "loss": 0.7975, "step": 8617 }, { - "epoch": 2.59, - "grad_norm": 72.2505874633789, - "learning_rate": 2.7262704219705327e-06, - "loss": 1.9153, + "epoch": 1.08, + "grad_norm": 18.367534637451172, + "learning_rate": 1.2790026356524288e-05, + "loss": 0.6008, "step": 8618 }, { - "epoch": 2.59, - "grad_norm": 19.540000915527344, - "learning_rate": 2.7242658113661424e-06, - "loss": 1.3127, + "epoch": 1.08, + "grad_norm": 9.941510200500488, + "learning_rate": 1.2789189641467598e-05, + "loss": 0.8487, "step": 8619 }, { - "epoch": 2.59, - "grad_norm": 16.321380615234375, - "learning_rate": 2.722261200761752e-06, - "loss": 1.1336, + "epoch": 1.08, + "grad_norm": 11.001337051391602, + "learning_rate": 1.2788352926410912e-05, + "loss": 1.2659, "step": 8620 }, { - "epoch": 2.59, - "grad_norm": 35.40462875366211, - "learning_rate": 2.7202565901573625e-06, - "loss": 1.4144, + "epoch": 1.08, + "grad_norm": 18.091001510620117, + "learning_rate": 1.2787516211354225e-05, + "loss": 1.5459, "step": 8621 }, { - "epoch": 2.59, - "grad_norm": 18.681198120117188, - "learning_rate": 2.718251979552972e-06, - "loss": 0.8571, + "epoch": 1.08, + "grad_norm": 39.56760787963867, + "learning_rate": 1.2786679496297536e-05, + "loss": 2.3626, "step": 8622 }, { - "epoch": 2.59, - "grad_norm": 37.2798957824707, - "learning_rate": 2.716247368948582e-06, - "loss": 1.1746, + "epoch": 1.08, + "grad_norm": 12.28984260559082, + "learning_rate": 1.278584278124085e-05, + "loss": 1.4275, "step": 8623 }, { - "epoch": 2.59, - "grad_norm": 17.78194808959961, - "learning_rate": 2.7142427583441915e-06, - "loss": 1.0923, + "epoch": 1.08, + "grad_norm": 6.757332801818848, + "learning_rate": 1.2785006066184163e-05, + "loss": 0.2179, "step": 8624 }, { - "epoch": 2.59, - "grad_norm": 18.684160232543945, - "learning_rate": 2.712238147739802e-06, - "loss": 1.3139, + "epoch": 1.08, + "grad_norm": 8.783282279968262, + "learning_rate": 1.2784169351127475e-05, + "loss": 0.4344, "step": 8625 }, { - "epoch": 2.59, - "grad_norm": 25.179645538330078, - "learning_rate": 2.7102335371354117e-06, - "loss": 1.5291, + "epoch": 1.08, + "grad_norm": 19.863969802856445, + "learning_rate": 1.2783332636070787e-05, + "loss": 1.7297, "step": 8626 }, { - "epoch": 2.59, - "grad_norm": 9.155866622924805, - "learning_rate": 2.7082289265310213e-06, - "loss": 0.6078, + "epoch": 1.08, + "grad_norm": 13.56358814239502, + "learning_rate": 1.27824959210141e-05, + "loss": 0.3086, "step": 8627 }, { - "epoch": 2.59, - "grad_norm": 25.500167846679688, - "learning_rate": 2.7062243159266314e-06, - "loss": 1.8845, + "epoch": 1.08, + "grad_norm": 11.027216911315918, + "learning_rate": 1.2781659205957412e-05, + "loss": 0.847, "step": 8628 }, { - "epoch": 2.59, - "grad_norm": 14.735610008239746, - "learning_rate": 2.7042197053222415e-06, - "loss": 1.3812, + "epoch": 1.08, + "grad_norm": 15.298280715942383, + "learning_rate": 1.2780822490900724e-05, + "loss": 1.68, "step": 8629 }, { - "epoch": 2.59, - "grad_norm": 13.087080001831055, - "learning_rate": 2.702215094717851e-06, - "loss": 2.3241, + "epoch": 1.08, + "grad_norm": 31.234249114990234, + "learning_rate": 1.2779985775844036e-05, + "loss": 2.733, "step": 8630 }, { - "epoch": 2.6, - "grad_norm": 15.85035228729248, - "learning_rate": 2.7002104841134612e-06, - "loss": 2.1271, + "epoch": 1.08, + "grad_norm": 16.561674118041992, + "learning_rate": 1.277914906078735e-05, + "loss": 0.555, "step": 8631 }, { - "epoch": 2.6, - "grad_norm": 16.836761474609375, - "learning_rate": 2.698205873509071e-06, - "loss": 0.9277, + "epoch": 1.08, + "grad_norm": 6.423410415649414, + "learning_rate": 1.2778312345730664e-05, + "loss": 1.22, "step": 8632 }, { - "epoch": 2.6, - "grad_norm": 17.871294021606445, - "learning_rate": 2.6962012629046806e-06, - "loss": 1.3209, + "epoch": 1.08, + "grad_norm": 19.52211570739746, + "learning_rate": 1.2777475630673974e-05, + "loss": 2.2232, "step": 8633 }, { - "epoch": 2.6, - "grad_norm": 10.736555099487305, - "learning_rate": 2.694196652300291e-06, - "loss": 0.9247, + "epoch": 1.08, + "grad_norm": 17.728485107421875, + "learning_rate": 1.2776638915617288e-05, + "loss": 2.1009, "step": 8634 }, { - "epoch": 2.6, - "grad_norm": 4.654946804046631, - "learning_rate": 2.6921920416959007e-06, - "loss": 0.6192, + "epoch": 1.08, + "grad_norm": 20.689403533935547, + "learning_rate": 1.2775802200560601e-05, + "loss": 1.6795, "step": 8635 }, { - "epoch": 2.6, - "grad_norm": 62.396331787109375, - "learning_rate": 2.6901874310915104e-06, - "loss": 2.6783, + "epoch": 1.08, + "grad_norm": 5.071125030517578, + "learning_rate": 1.2774965485503911e-05, + "loss": 0.1614, "step": 8636 }, { - "epoch": 2.6, - "grad_norm": 37.43707275390625, - "learning_rate": 2.688182820487121e-06, - "loss": 0.6253, + "epoch": 1.08, + "grad_norm": 9.400809288024902, + "learning_rate": 1.2774128770447225e-05, + "loss": 1.924, "step": 8637 }, { - "epoch": 2.6, - "grad_norm": 10.426443099975586, - "learning_rate": 2.6861782098827306e-06, - "loss": 1.1717, + "epoch": 1.08, + "grad_norm": 23.80042266845703, + "learning_rate": 1.2773292055390539e-05, + "loss": 2.4768, "step": 8638 }, { - "epoch": 2.6, - "grad_norm": 77.32369995117188, - "learning_rate": 2.6841735992783402e-06, - "loss": 2.797, + "epoch": 1.08, + "grad_norm": 25.71880531311035, + "learning_rate": 1.277245534033385e-05, + "loss": 1.2631, "step": 8639 }, { - "epoch": 2.6, - "grad_norm": 18.485462188720703, - "learning_rate": 2.6821689886739503e-06, - "loss": 0.9191, - "step": 8640 - }, - { - "epoch": 2.6, - "eval_loss": 0.17308995127677917, - "eval_runtime": 43.9129, - "eval_samples_per_second": 33.68, - "eval_steps_per_second": 33.68, + "epoch": 1.08, + "grad_norm": 9.267672538757324, + "learning_rate": 1.2771618625277163e-05, + "loss": 0.2955, "step": 8640 }, { - "epoch": 2.6, - "grad_norm": 10.55259895324707, - "learning_rate": 2.6801643780695604e-06, - "loss": 1.4176, + "epoch": 1.08, + "grad_norm": 4.219285488128662, + "learning_rate": 1.2770781910220476e-05, + "loss": 0.3018, "step": 8641 }, { - "epoch": 2.6, - "grad_norm": 14.04641056060791, - "learning_rate": 2.67815976746517e-06, - "loss": 1.3492, + "epoch": 1.08, + "grad_norm": 5.892913818359375, + "learning_rate": 1.2769945195163788e-05, + "loss": 0.7357, "step": 8642 }, { - "epoch": 2.6, - "grad_norm": 11.673070907592773, - "learning_rate": 2.67615515686078e-06, - "loss": 1.3232, + "epoch": 1.08, + "grad_norm": 19.84624481201172, + "learning_rate": 1.27691084801071e-05, + "loss": 2.7648, "step": 8643 }, { - "epoch": 2.6, - "grad_norm": 22.69013023376465, - "learning_rate": 2.67415054625639e-06, - "loss": 1.535, + "epoch": 1.08, + "grad_norm": 9.951528549194336, + "learning_rate": 1.2768271765050412e-05, + "loss": 1.8721, "step": 8644 }, { - "epoch": 2.6, - "grad_norm": 31.60176658630371, - "learning_rate": 2.6721459356519995e-06, - "loss": 2.397, + "epoch": 1.08, + "grad_norm": 27.758649826049805, + "learning_rate": 1.2767435049993726e-05, + "loss": 1.1707, "step": 8645 }, { - "epoch": 2.6, - "grad_norm": 72.22872161865234, - "learning_rate": 2.67014132504761e-06, - "loss": 3.1373, + "epoch": 1.09, + "grad_norm": 6.61989688873291, + "learning_rate": 1.276659833493704e-05, + "loss": 1.2057, "step": 8646 }, { - "epoch": 2.6, - "grad_norm": 12.671222686767578, - "learning_rate": 2.6681367144432196e-06, - "loss": 0.9173, + "epoch": 1.09, + "grad_norm": 6.966522693634033, + "learning_rate": 1.276576161988035e-05, + "loss": 2.5643, "step": 8647 }, { - "epoch": 2.6, - "grad_norm": 24.99388885498047, - "learning_rate": 2.6661321038388293e-06, - "loss": 1.3955, + "epoch": 1.09, + "grad_norm": 10.06793212890625, + "learning_rate": 1.2764924904823663e-05, + "loss": 0.8765, "step": 8648 }, { - "epoch": 2.6, - "grad_norm": 28.980268478393555, - "learning_rate": 2.66412749323444e-06, - "loss": 1.3139, + "epoch": 1.09, + "grad_norm": 8.949012756347656, + "learning_rate": 1.2764088189766977e-05, + "loss": 2.0474, "step": 8649 }, { - "epoch": 2.6, - "grad_norm": 14.720972061157227, - "learning_rate": 2.6621228826300495e-06, - "loss": 0.8373, + "epoch": 1.09, + "grad_norm": 17.571147918701172, + "learning_rate": 1.2763251474710287e-05, + "loss": 0.9656, "step": 8650 }, { - "epoch": 2.6, - "grad_norm": 63.49948501586914, - "learning_rate": 2.660118272025659e-06, - "loss": 1.9721, + "epoch": 1.09, + "grad_norm": 12.04281234741211, + "learning_rate": 1.2762414759653601e-05, + "loss": 0.9934, "step": 8651 }, { - "epoch": 2.6, - "grad_norm": 11.087202072143555, - "learning_rate": 2.658113661421269e-06, - "loss": 0.9022, + "epoch": 1.09, + "grad_norm": 13.158784866333008, + "learning_rate": 1.2761578044596915e-05, + "loss": 1.818, "step": 8652 }, { - "epoch": 2.6, - "grad_norm": 21.9641170501709, - "learning_rate": 2.6561090508168793e-06, - "loss": 1.9667, + "epoch": 1.09, + "grad_norm": 21.233274459838867, + "learning_rate": 1.2760741329540227e-05, + "loss": 1.3968, "step": 8653 }, { - "epoch": 2.6, - "grad_norm": 16.303483963012695, - "learning_rate": 2.654104440212489e-06, - "loss": 1.1034, + "epoch": 1.09, + "grad_norm": 10.787312507629395, + "learning_rate": 1.2759904614483539e-05, + "loss": 0.5281, "step": 8654 }, { - "epoch": 2.6, - "grad_norm": 15.40654468536377, - "learning_rate": 2.652099829608099e-06, - "loss": 0.9308, + "epoch": 1.09, + "grad_norm": 19.136770248413086, + "learning_rate": 1.275906789942685e-05, + "loss": 0.8433, "step": 8655 }, { - "epoch": 2.6, - "grad_norm": 37.456077575683594, - "learning_rate": 2.6500952190037087e-06, - "loss": 1.5839, + "epoch": 1.09, + "grad_norm": 12.546215057373047, + "learning_rate": 1.2758231184370164e-05, + "loss": 0.9112, "step": 8656 }, { - "epoch": 2.6, - "grad_norm": 189.6568145751953, - "learning_rate": 2.6480906083993184e-06, - "loss": 1.1821, + "epoch": 1.09, + "grad_norm": 1347.7606201171875, + "learning_rate": 1.2757394469313476e-05, + "loss": 1.3567, "step": 8657 }, { - "epoch": 2.6, - "grad_norm": 14.535365104675293, - "learning_rate": 2.646085997794929e-06, - "loss": 1.221, + "epoch": 1.09, + "grad_norm": 9.610301971435547, + "learning_rate": 1.2756557754256788e-05, + "loss": 0.2851, "step": 8658 }, { - "epoch": 2.6, - "grad_norm": 15.61431884765625, - "learning_rate": 2.6440813871905385e-06, - "loss": 0.8718, + "epoch": 1.09, + "grad_norm": 14.493852615356445, + "learning_rate": 1.2755721039200102e-05, + "loss": 2.127, "step": 8659 }, { - "epoch": 2.6, - "grad_norm": 16.955713272094727, - "learning_rate": 2.642076776586148e-06, - "loss": 1.0111, + "epoch": 1.09, + "grad_norm": 9.423722267150879, + "learning_rate": 1.2754884324143415e-05, + "loss": 1.8575, "step": 8660 }, { - "epoch": 2.6, - "grad_norm": 17.068771362304688, - "learning_rate": 2.6400721659817587e-06, - "loss": 1.5695, + "epoch": 1.09, + "grad_norm": 21.096939086914062, + "learning_rate": 1.2754047609086726e-05, + "loss": 2.0822, "step": 8661 }, { - "epoch": 2.6, - "grad_norm": 10.242925643920898, - "learning_rate": 2.6380675553773684e-06, - "loss": 1.4754, + "epoch": 1.09, + "grad_norm": 14.859004974365234, + "learning_rate": 1.275321089403004e-05, + "loss": 2.9309, "step": 8662 }, { - "epoch": 2.6, - "grad_norm": 30.505434036254883, - "learning_rate": 2.636062944772978e-06, - "loss": 1.1623, + "epoch": 1.09, + "grad_norm": 12.358412742614746, + "learning_rate": 1.2752374178973353e-05, + "loss": 0.5245, "step": 8663 }, { - "epoch": 2.6, - "grad_norm": 44.106178283691406, - "learning_rate": 2.6340583341685877e-06, - "loss": 2.1524, + "epoch": 1.09, + "grad_norm": 20.174772262573242, + "learning_rate": 1.2751537463916663e-05, + "loss": 1.7873, "step": 8664 }, { - "epoch": 2.61, - "grad_norm": 15.738425254821777, - "learning_rate": 2.6320537235641978e-06, - "loss": 1.3509, + "epoch": 1.09, + "grad_norm": 13.297382354736328, + "learning_rate": 1.2750700748859977e-05, + "loss": 3.0238, "step": 8665 }, { - "epoch": 2.61, - "grad_norm": 37.473567962646484, - "learning_rate": 2.630049112959808e-06, - "loss": 1.1996, + "epoch": 1.09, + "grad_norm": 15.283510208129883, + "learning_rate": 1.274986403380329e-05, + "loss": 1.0246, "step": 8666 }, { - "epoch": 2.61, - "grad_norm": 20.072607040405273, - "learning_rate": 2.6280445023554175e-06, - "loss": 1.1615, + "epoch": 1.09, + "grad_norm": 31.507217407226562, + "learning_rate": 1.2749027318746602e-05, + "loss": 1.2635, "step": 8667 }, { - "epoch": 2.61, - "grad_norm": 14.636398315429688, - "learning_rate": 2.6260398917510276e-06, - "loss": 0.863, + "epoch": 1.09, + "grad_norm": 11.496175765991211, + "learning_rate": 1.2748190603689914e-05, + "loss": 2.3934, "step": 8668 }, { - "epoch": 2.61, - "grad_norm": 9.129220008850098, - "learning_rate": 2.6240352811466373e-06, - "loss": 1.157, + "epoch": 1.09, + "grad_norm": 9.032344818115234, + "learning_rate": 1.2747353888633226e-05, + "loss": 1.3367, "step": 8669 }, { - "epoch": 2.61, - "grad_norm": 19.694377899169922, - "learning_rate": 2.6220306705422473e-06, - "loss": 0.8906, + "epoch": 1.09, + "grad_norm": 6.9487786293029785, + "learning_rate": 1.274651717357654e-05, + "loss": 1.802, "step": 8670 }, { - "epoch": 2.61, - "grad_norm": 13.88318157196045, - "learning_rate": 2.6200260599378574e-06, - "loss": 1.343, + "epoch": 1.09, + "grad_norm": 33.17593002319336, + "learning_rate": 1.2745680458519852e-05, + "loss": 2.1467, "step": 8671 }, { - "epoch": 2.61, - "grad_norm": 11.803475379943848, - "learning_rate": 2.618021449333467e-06, - "loss": 1.4005, + "epoch": 1.09, + "grad_norm": 13.267254829406738, + "learning_rate": 1.2744843743463164e-05, + "loss": 1.2365, "step": 8672 }, { - "epoch": 2.61, - "grad_norm": 18.531572341918945, - "learning_rate": 2.6160168387290767e-06, - "loss": 1.3321, + "epoch": 1.09, + "grad_norm": 11.520238876342773, + "learning_rate": 1.2744007028406478e-05, + "loss": 1.512, "step": 8673 }, { - "epoch": 2.61, - "grad_norm": 6.691076278686523, - "learning_rate": 2.6140122281246872e-06, - "loss": 0.7632, + "epoch": 1.09, + "grad_norm": 19.45894432067871, + "learning_rate": 1.2743170313349791e-05, + "loss": 2.1284, "step": 8674 }, { - "epoch": 2.61, - "grad_norm": 8.980220794677734, - "learning_rate": 2.612007617520297e-06, - "loss": 0.7735, + "epoch": 1.09, + "grad_norm": 14.671815872192383, + "learning_rate": 1.2742333598293101e-05, + "loss": 1.9271, "step": 8675 }, { - "epoch": 2.61, - "grad_norm": 8.376172065734863, - "learning_rate": 2.6100030069159066e-06, - "loss": 1.0655, + "epoch": 1.09, + "grad_norm": 11.830839157104492, + "learning_rate": 1.2741496883236415e-05, + "loss": 2.0808, "step": 8676 }, { - "epoch": 2.61, - "grad_norm": 10.30313491821289, - "learning_rate": 2.6079983963115167e-06, - "loss": 1.6599, + "epoch": 1.09, + "grad_norm": 6.510457515716553, + "learning_rate": 1.2740660168179729e-05, + "loss": 1.049, "step": 8677 }, { - "epoch": 2.61, - "grad_norm": 23.8471622467041, - "learning_rate": 2.6059937857071267e-06, - "loss": 0.579, + "epoch": 1.09, + "grad_norm": 11.446674346923828, + "learning_rate": 1.2739823453123039e-05, + "loss": 0.7896, "step": 8678 }, { - "epoch": 2.61, - "grad_norm": 25.849637985229492, - "learning_rate": 2.6039891751027364e-06, - "loss": 1.8009, + "epoch": 1.09, + "grad_norm": 9.457968711853027, + "learning_rate": 1.2738986738066353e-05, + "loss": 1.7783, "step": 8679 }, { - "epoch": 2.61, - "grad_norm": 26.335845947265625, - "learning_rate": 2.6019845644983465e-06, - "loss": 1.6794, + "epoch": 1.09, + "grad_norm": 13.267879486083984, + "learning_rate": 1.2738150023009666e-05, + "loss": 1.7797, "step": 8680 }, { - "epoch": 2.61, - "grad_norm": 21.52005386352539, - "learning_rate": 2.599979953893956e-06, - "loss": 1.3504, + "epoch": 1.09, + "grad_norm": 10.053738594055176, + "learning_rate": 1.2737313307952978e-05, + "loss": 1.3612, "step": 8681 }, { - "epoch": 2.61, - "grad_norm": 24.20670509338379, - "learning_rate": 2.5979753432895662e-06, - "loss": 0.9255, + "epoch": 1.09, + "grad_norm": 10.973382949829102, + "learning_rate": 1.273647659289629e-05, + "loss": 1.8119, "step": 8682 }, { - "epoch": 2.61, - "grad_norm": 105.50646209716797, - "learning_rate": 2.5959707326851763e-06, - "loss": 2.7274, + "epoch": 1.09, + "grad_norm": 19.048812866210938, + "learning_rate": 1.2735639877839602e-05, + "loss": 1.8454, "step": 8683 }, { - "epoch": 2.61, - "grad_norm": 28.476221084594727, - "learning_rate": 2.593966122080786e-06, - "loss": 2.7293, + "epoch": 1.09, + "grad_norm": 70.28934478759766, + "learning_rate": 1.2734803162782916e-05, + "loss": 0.5382, "step": 8684 }, { - "epoch": 2.61, - "grad_norm": 23.278079986572266, - "learning_rate": 2.5919615114763956e-06, - "loss": 2.1648, + "epoch": 1.09, + "grad_norm": 30.25827980041504, + "learning_rate": 1.2733966447726228e-05, + "loss": 2.7756, "step": 8685 }, { - "epoch": 2.61, - "grad_norm": 25.680274963378906, - "learning_rate": 2.589956900872006e-06, - "loss": 1.5944, + "epoch": 1.09, + "grad_norm": 24.014326095581055, + "learning_rate": 1.273312973266954e-05, + "loss": 0.9742, "step": 8686 }, { - "epoch": 2.61, - "grad_norm": 16.657413482666016, - "learning_rate": 2.587952290267616e-06, - "loss": 0.8337, + "epoch": 1.09, + "grad_norm": 6.420015335083008, + "learning_rate": 1.2732293017612853e-05, + "loss": 1.4453, "step": 8687 }, { - "epoch": 2.61, - "grad_norm": 24.16915512084961, - "learning_rate": 2.5859476796632255e-06, - "loss": 1.168, + "epoch": 1.09, + "grad_norm": 8.522665977478027, + "learning_rate": 1.2731456302556167e-05, + "loss": 1.9186, "step": 8688 }, { - "epoch": 2.61, - "grad_norm": 13.357244491577148, - "learning_rate": 2.5839430690588355e-06, - "loss": 0.9006, + "epoch": 1.09, + "grad_norm": 18.01955223083496, + "learning_rate": 1.2730619587499477e-05, + "loss": 1.371, "step": 8689 }, { - "epoch": 2.61, - "grad_norm": 17.03959083557129, - "learning_rate": 2.5819384584544456e-06, - "loss": 1.3754, + "epoch": 1.09, + "grad_norm": 11.156051635742188, + "learning_rate": 1.2729782872442791e-05, + "loss": 0.5492, "step": 8690 }, { - "epoch": 2.61, - "grad_norm": 14.713468551635742, - "learning_rate": 2.5799338478500553e-06, - "loss": 1.3571, + "epoch": 1.09, + "grad_norm": 15.096097946166992, + "learning_rate": 1.2728946157386105e-05, + "loss": 1.4618, "step": 8691 }, { - "epoch": 2.61, - "grad_norm": 48.19596481323242, - "learning_rate": 2.5779292372456654e-06, - "loss": 1.8712, + "epoch": 1.09, + "grad_norm": 15.180127143859863, + "learning_rate": 1.2728109442329415e-05, + "loss": 1.2357, "step": 8692 }, { - "epoch": 2.61, - "grad_norm": 42.44739532470703, - "learning_rate": 2.575924626641275e-06, - "loss": 1.2746, + "epoch": 1.09, + "grad_norm": 4.636838436126709, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.4605, "step": 8693 }, { - "epoch": 2.61, - "grad_norm": 35.6419792175293, - "learning_rate": 2.5739200160368847e-06, - "loss": 1.5702, + "epoch": 1.09, + "grad_norm": 24.56222152709961, + "learning_rate": 1.2726436012216042e-05, + "loss": 2.0397, "step": 8694 }, { - "epoch": 2.61, - "grad_norm": 19.48603630065918, - "learning_rate": 2.571915405432495e-06, - "loss": 1.9149, + "epoch": 1.09, + "grad_norm": 8.664571762084961, + "learning_rate": 1.2725599297159354e-05, + "loss": 1.411, "step": 8695 }, { - "epoch": 2.61, - "grad_norm": 15.384188652038574, - "learning_rate": 2.569910794828105e-06, - "loss": 1.4403, + "epoch": 1.09, + "grad_norm": 9.557058334350586, + "learning_rate": 1.2724762582102666e-05, + "loss": 1.119, "step": 8696 }, { - "epoch": 2.61, - "grad_norm": 18.034732818603516, - "learning_rate": 2.5679061842237145e-06, - "loss": 1.2765, + "epoch": 1.09, + "grad_norm": 24.020355224609375, + "learning_rate": 1.2723925867045978e-05, + "loss": 3.312, "step": 8697 }, { - "epoch": 2.62, - "grad_norm": 12.634345054626465, - "learning_rate": 2.565901573619325e-06, - "loss": 1.0044, + "epoch": 1.09, + "grad_norm": 40.546783447265625, + "learning_rate": 1.2723089151989292e-05, + "loss": 1.64, "step": 8698 }, { - "epoch": 2.62, - "grad_norm": 12.71899127960205, - "learning_rate": 2.5638969630149347e-06, - "loss": 1.3239, + "epoch": 1.09, + "grad_norm": 7.633327960968018, + "learning_rate": 1.2722252436932604e-05, + "loss": 0.794, "step": 8699 }, { - "epoch": 2.62, - "grad_norm": 597.7841186523438, - "learning_rate": 2.5618923524105444e-06, - "loss": 1.6701, + "epoch": 1.09, + "grad_norm": 21.85679817199707, + "learning_rate": 1.2721415721875916e-05, + "loss": 1.4378, "step": 8700 }, { - "epoch": 2.62, - "grad_norm": 16.543411254882812, - "learning_rate": 2.5598877418061544e-06, - "loss": 1.4786, + "epoch": 1.09, + "grad_norm": 19.40522575378418, + "learning_rate": 1.272057900681923e-05, + "loss": 2.0973, "step": 8701 }, { - "epoch": 2.62, - "grad_norm": 48.39625930786133, - "learning_rate": 2.5578831312017645e-06, - "loss": 1.538, + "epoch": 1.09, + "grad_norm": 22.19701385498047, + "learning_rate": 1.2719742291762543e-05, + "loss": 1.544, "step": 8702 }, { - "epoch": 2.62, - "grad_norm": 10.015215873718262, - "learning_rate": 2.555878520597374e-06, - "loss": 1.4655, + "epoch": 1.09, + "grad_norm": 6.121273040771484, + "learning_rate": 1.2718905576705853e-05, + "loss": 0.5231, "step": 8703 }, { - "epoch": 2.62, - "grad_norm": 18.111173629760742, - "learning_rate": 2.5538739099929843e-06, - "loss": 1.539, + "epoch": 1.09, + "grad_norm": 61.69236373901367, + "learning_rate": 1.2718068861649167e-05, + "loss": 1.9814, "step": 8704 }, { - "epoch": 2.62, - "grad_norm": 34.77145767211914, - "learning_rate": 2.551869299388594e-06, - "loss": 1.5545, + "epoch": 1.09, + "grad_norm": 7.5420122146606445, + "learning_rate": 1.271723214659248e-05, + "loss": 0.546, "step": 8705 }, { - "epoch": 2.62, - "grad_norm": 17.934062957763672, - "learning_rate": 2.5498646887842036e-06, - "loss": 1.3262, + "epoch": 1.09, + "grad_norm": 8.4906005859375, + "learning_rate": 1.271639543153579e-05, + "loss": 0.6149, "step": 8706 }, { - "epoch": 2.62, - "grad_norm": 14.223177909851074, - "learning_rate": 2.5478600781798137e-06, - "loss": 2.0243, + "epoch": 1.09, + "grad_norm": 20.686519622802734, + "learning_rate": 1.2715558716479104e-05, + "loss": 2.3446, "step": 8707 }, { - "epoch": 2.62, - "grad_norm": 21.776586532592773, - "learning_rate": 2.5458554675754238e-06, - "loss": 1.3405, + "epoch": 1.09, + "grad_norm": 61.66415786743164, + "learning_rate": 1.2714722001422416e-05, + "loss": 2.3224, "step": 8708 }, { - "epoch": 2.62, - "grad_norm": 10.819275856018066, - "learning_rate": 2.5438508569710334e-06, - "loss": 1.488, + "epoch": 1.09, + "grad_norm": 6.78904390335083, + "learning_rate": 1.271388528636573e-05, + "loss": 0.6229, "step": 8709 }, { - "epoch": 2.62, - "grad_norm": 12.452916145324707, - "learning_rate": 2.541846246366643e-06, - "loss": 1.3601, + "epoch": 1.09, + "grad_norm": 27.501174926757812, + "learning_rate": 1.2713048571309042e-05, + "loss": 1.8172, "step": 8710 }, { - "epoch": 2.62, - "grad_norm": 13.645332336425781, - "learning_rate": 2.5398416357622536e-06, - "loss": 1.2305, + "epoch": 1.09, + "grad_norm": 16.328968048095703, + "learning_rate": 1.2712211856252354e-05, + "loss": 1.8874, "step": 8711 }, { - "epoch": 2.62, - "grad_norm": 157.49203491210938, - "learning_rate": 2.5378370251578633e-06, - "loss": 1.5407, + "epoch": 1.09, + "grad_norm": 6.196768760681152, + "learning_rate": 1.2711375141195667e-05, + "loss": 0.6491, "step": 8712 }, { - "epoch": 2.62, - "grad_norm": 15.182050704956055, - "learning_rate": 2.535832414553473e-06, - "loss": 0.874, + "epoch": 1.09, + "grad_norm": 9.383679389953613, + "learning_rate": 1.271053842613898e-05, + "loss": 1.6176, "step": 8713 }, { - "epoch": 2.62, - "grad_norm": 16.068603515625, - "learning_rate": 2.5338278039490834e-06, - "loss": 1.3392, + "epoch": 1.09, + "grad_norm": 12.209012985229492, + "learning_rate": 1.2709701711082291e-05, + "loss": 1.4438, "step": 8714 }, { - "epoch": 2.62, - "grad_norm": 11.687101364135742, - "learning_rate": 2.531823193344693e-06, - "loss": 0.9513, + "epoch": 1.09, + "grad_norm": 16.936748504638672, + "learning_rate": 1.2708864996025605e-05, + "loss": 1.1826, "step": 8715 }, { - "epoch": 2.62, - "grad_norm": 16.209623336791992, - "learning_rate": 2.5298185827403027e-06, - "loss": 0.8897, + "epoch": 1.09, + "grad_norm": 18.694360733032227, + "learning_rate": 1.2708028280968919e-05, + "loss": 1.9717, "step": 8716 }, { - "epoch": 2.62, - "grad_norm": 42.321136474609375, - "learning_rate": 2.527813972135913e-06, - "loss": 2.2684, + "epoch": 1.09, + "grad_norm": 8.575372695922852, + "learning_rate": 1.2707191565912229e-05, + "loss": 1.418, "step": 8717 }, { - "epoch": 2.62, - "grad_norm": 19.06346321105957, - "learning_rate": 2.5258093615315225e-06, - "loss": 1.3514, + "epoch": 1.09, + "grad_norm": 17.063716888427734, + "learning_rate": 1.2706354850855543e-05, + "loss": 0.9269, "step": 8718 }, { - "epoch": 2.62, - "grad_norm": 10.905227661132812, - "learning_rate": 2.5238047509271326e-06, - "loss": 0.6565, + "epoch": 1.09, + "grad_norm": 12.043959617614746, + "learning_rate": 1.2705518135798856e-05, + "loss": 1.7686, "step": 8719 }, { - "epoch": 2.62, - "grad_norm": 17.082691192626953, - "learning_rate": 2.5218001403227427e-06, - "loss": 2.4891, + "epoch": 1.09, + "grad_norm": 34.632266998291016, + "learning_rate": 1.2704681420742166e-05, + "loss": 1.5823, "step": 8720 }, { - "epoch": 2.62, - "grad_norm": 22.291873931884766, - "learning_rate": 2.5197955297183523e-06, - "loss": 2.0741, + "epoch": 1.09, + "grad_norm": 5.508233547210693, + "learning_rate": 1.270384470568548e-05, + "loss": 1.2093, "step": 8721 }, { - "epoch": 2.62, - "grad_norm": 9.077491760253906, - "learning_rate": 2.517790919113962e-06, - "loss": 1.115, + "epoch": 1.09, + "grad_norm": 4.57993745803833, + "learning_rate": 1.2703007990628792e-05, + "loss": 1.6767, "step": 8722 }, { - "epoch": 2.62, - "grad_norm": 10.916313171386719, - "learning_rate": 2.5157863085095725e-06, - "loss": 1.4171, + "epoch": 1.09, + "grad_norm": 15.940275192260742, + "learning_rate": 1.2702171275572106e-05, + "loss": 0.6244, "step": 8723 }, { - "epoch": 2.62, - "grad_norm": 54.628211975097656, - "learning_rate": 2.513781697905182e-06, - "loss": 0.9545, + "epoch": 1.09, + "grad_norm": 8.179159164428711, + "learning_rate": 1.2701334560515418e-05, + "loss": 0.5239, "step": 8724 }, { - "epoch": 2.62, - "grad_norm": 11.69243049621582, - "learning_rate": 2.511777087300792e-06, - "loss": 0.7757, + "epoch": 1.09, + "grad_norm": 8.999481201171875, + "learning_rate": 1.270049784545873e-05, + "loss": 1.9757, "step": 8725 }, { - "epoch": 2.62, - "grad_norm": 23.144771575927734, - "learning_rate": 2.5097724766964023e-06, - "loss": 1.4352, + "epoch": 1.1, + "grad_norm": 9.038658142089844, + "learning_rate": 1.2699661130402043e-05, + "loss": 1.1503, "step": 8726 }, { - "epoch": 2.62, - "grad_norm": 23.521812438964844, - "learning_rate": 2.507767866092012e-06, - "loss": 1.276, + "epoch": 1.1, + "grad_norm": 20.042102813720703, + "learning_rate": 1.2698824415345354e-05, + "loss": 2.745, "step": 8727 }, { - "epoch": 2.62, - "grad_norm": 29.313697814941406, - "learning_rate": 2.5057632554876216e-06, - "loss": 1.4241, + "epoch": 1.1, + "grad_norm": 40.724525451660156, + "learning_rate": 1.2697987700288667e-05, + "loss": 1.8879, "step": 8728 }, { - "epoch": 2.62, - "grad_norm": 5.5264716148376465, - "learning_rate": 2.5037586448832317e-06, - "loss": 0.6715, + "epoch": 1.1, + "grad_norm": 22.703800201416016, + "learning_rate": 1.2697150985231981e-05, + "loss": 1.8462, "step": 8729 }, { - "epoch": 2.62, - "grad_norm": 107.75334167480469, - "learning_rate": 2.5017540342788414e-06, - "loss": 2.3175, + "epoch": 1.1, + "grad_norm": 13.04196834564209, + "learning_rate": 1.2696314270175295e-05, + "loss": 1.4649, "step": 8730 }, { - "epoch": 2.63, - "grad_norm": 16.5670108795166, - "learning_rate": 2.4997494236744515e-06, - "loss": 1.7357, + "epoch": 1.1, + "grad_norm": 6.677236080169678, + "learning_rate": 1.2695477555118605e-05, + "loss": 0.3258, "step": 8731 }, { - "epoch": 2.63, - "grad_norm": 25.61155128479004, - "learning_rate": 2.497744813070061e-06, - "loss": 1.3919, + "epoch": 1.1, + "grad_norm": 9.752840995788574, + "learning_rate": 1.2694640840061918e-05, + "loss": 0.7642, "step": 8732 }, { - "epoch": 2.63, - "grad_norm": 12.228818893432617, - "learning_rate": 2.495740202465671e-06, - "loss": 2.4026, + "epoch": 1.1, + "grad_norm": 7.2943196296691895, + "learning_rate": 1.2693804125005232e-05, + "loss": 0.4322, "step": 8733 }, { - "epoch": 2.63, - "grad_norm": 16.512880325317383, - "learning_rate": 2.4937355918612813e-06, - "loss": 1.9608, + "epoch": 1.1, + "grad_norm": 21.28787612915039, + "learning_rate": 1.2692967409948542e-05, + "loss": 1.2366, "step": 8734 }, { - "epoch": 2.63, - "grad_norm": 34.81612014770508, - "learning_rate": 2.491730981256891e-06, - "loss": 1.0885, + "epoch": 1.1, + "grad_norm": 113.95024108886719, + "learning_rate": 1.2692130694891856e-05, + "loss": 0.6453, "step": 8735 }, { - "epoch": 2.63, - "grad_norm": 31.149940490722656, - "learning_rate": 2.489726370652501e-06, - "loss": 1.0149, + "epoch": 1.1, + "grad_norm": 17.61992835998535, + "learning_rate": 1.2691293979835168e-05, + "loss": 3.0286, "step": 8736 }, { - "epoch": 2.63, - "grad_norm": 13.102104187011719, - "learning_rate": 2.4877217600481107e-06, - "loss": 1.5993, + "epoch": 1.1, + "grad_norm": 12.428811073303223, + "learning_rate": 1.2690457264778482e-05, + "loss": 2.1907, "step": 8737 }, { - "epoch": 2.63, - "grad_norm": 18.59217071533203, - "learning_rate": 2.4857171494437208e-06, - "loss": 1.3812, + "epoch": 1.1, + "grad_norm": 10.521194458007812, + "learning_rate": 1.2689620549721794e-05, + "loss": 0.8512, "step": 8738 }, { - "epoch": 2.63, - "grad_norm": 15.023920059204102, - "learning_rate": 2.483712538839331e-06, - "loss": 1.0019, + "epoch": 1.1, + "grad_norm": 17.262094497680664, + "learning_rate": 1.2688783834665105e-05, + "loss": 1.7099, "step": 8739 }, { - "epoch": 2.63, - "grad_norm": 15.331181526184082, - "learning_rate": 2.4817079282349405e-06, - "loss": 1.5826, + "epoch": 1.1, + "grad_norm": 8.695687294006348, + "learning_rate": 1.2687947119608419e-05, + "loss": 0.7872, "step": 8740 }, { - "epoch": 2.63, - "grad_norm": 32.79924774169922, - "learning_rate": 2.4797033176305506e-06, - "loss": 2.171, + "epoch": 1.1, + "grad_norm": 4.679690837860107, + "learning_rate": 1.268711040455173e-05, + "loss": 0.223, "step": 8741 }, { - "epoch": 2.63, - "grad_norm": 11.717575073242188, - "learning_rate": 2.4776987070261603e-06, - "loss": 0.7835, + "epoch": 1.1, + "grad_norm": 7.748942852020264, + "learning_rate": 1.2686273689495043e-05, + "loss": 0.763, "step": 8742 }, { - "epoch": 2.63, - "grad_norm": 19.555763244628906, - "learning_rate": 2.4756940964217704e-06, - "loss": 2.7394, + "epoch": 1.1, + "grad_norm": 10.922325134277344, + "learning_rate": 1.2685436974438357e-05, + "loss": 2.3854, "step": 8743 }, { - "epoch": 2.63, - "grad_norm": 37.40543746948242, - "learning_rate": 2.47368948581738e-06, - "loss": 2.1193, + "epoch": 1.1, + "grad_norm": 11.161537170410156, + "learning_rate": 1.2684600259381667e-05, + "loss": 1.1113, "step": 8744 }, { - "epoch": 2.63, - "grad_norm": 18.472028732299805, - "learning_rate": 2.47168487521299e-06, - "loss": 1.0426, + "epoch": 1.1, + "grad_norm": 16.115772247314453, + "learning_rate": 1.268376354432498e-05, + "loss": 1.7661, "step": 8745 }, { - "epoch": 2.63, - "grad_norm": 15.350752830505371, - "learning_rate": 2.4696802646085998e-06, - "loss": 1.0137, + "epoch": 1.1, + "grad_norm": 14.176069259643555, + "learning_rate": 1.2682926829268294e-05, + "loss": 0.4978, "step": 8746 }, { - "epoch": 2.63, - "grad_norm": 12.595955848693848, - "learning_rate": 2.46767565400421e-06, - "loss": 1.6324, + "epoch": 1.1, + "grad_norm": 8.118674278259277, + "learning_rate": 1.2682090114211608e-05, + "loss": 1.0084, "step": 8747 }, { - "epoch": 2.63, - "grad_norm": 29.87061309814453, - "learning_rate": 2.46567104339982e-06, - "loss": 1.5554, + "epoch": 1.1, + "grad_norm": 24.346820831298828, + "learning_rate": 1.2681253399154918e-05, + "loss": 1.9808, "step": 8748 }, { - "epoch": 2.63, - "grad_norm": 17.269176483154297, - "learning_rate": 2.4636664327954296e-06, - "loss": 1.6175, + "epoch": 1.1, + "grad_norm": 33.79582977294922, + "learning_rate": 1.2680416684098232e-05, + "loss": 1.0062, "step": 8749 }, { - "epoch": 2.63, - "grad_norm": 16.87700653076172, - "learning_rate": 2.4616618221910397e-06, - "loss": 1.0128, + "epoch": 1.1, + "grad_norm": 17.37476921081543, + "learning_rate": 1.2679579969041544e-05, + "loss": 1.4613, "step": 8750 }, { - "epoch": 2.63, - "grad_norm": 10.00948429107666, - "learning_rate": 2.4596572115866498e-06, - "loss": 0.9691, + "epoch": 1.1, + "grad_norm": 8.64911937713623, + "learning_rate": 1.2678743253984856e-05, + "loss": 1.0513, "step": 8751 }, { - "epoch": 2.63, - "grad_norm": 21.35502815246582, - "learning_rate": 2.4576526009822594e-06, - "loss": 1.3124, + "epoch": 1.1, + "grad_norm": 12.104917526245117, + "learning_rate": 1.267790653892817e-05, + "loss": 1.5139, "step": 8752 }, { - "epoch": 2.63, - "grad_norm": 11.689464569091797, - "learning_rate": 2.4556479903778695e-06, - "loss": 1.3952, + "epoch": 1.1, + "grad_norm": 27.262914657592773, + "learning_rate": 1.2677069823871481e-05, + "loss": 1.7941, "step": 8753 }, { - "epoch": 2.63, - "grad_norm": 141.92120361328125, - "learning_rate": 2.453643379773479e-06, - "loss": 2.629, + "epoch": 1.1, + "grad_norm": 13.147500991821289, + "learning_rate": 1.2676233108814795e-05, + "loss": 1.0754, "step": 8754 }, { - "epoch": 2.63, - "grad_norm": 26.330507278442383, - "learning_rate": 2.4516387691690893e-06, - "loss": 3.4614, + "epoch": 1.1, + "grad_norm": 8.71116828918457, + "learning_rate": 1.2675396393758105e-05, + "loss": 0.5459, "step": 8755 }, { - "epoch": 2.63, - "grad_norm": 10.556838035583496, - "learning_rate": 2.449634158564699e-06, - "loss": 1.7251, + "epoch": 1.1, + "grad_norm": 63.87320327758789, + "learning_rate": 1.2674559678701419e-05, + "loss": 1.7561, "step": 8756 }, { - "epoch": 2.63, - "grad_norm": 45.57450485229492, - "learning_rate": 2.4476295479603086e-06, - "loss": 1.7645, + "epoch": 1.1, + "grad_norm": 5.747971057891846, + "learning_rate": 1.2673722963644733e-05, + "loss": 0.4531, "step": 8757 }, { - "epoch": 2.63, - "grad_norm": 11.817948341369629, - "learning_rate": 2.4456249373559187e-06, - "loss": 1.0033, + "epoch": 1.1, + "grad_norm": 11.756169319152832, + "learning_rate": 1.2672886248588043e-05, + "loss": 0.7117, "step": 8758 }, { - "epoch": 2.63, - "grad_norm": 7.083280563354492, - "learning_rate": 2.4436203267515287e-06, - "loss": 0.7292, + "epoch": 1.1, + "grad_norm": 71.47728729248047, + "learning_rate": 1.2672049533531356e-05, + "loss": 2.1309, "step": 8759 }, { - "epoch": 2.63, - "grad_norm": 18.485204696655273, - "learning_rate": 2.4416157161471384e-06, - "loss": 1.5034, - "step": 8760 - }, - { - "epoch": 2.63, - "eval_loss": 0.163481205701828, - "eval_runtime": 43.3055, - "eval_samples_per_second": 34.153, - "eval_steps_per_second": 34.153, + "epoch": 1.1, + "grad_norm": 5.626829624176025, + "learning_rate": 1.267121281847467e-05, + "loss": 0.6731, "step": 8760 }, { - "epoch": 2.63, - "grad_norm": 21.252248764038086, - "learning_rate": 2.4396111055427485e-06, - "loss": 1.1208, + "epoch": 1.1, + "grad_norm": 11.510431289672852, + "learning_rate": 1.2670376103417982e-05, + "loss": 0.9846, "step": 8761 }, { - "epoch": 2.63, - "grad_norm": 16.638259887695312, - "learning_rate": 2.4376064949383586e-06, - "loss": 1.8795, + "epoch": 1.1, + "grad_norm": 125.09648895263672, + "learning_rate": 1.2669539388361294e-05, + "loss": 0.6987, "step": 8762 }, { - "epoch": 2.63, - "grad_norm": 20.396753311157227, - "learning_rate": 2.4356018843339682e-06, - "loss": 1.5712, + "epoch": 1.1, + "grad_norm": 12.258779525756836, + "learning_rate": 1.2668702673304608e-05, + "loss": 2.082, "step": 8763 }, { - "epoch": 2.63, - "grad_norm": 7.8299455642700195, - "learning_rate": 2.4335972737295783e-06, - "loss": 0.7808, + "epoch": 1.1, + "grad_norm": 13.310866355895996, + "learning_rate": 1.266786595824792e-05, + "loss": 1.4755, "step": 8764 }, { - "epoch": 2.64, - "grad_norm": 11.673842430114746, - "learning_rate": 2.4315926631251884e-06, - "loss": 0.8884, + "epoch": 1.1, + "grad_norm": 7.459881782531738, + "learning_rate": 1.2667029243191232e-05, + "loss": 0.6761, "step": 8765 }, { - "epoch": 2.64, - "grad_norm": 40.24734115600586, - "learning_rate": 2.429588052520798e-06, - "loss": 2.0446, + "epoch": 1.1, + "grad_norm": 18.733461380004883, + "learning_rate": 1.2666192528134544e-05, + "loss": 0.7805, "step": 8766 }, { - "epoch": 2.64, - "grad_norm": 29.379608154296875, - "learning_rate": 2.4275834419164077e-06, - "loss": 1.6934, + "epoch": 1.1, + "grad_norm": 4.180849075317383, + "learning_rate": 1.2665355813077857e-05, + "loss": 0.3807, "step": 8767 }, { - "epoch": 2.64, - "grad_norm": 66.81684112548828, - "learning_rate": 2.425578831312018e-06, - "loss": 1.426, + "epoch": 1.1, + "grad_norm": 10.724422454833984, + "learning_rate": 1.266451909802117e-05, + "loss": 2.1114, "step": 8768 }, { - "epoch": 2.64, - "grad_norm": 81.16704559326172, - "learning_rate": 2.4235742207076275e-06, - "loss": 1.3078, + "epoch": 1.1, + "grad_norm": 8.880762100219727, + "learning_rate": 1.2663682382964481e-05, + "loss": 1.4725, "step": 8769 }, { - "epoch": 2.64, - "grad_norm": 13.74372386932373, - "learning_rate": 2.4215696101032376e-06, - "loss": 1.543, + "epoch": 1.1, + "grad_norm": 10.20962142944336, + "learning_rate": 1.2662845667907795e-05, + "loss": 1.715, "step": 8770 }, { - "epoch": 2.64, - "grad_norm": 9.779702186584473, - "learning_rate": 2.4195649994988476e-06, - "loss": 1.0268, + "epoch": 1.1, + "grad_norm": 30.03700065612793, + "learning_rate": 1.2662008952851108e-05, + "loss": 1.6719, "step": 8771 }, { - "epoch": 2.64, - "grad_norm": 14.754984855651855, - "learning_rate": 2.4175603888944573e-06, - "loss": 1.7506, + "epoch": 1.1, + "grad_norm": 33.68233108520508, + "learning_rate": 1.2661172237794419e-05, + "loss": 2.2797, "step": 8772 }, { - "epoch": 2.64, - "grad_norm": 29.609939575195312, - "learning_rate": 2.4155557782900674e-06, - "loss": 2.8884, + "epoch": 1.1, + "grad_norm": 14.828858375549316, + "learning_rate": 1.2660335522737732e-05, + "loss": 0.8591, "step": 8773 }, { - "epoch": 2.64, - "grad_norm": 100.57302856445312, - "learning_rate": 2.4135511676856775e-06, - "loss": 1.1484, + "epoch": 1.1, + "grad_norm": 7.947477340698242, + "learning_rate": 1.2659498807681046e-05, + "loss": 0.9196, "step": 8774 }, { - "epoch": 2.64, - "grad_norm": 29.211971282958984, - "learning_rate": 2.411546557081287e-06, - "loss": 2.1784, + "epoch": 1.1, + "grad_norm": 9.227246284484863, + "learning_rate": 1.2658662092624358e-05, + "loss": 2.4799, "step": 8775 }, { - "epoch": 2.64, - "grad_norm": 31.128265380859375, - "learning_rate": 2.409541946476897e-06, - "loss": 1.6176, + "epoch": 1.1, + "grad_norm": 5.3799309730529785, + "learning_rate": 1.265782537756767e-05, + "loss": 1.7289, "step": 8776 }, { - "epoch": 2.64, - "grad_norm": 16.750701904296875, - "learning_rate": 2.4075373358725073e-06, - "loss": 0.8013, + "epoch": 1.1, + "grad_norm": 3.43022084236145, + "learning_rate": 1.2656988662510983e-05, + "loss": 0.0955, "step": 8777 }, { - "epoch": 2.64, - "grad_norm": 65.17308044433594, - "learning_rate": 2.405532725268117e-06, - "loss": 1.9283, + "epoch": 1.1, + "grad_norm": 15.954379081726074, + "learning_rate": 1.2656151947454295e-05, + "loss": 0.8652, "step": 8778 }, { - "epoch": 2.64, - "grad_norm": 18.673601150512695, - "learning_rate": 2.4035281146637266e-06, - "loss": 1.3132, + "epoch": 1.1, + "grad_norm": 9.277589797973633, + "learning_rate": 1.2655315232397607e-05, + "loss": 2.1941, "step": 8779 }, { - "epoch": 2.64, - "grad_norm": 12.052127838134766, - "learning_rate": 2.4015235040593367e-06, - "loss": 0.8893, + "epoch": 1.1, + "grad_norm": 25.019664764404297, + "learning_rate": 1.265447851734092e-05, + "loss": 0.8344, "step": 8780 }, { - "epoch": 2.64, - "grad_norm": 33.23358917236328, - "learning_rate": 2.3995188934549464e-06, - "loss": 1.5921, + "epoch": 1.1, + "grad_norm": 10.636250495910645, + "learning_rate": 1.2653641802284233e-05, + "loss": 1.0796, "step": 8781 }, { - "epoch": 2.64, - "grad_norm": 32.95631408691406, - "learning_rate": 2.3975142828505564e-06, - "loss": 1.5091, + "epoch": 1.1, + "grad_norm": 39.38780212402344, + "learning_rate": 1.2652805087227547e-05, + "loss": 1.5636, "step": 8782 }, { - "epoch": 2.64, - "grad_norm": 21.939970016479492, - "learning_rate": 2.395509672246166e-06, - "loss": 1.1536, + "epoch": 1.1, + "grad_norm": 7.546035289764404, + "learning_rate": 1.2651968372170857e-05, + "loss": 0.5325, "step": 8783 }, { - "epoch": 2.64, - "grad_norm": 19.46248435974121, - "learning_rate": 2.393505061641776e-06, - "loss": 1.7254, + "epoch": 1.1, + "grad_norm": 7.118682861328125, + "learning_rate": 1.265113165711417e-05, + "loss": 1.7248, "step": 8784 }, { - "epoch": 2.64, - "grad_norm": 20.779407501220703, - "learning_rate": 2.3915004510373863e-06, - "loss": 0.6734, + "epoch": 1.1, + "grad_norm": 26.231958389282227, + "learning_rate": 1.2650294942057484e-05, + "loss": 2.1657, "step": 8785 }, { - "epoch": 2.64, - "grad_norm": 18.915145874023438, - "learning_rate": 2.389495840432996e-06, - "loss": 1.4362, + "epoch": 1.1, + "grad_norm": 7.644165515899658, + "learning_rate": 1.2649458227000794e-05, + "loss": 0.5792, "step": 8786 }, { - "epoch": 2.64, - "grad_norm": 15.002533912658691, - "learning_rate": 2.387491229828606e-06, - "loss": 1.3399, + "epoch": 1.1, + "grad_norm": 10.340425491333008, + "learning_rate": 1.2648621511944108e-05, + "loss": 1.8263, "step": 8787 }, { - "epoch": 2.64, - "grad_norm": 205.16641235351562, - "learning_rate": 2.385486619224216e-06, - "loss": 1.7637, + "epoch": 1.1, + "grad_norm": 45.51021957397461, + "learning_rate": 1.2647784796887422e-05, + "loss": 1.2768, "step": 8788 }, { - "epoch": 2.64, - "grad_norm": 16.24716567993164, - "learning_rate": 2.3834820086198258e-06, - "loss": 2.3097, + "epoch": 1.1, + "grad_norm": 20.326396942138672, + "learning_rate": 1.2646948081830734e-05, + "loss": 0.5046, "step": 8789 }, { - "epoch": 2.64, - "grad_norm": 27.199005126953125, - "learning_rate": 2.381477398015436e-06, - "loss": 2.0923, + "epoch": 1.1, + "grad_norm": 14.142060279846191, + "learning_rate": 1.2646111366774046e-05, + "loss": 1.5594, "step": 8790 }, { - "epoch": 2.64, - "grad_norm": 7.843896865844727, - "learning_rate": 2.3794727874110455e-06, - "loss": 0.556, + "epoch": 1.1, + "grad_norm": 7.47965145111084, + "learning_rate": 1.264527465171736e-05, + "loss": 0.8239, "step": 8791 }, { - "epoch": 2.64, - "grad_norm": 15.323102951049805, - "learning_rate": 2.3774681768066556e-06, - "loss": 0.9552, + "epoch": 1.1, + "grad_norm": 16.085491180419922, + "learning_rate": 1.2644437936660671e-05, + "loss": 1.5384, "step": 8792 }, { - "epoch": 2.64, - "grad_norm": 14.061708450317383, - "learning_rate": 2.3754635662022653e-06, - "loss": 1.4372, + "epoch": 1.1, + "grad_norm": 16.351972579956055, + "learning_rate": 1.2643601221603983e-05, + "loss": 0.9503, "step": 8793 }, { - "epoch": 2.64, - "grad_norm": 12.933516502380371, - "learning_rate": 2.3734589555978753e-06, - "loss": 1.2338, + "epoch": 1.1, + "grad_norm": 12.335143089294434, + "learning_rate": 1.2642764506547295e-05, + "loss": 1.9275, "step": 8794 }, { - "epoch": 2.64, - "grad_norm": 7.49314546585083, - "learning_rate": 2.371454344993485e-06, - "loss": 0.731, + "epoch": 1.1, + "grad_norm": 10.822340965270996, + "learning_rate": 1.2641927791490609e-05, + "loss": 0.9195, "step": 8795 }, { - "epoch": 2.64, - "grad_norm": 15.913431167602539, - "learning_rate": 2.369449734389095e-06, - "loss": 1.2261, + "epoch": 1.1, + "grad_norm": 12.354034423828125, + "learning_rate": 1.2641091076433922e-05, + "loss": 2.6532, "step": 8796 }, { - "epoch": 2.64, - "grad_norm": 12.59531307220459, - "learning_rate": 2.367445123784705e-06, - "loss": 1.0611, + "epoch": 1.1, + "grad_norm": 8.366477966308594, + "learning_rate": 1.2640254361377233e-05, + "loss": 0.6206, "step": 8797 }, { - "epoch": 2.65, - "grad_norm": 16.93478012084961, - "learning_rate": 2.365440513180315e-06, - "loss": 1.6995, + "epoch": 1.1, + "grad_norm": 15.607980728149414, + "learning_rate": 1.2639417646320546e-05, + "loss": 1.3698, "step": 8798 }, { - "epoch": 2.65, - "grad_norm": 11.499807357788086, - "learning_rate": 2.363435902575925e-06, - "loss": 1.3401, + "epoch": 1.1, + "grad_norm": 11.297418594360352, + "learning_rate": 1.263858093126386e-05, + "loss": 0.9642, "step": 8799 }, { - "epoch": 2.65, - "grad_norm": 33.20011901855469, - "learning_rate": 2.3614312919715346e-06, - "loss": 2.086, + "epoch": 1.1, + "grad_norm": 14.777867317199707, + "learning_rate": 1.263774421620717e-05, + "loss": 2.1886, "step": 8800 }, { - "epoch": 2.65, - "grad_norm": 10.58979606628418, - "learning_rate": 2.3594266813671447e-06, - "loss": 1.1518, + "epoch": 1.1, + "eval_loss": 0.08400662243366241, + "eval_runtime": 94.1728, + "eval_samples_per_second": 37.612, + "eval_steps_per_second": 37.612, + "step": 8800 + }, + { + "epoch": 1.1, + "grad_norm": 76.77694702148438, + "learning_rate": 1.2636907501150484e-05, + "loss": 2.4871, "step": 8801 }, { - "epoch": 2.65, - "grad_norm": 15.411652565002441, - "learning_rate": 2.3574220707627547e-06, - "loss": 0.9793, + "epoch": 1.1, + "grad_norm": 13.288673400878906, + "learning_rate": 1.2636070786093798e-05, + "loss": 0.84, "step": 8802 }, { - "epoch": 2.65, - "grad_norm": 68.01510620117188, - "learning_rate": 2.3554174601583644e-06, - "loss": 2.0531, + "epoch": 1.1, + "grad_norm": 26.608503341674805, + "learning_rate": 1.263523407103711e-05, + "loss": 1.9318, "step": 8803 }, { - "epoch": 2.65, - "grad_norm": 56.63874053955078, - "learning_rate": 2.3534128495539745e-06, - "loss": 1.1724, + "epoch": 1.1, + "grad_norm": 15.085040092468262, + "learning_rate": 1.2634397355980422e-05, + "loss": 1.5832, "step": 8804 }, { - "epoch": 2.65, - "grad_norm": 60.3148193359375, - "learning_rate": 2.351408238949584e-06, - "loss": 2.7668, + "epoch": 1.11, + "grad_norm": 15.15543270111084, + "learning_rate": 1.2633560640923735e-05, + "loss": 2.2115, "step": 8805 }, { - "epoch": 2.65, - "grad_norm": 38.45302963256836, - "learning_rate": 2.3494036283451942e-06, - "loss": 1.0094, + "epoch": 1.11, + "grad_norm": 32.635929107666016, + "learning_rate": 1.2632723925867047e-05, + "loss": 2.8508, "step": 8806 }, { - "epoch": 2.65, - "grad_norm": 10.030008316040039, - "learning_rate": 2.347399017740804e-06, - "loss": 1.0792, + "epoch": 1.11, + "grad_norm": 6.457747936248779, + "learning_rate": 1.2631887210810359e-05, + "loss": 0.6168, "step": 8807 }, { - "epoch": 2.65, - "grad_norm": 10.08592700958252, - "learning_rate": 2.345394407136414e-06, - "loss": 0.7964, + "epoch": 1.11, + "grad_norm": 6.022579193115234, + "learning_rate": 1.2631050495753671e-05, + "loss": 1.2132, "step": 8808 }, { - "epoch": 2.65, - "grad_norm": 125.09164428710938, - "learning_rate": 2.3433897965320236e-06, - "loss": 3.0581, + "epoch": 1.11, + "grad_norm": 15.227607727050781, + "learning_rate": 1.2630213780696985e-05, + "loss": 1.8502, "step": 8809 }, { - "epoch": 2.65, - "grad_norm": 19.070087432861328, - "learning_rate": 2.3413851859276337e-06, - "loss": 0.8685, + "epoch": 1.11, + "grad_norm": 25.35992431640625, + "learning_rate": 1.2629377065640298e-05, + "loss": 3.0445, "step": 8810 }, { - "epoch": 2.65, - "grad_norm": 10.671518325805664, - "learning_rate": 2.339380575323244e-06, - "loss": 1.1835, + "epoch": 1.11, + "grad_norm": 74.23690032958984, + "learning_rate": 1.2628540350583609e-05, + "loss": 0.9138, "step": 8811 }, { - "epoch": 2.65, - "grad_norm": 27.613374710083008, - "learning_rate": 2.3373759647188535e-06, - "loss": 1.4339, + "epoch": 1.11, + "grad_norm": 9.95718002319336, + "learning_rate": 1.2627703635526922e-05, + "loss": 1.125, "step": 8812 }, { - "epoch": 2.65, - "grad_norm": 18.462194442749023, - "learning_rate": 2.3353713541144636e-06, - "loss": 1.0176, + "epoch": 1.11, + "grad_norm": 24.25860595703125, + "learning_rate": 1.2626866920470236e-05, + "loss": 1.1305, "step": 8813 }, { - "epoch": 2.65, - "grad_norm": 13.578800201416016, - "learning_rate": 2.3333667435100736e-06, - "loss": 2.1367, + "epoch": 1.11, + "grad_norm": 29.47481346130371, + "learning_rate": 1.2626030205413546e-05, + "loss": 0.4841, "step": 8814 }, { - "epoch": 2.65, - "grad_norm": 43.0266227722168, - "learning_rate": 2.3313621329056833e-06, - "loss": 2.1957, + "epoch": 1.11, + "grad_norm": 31.069408416748047, + "learning_rate": 1.262519349035686e-05, + "loss": 3.1053, "step": 8815 }, { - "epoch": 2.65, - "grad_norm": 40.213008880615234, - "learning_rate": 2.3293575223012934e-06, - "loss": 1.9988, + "epoch": 1.11, + "grad_norm": 17.31085777282715, + "learning_rate": 1.2624356775300173e-05, + "loss": 2.2952, "step": 8816 }, { - "epoch": 2.65, - "grad_norm": 14.245367050170898, - "learning_rate": 2.327352911696903e-06, - "loss": 1.891, + "epoch": 1.11, + "grad_norm": 15.16616153717041, + "learning_rate": 1.2623520060243485e-05, + "loss": 1.8619, "step": 8817 }, { - "epoch": 2.65, - "grad_norm": 13.524105072021484, - "learning_rate": 2.3253483010925127e-06, - "loss": 1.4238, + "epoch": 1.11, + "grad_norm": 52.80331802368164, + "learning_rate": 1.2622683345186797e-05, + "loss": 0.5975, "step": 8818 }, { - "epoch": 2.65, - "grad_norm": 22.680530548095703, - "learning_rate": 2.323343690488123e-06, - "loss": 1.2083, + "epoch": 1.11, + "grad_norm": 213.8785858154297, + "learning_rate": 1.262184663013011e-05, + "loss": 0.721, "step": 8819 }, { - "epoch": 2.65, - "grad_norm": 13.493782043457031, - "learning_rate": 2.3213390798837325e-06, - "loss": 1.1022, + "epoch": 1.11, + "grad_norm": 47.809234619140625, + "learning_rate": 1.2621009915073423e-05, + "loss": 2.5473, "step": 8820 }, { - "epoch": 2.65, - "grad_norm": 10.675830841064453, - "learning_rate": 2.3193344692793425e-06, - "loss": 0.5942, + "epoch": 1.11, + "grad_norm": 20.05143928527832, + "learning_rate": 1.2620173200016735e-05, + "loss": 0.8577, "step": 8821 }, { - "epoch": 2.65, - "grad_norm": 12.953835487365723, - "learning_rate": 2.3173298586749526e-06, - "loss": 1.445, + "epoch": 1.11, + "grad_norm": 58.52271270751953, + "learning_rate": 1.2619336484960047e-05, + "loss": 2.2125, "step": 8822 }, { - "epoch": 2.65, - "grad_norm": 10.076544761657715, - "learning_rate": 2.3153252480705623e-06, - "loss": 1.5124, + "epoch": 1.11, + "grad_norm": 73.26026916503906, + "learning_rate": 1.261849976990336e-05, + "loss": 2.2222, "step": 8823 }, { - "epoch": 2.65, - "grad_norm": 21.92123031616211, - "learning_rate": 2.3133206374661724e-06, - "loss": 1.8663, + "epoch": 1.11, + "grad_norm": 13.545487403869629, + "learning_rate": 1.2617663054846674e-05, + "loss": 1.5535, "step": 8824 }, { - "epoch": 2.65, - "grad_norm": 25.028993606567383, - "learning_rate": 2.3113160268617824e-06, - "loss": 1.0345, + "epoch": 1.11, + "grad_norm": 11.692740440368652, + "learning_rate": 1.2616826339789984e-05, + "loss": 1.13, "step": 8825 }, { - "epoch": 2.65, - "grad_norm": 8.76278305053711, - "learning_rate": 2.309311416257392e-06, - "loss": 0.9227, + "epoch": 1.11, + "grad_norm": 6.153340816497803, + "learning_rate": 1.2615989624733298e-05, + "loss": 0.6864, "step": 8826 }, { - "epoch": 2.65, - "grad_norm": 7.540186405181885, - "learning_rate": 2.307306805653002e-06, - "loss": 0.9597, + "epoch": 1.11, + "grad_norm": 8.990448951721191, + "learning_rate": 1.2615152909676612e-05, + "loss": 1.2014, "step": 8827 }, { - "epoch": 2.65, - "grad_norm": 21.144914627075195, - "learning_rate": 2.3053021950486123e-06, - "loss": 1.3642, + "epoch": 1.11, + "grad_norm": 1.3864907026290894, + "learning_rate": 1.2614316194619922e-05, + "loss": 0.0544, "step": 8828 }, { - "epoch": 2.65, - "grad_norm": 17.2004451751709, - "learning_rate": 2.303297584444222e-06, - "loss": 0.6705, + "epoch": 1.11, + "grad_norm": 20.9836368560791, + "learning_rate": 1.2613479479563236e-05, + "loss": 1.2737, "step": 8829 }, { - "epoch": 2.65, - "grad_norm": 8.921226501464844, - "learning_rate": 2.3012929738398316e-06, - "loss": 1.096, + "epoch": 1.11, + "grad_norm": 18.755577087402344, + "learning_rate": 1.261264276450655e-05, + "loss": 1.3111, "step": 8830 }, { - "epoch": 2.66, - "grad_norm": 13.363105773925781, - "learning_rate": 2.2992883632354417e-06, - "loss": 0.9989, + "epoch": 1.11, + "grad_norm": 26.43598747253418, + "learning_rate": 1.2611806049449861e-05, + "loss": 2.1915, "step": 8831 }, { - "epoch": 2.66, - "grad_norm": 11.100980758666992, - "learning_rate": 2.2972837526310513e-06, - "loss": 1.4, + "epoch": 1.11, + "grad_norm": 3.893955945968628, + "learning_rate": 1.2610969334393173e-05, + "loss": 0.1865, "step": 8832 }, { - "epoch": 2.66, - "grad_norm": 18.512340545654297, - "learning_rate": 2.2952791420266614e-06, - "loss": 1.3327, + "epoch": 1.11, + "grad_norm": 8.938155174255371, + "learning_rate": 1.2610132619336485e-05, + "loss": 0.6947, "step": 8833 }, { - "epoch": 2.66, - "grad_norm": 52.814693450927734, - "learning_rate": 2.2932745314222715e-06, - "loss": 1.4743, + "epoch": 1.11, + "grad_norm": 15.839922904968262, + "learning_rate": 1.2609295904279799e-05, + "loss": 1.2233, "step": 8834 }, { - "epoch": 2.66, - "grad_norm": 37.01776123046875, - "learning_rate": 2.291269920817881e-06, - "loss": 1.1938, + "epoch": 1.11, + "grad_norm": 24.305334091186523, + "learning_rate": 1.260845918922311e-05, + "loss": 1.8932, "step": 8835 }, { - "epoch": 2.66, - "grad_norm": 14.084249496459961, - "learning_rate": 2.2892653102134913e-06, - "loss": 1.4542, + "epoch": 1.11, + "grad_norm": 11.232524871826172, + "learning_rate": 1.2607622474166423e-05, + "loss": 2.2715, "step": 8836 }, { - "epoch": 2.66, - "grad_norm": 34.67374038696289, - "learning_rate": 2.2872606996091013e-06, - "loss": 1.9464, + "epoch": 1.11, + "grad_norm": 11.493144989013672, + "learning_rate": 1.2606785759109736e-05, + "loss": 0.793, "step": 8837 }, { - "epoch": 2.66, - "grad_norm": 16.59427833557129, - "learning_rate": 2.285256089004711e-06, - "loss": 1.5703, + "epoch": 1.11, + "grad_norm": 18.408987045288086, + "learning_rate": 1.260594904405305e-05, + "loss": 1.8757, "step": 8838 }, { - "epoch": 2.66, - "grad_norm": 20.186216354370117, - "learning_rate": 2.283251478400321e-06, - "loss": 1.527, + "epoch": 1.11, + "grad_norm": 41.016929626464844, + "learning_rate": 1.260511232899636e-05, + "loss": 4.0113, "step": 8839 }, { - "epoch": 2.66, - "grad_norm": 34.25483703613281, - "learning_rate": 2.281246867795931e-06, - "loss": 2.5718, + "epoch": 1.11, + "grad_norm": 27.24270248413086, + "learning_rate": 1.2604275613939674e-05, + "loss": 1.0534, "step": 8840 }, { - "epoch": 2.66, - "grad_norm": 28.123106002807617, - "learning_rate": 2.279242257191541e-06, - "loss": 1.5148, + "epoch": 1.11, + "grad_norm": 24.53693389892578, + "learning_rate": 1.2603438898882988e-05, + "loss": 2.7761, "step": 8841 }, { - "epoch": 2.66, - "grad_norm": 55.747276306152344, - "learning_rate": 2.2772376465871505e-06, - "loss": 1.7577, + "epoch": 1.11, + "grad_norm": 8.33435344696045, + "learning_rate": 1.2602602183826298e-05, + "loss": 0.3023, "step": 8842 }, { - "epoch": 2.66, - "grad_norm": 21.762004852294922, - "learning_rate": 2.2752330359827606e-06, - "loss": 1.3397, + "epoch": 1.11, + "grad_norm": 7.682891845703125, + "learning_rate": 1.2601765468769611e-05, + "loss": 0.821, "step": 8843 }, { - "epoch": 2.66, - "grad_norm": 38.36250305175781, - "learning_rate": 2.2732284253783702e-06, - "loss": 1.2055, + "epoch": 1.11, + "grad_norm": 41.553611755371094, + "learning_rate": 1.2600928753712925e-05, + "loss": 2.2671, "step": 8844 }, { - "epoch": 2.66, - "grad_norm": 14.818611145019531, - "learning_rate": 2.2712238147739803e-06, - "loss": 1.3097, + "epoch": 1.11, + "grad_norm": 11.70155143737793, + "learning_rate": 1.2600092038656237e-05, + "loss": 2.1812, "step": 8845 }, { - "epoch": 2.66, - "grad_norm": 7.749112129211426, - "learning_rate": 2.26921920416959e-06, - "loss": 0.4492, + "epoch": 1.11, + "grad_norm": 4.751408100128174, + "learning_rate": 1.2599255323599549e-05, + "loss": 0.4145, "step": 8846 }, { - "epoch": 2.66, - "grad_norm": 10.357666969299316, - "learning_rate": 2.2672145935652e-06, - "loss": 0.9897, + "epoch": 1.11, + "grad_norm": 36.31829071044922, + "learning_rate": 1.2598418608542861e-05, + "loss": 2.0711, "step": 8847 }, { - "epoch": 2.66, - "grad_norm": 15.227676391601562, - "learning_rate": 2.26520998296081e-06, - "loss": 1.3606, + "epoch": 1.11, + "grad_norm": 34.59927749633789, + "learning_rate": 1.2597581893486175e-05, + "loss": 2.4649, "step": 8848 }, { - "epoch": 2.66, - "grad_norm": 16.850852966308594, - "learning_rate": 2.26320537235642e-06, - "loss": 1.4527, + "epoch": 1.11, + "grad_norm": 7.014703273773193, + "learning_rate": 1.2596745178429487e-05, + "loss": 0.497, "step": 8849 }, { - "epoch": 2.66, - "grad_norm": 16.4047794342041, - "learning_rate": 2.26120076175203e-06, - "loss": 1.6789, + "epoch": 1.11, + "grad_norm": 15.280418395996094, + "learning_rate": 1.2595908463372799e-05, + "loss": 1.1994, "step": 8850 }, { - "epoch": 2.66, - "grad_norm": 7.696395397186279, - "learning_rate": 2.25919615114764e-06, - "loss": 0.9858, + "epoch": 1.11, + "grad_norm": 13.042275428771973, + "learning_rate": 1.2595071748316112e-05, + "loss": 0.7759, "step": 8851 }, { - "epoch": 2.66, - "grad_norm": 21.263986587524414, - "learning_rate": 2.2571915405432496e-06, - "loss": 1.7351, + "epoch": 1.11, + "grad_norm": 20.576278686523438, + "learning_rate": 1.2594235033259426e-05, + "loss": 1.3792, "step": 8852 }, { - "epoch": 2.66, - "grad_norm": 7.717570781707764, - "learning_rate": 2.2551869299388597e-06, - "loss": 1.0206, + "epoch": 1.11, + "grad_norm": 14.835959434509277, + "learning_rate": 1.2593398318202736e-05, + "loss": 1.2731, "step": 8853 }, { - "epoch": 2.66, - "grad_norm": 14.79688549041748, - "learning_rate": 2.2531823193344694e-06, - "loss": 1.7117, + "epoch": 1.11, + "grad_norm": 28.5734920501709, + "learning_rate": 1.259256160314605e-05, + "loss": 0.8112, "step": 8854 }, { - "epoch": 2.66, - "grad_norm": 8.93305492401123, - "learning_rate": 2.2511777087300795e-06, - "loss": 1.839, + "epoch": 1.11, + "grad_norm": 4.923260688781738, + "learning_rate": 1.2591724888089363e-05, + "loss": 0.3191, "step": 8855 }, { - "epoch": 2.66, - "grad_norm": 13.082503318786621, - "learning_rate": 2.249173098125689e-06, - "loss": 1.252, + "epoch": 1.11, + "grad_norm": 9.085829734802246, + "learning_rate": 1.2590888173032674e-05, + "loss": 1.3536, "step": 8856 }, { - "epoch": 2.66, - "grad_norm": 11.024530410766602, - "learning_rate": 2.2471684875212992e-06, - "loss": 0.5396, + "epoch": 1.11, + "grad_norm": 11.32632827758789, + "learning_rate": 1.2590051457975987e-05, + "loss": 3.1705, "step": 8857 }, { - "epoch": 2.66, - "grad_norm": 30.2106876373291, - "learning_rate": 2.245163876916909e-06, - "loss": 1.8259, + "epoch": 1.11, + "grad_norm": 59.948646545410156, + "learning_rate": 1.2589214742919301e-05, + "loss": 0.5682, "step": 8858 }, { - "epoch": 2.66, - "grad_norm": 12.592206954956055, - "learning_rate": 2.243159266312519e-06, - "loss": 2.2187, + "epoch": 1.11, + "grad_norm": 10.617377281188965, + "learning_rate": 1.2588378027862613e-05, + "loss": 0.6431, "step": 8859 }, { - "epoch": 2.66, - "grad_norm": 29.125699996948242, - "learning_rate": 2.241154655708129e-06, - "loss": 1.1455, + "epoch": 1.11, + "grad_norm": 18.310394287109375, + "learning_rate": 1.2587541312805925e-05, + "loss": 1.9332, "step": 8860 }, { - "epoch": 2.66, - "grad_norm": 16.28235626220703, - "learning_rate": 2.2391500451037387e-06, - "loss": 1.9047, + "epoch": 1.11, + "grad_norm": 23.954254150390625, + "learning_rate": 1.2586704597749237e-05, + "loss": 1.5929, "step": 8861 }, { - "epoch": 2.66, - "grad_norm": 30.546987533569336, - "learning_rate": 2.237145434499349e-06, - "loss": 1.7441, + "epoch": 1.11, + "grad_norm": 8.76931095123291, + "learning_rate": 1.258586788269255e-05, + "loss": 1.7597, "step": 8862 }, { - "epoch": 2.66, - "grad_norm": 12.745742797851562, - "learning_rate": 2.2351408238949585e-06, - "loss": 0.5604, + "epoch": 1.11, + "grad_norm": 9.1308012008667, + "learning_rate": 1.2585031167635862e-05, + "loss": 2.1952, "step": 8863 }, { - "epoch": 2.67, - "grad_norm": 10.053805351257324, - "learning_rate": 2.2331362132905685e-06, - "loss": 1.467, + "epoch": 1.11, + "grad_norm": 67.87699890136719, + "learning_rate": 1.2584194452579174e-05, + "loss": 0.6862, "step": 8864 }, { - "epoch": 2.67, - "grad_norm": 22.58064842224121, - "learning_rate": 2.2311316026861786e-06, - "loss": 1.3182, + "epoch": 1.11, + "grad_norm": 12.790180206298828, + "learning_rate": 1.2583357737522488e-05, + "loss": 3.0736, "step": 8865 }, { - "epoch": 2.67, - "grad_norm": 10.636580467224121, - "learning_rate": 2.2291269920817883e-06, - "loss": 1.3195, + "epoch": 1.11, + "grad_norm": 33.39744186401367, + "learning_rate": 1.2582521022465802e-05, + "loss": 3.5402, "step": 8866 }, { - "epoch": 2.67, - "grad_norm": 18.260143280029297, - "learning_rate": 2.2271223814773984e-06, - "loss": 1.3604, + "epoch": 1.11, + "grad_norm": 12.848188400268555, + "learning_rate": 1.2581684307409112e-05, + "loss": 1.7566, "step": 8867 }, { - "epoch": 2.67, - "grad_norm": 8.831524848937988, - "learning_rate": 2.225117770873008e-06, - "loss": 1.1861, + "epoch": 1.11, + "grad_norm": 5.089822769165039, + "learning_rate": 1.2580847592352426e-05, + "loss": 0.2165, "step": 8868 }, { - "epoch": 2.67, - "grad_norm": 13.567652702331543, - "learning_rate": 2.2231131602686177e-06, - "loss": 1.0606, + "epoch": 1.11, + "grad_norm": 12.391101837158203, + "learning_rate": 1.258001087729574e-05, + "loss": 0.4484, "step": 8869 }, { - "epoch": 2.67, - "grad_norm": 31.04169273376465, - "learning_rate": 2.2211085496642278e-06, - "loss": 1.1305, + "epoch": 1.11, + "grad_norm": 28.662338256835938, + "learning_rate": 1.257917416223905e-05, + "loss": 1.7781, "step": 8870 }, { - "epoch": 2.67, - "grad_norm": 17.62767219543457, - "learning_rate": 2.219103939059838e-06, - "loss": 1.8986, + "epoch": 1.11, + "grad_norm": 9.914118766784668, + "learning_rate": 1.2578337447182363e-05, + "loss": 3.4399, "step": 8871 }, { - "epoch": 2.67, - "grad_norm": 31.392688751220703, - "learning_rate": 2.2170993284554475e-06, - "loss": 2.3315, + "epoch": 1.11, + "grad_norm": 5.98568868637085, + "learning_rate": 1.2577500732125675e-05, + "loss": 0.8468, "step": 8872 }, { - "epoch": 2.67, - "grad_norm": 18.428953170776367, - "learning_rate": 2.2150947178510576e-06, - "loss": 1.559, + "epoch": 1.11, + "grad_norm": 11.589415550231934, + "learning_rate": 1.2576664017068989e-05, + "loss": 1.4256, "step": 8873 }, { - "epoch": 2.67, - "grad_norm": 24.9808292388916, - "learning_rate": 2.2130901072466677e-06, - "loss": 1.6667, + "epoch": 1.11, + "grad_norm": 11.69733715057373, + "learning_rate": 1.25758273020123e-05, + "loss": 1.1597, "step": 8874 }, { - "epoch": 2.67, - "grad_norm": 23.458446502685547, - "learning_rate": 2.2110854966422773e-06, - "loss": 0.8407, + "epoch": 1.11, + "grad_norm": 8.565375328063965, + "learning_rate": 1.2574990586955613e-05, + "loss": 1.5882, "step": 8875 }, { - "epoch": 2.67, - "grad_norm": 10.7035493850708, - "learning_rate": 2.2090808860378874e-06, - "loss": 1.3342, + "epoch": 1.11, + "grad_norm": 46.239627838134766, + "learning_rate": 1.2574153871898926e-05, + "loss": 2.1625, "step": 8876 }, { - "epoch": 2.67, - "grad_norm": 21.982637405395508, - "learning_rate": 2.2070762754334975e-06, - "loss": 1.8623, + "epoch": 1.11, + "grad_norm": 12.065559387207031, + "learning_rate": 1.2573317156842238e-05, + "loss": 2.7625, "step": 8877 }, { - "epoch": 2.67, - "grad_norm": 10.788835525512695, - "learning_rate": 2.205071664829107e-06, - "loss": 1.14, + "epoch": 1.11, + "grad_norm": 19.050992965698242, + "learning_rate": 1.257248044178555e-05, + "loss": 0.8145, "step": 8878 }, { - "epoch": 2.67, - "grad_norm": 12.52689266204834, - "learning_rate": 2.2030670542247173e-06, - "loss": 0.8905, + "epoch": 1.11, + "grad_norm": 23.644147872924805, + "learning_rate": 1.2571643726728864e-05, + "loss": 2.1455, "step": 8879 }, { - "epoch": 2.67, - "grad_norm": 12.546585083007812, - "learning_rate": 2.201062443620327e-06, - "loss": 1.4123, - "step": 8880 - }, - { - "epoch": 2.67, - "eval_loss": 0.1634318232536316, - "eval_runtime": 43.2728, - "eval_samples_per_second": 34.179, - "eval_steps_per_second": 34.179, + "epoch": 1.11, + "grad_norm": 38.043373107910156, + "learning_rate": 1.2570807011672178e-05, + "loss": 1.5802, "step": 8880 }, { - "epoch": 2.67, - "grad_norm": 23.065811157226562, - "learning_rate": 2.1990578330159366e-06, - "loss": 1.6073, + "epoch": 1.11, + "grad_norm": 11.92843246459961, + "learning_rate": 1.2569970296615488e-05, + "loss": 1.743, "step": 8881 }, { - "epoch": 2.67, - "grad_norm": 8.505629539489746, - "learning_rate": 2.1970532224115467e-06, - "loss": 0.6363, + "epoch": 1.11, + "grad_norm": 15.821548461914062, + "learning_rate": 1.2569133581558801e-05, + "loss": 1.729, "step": 8882 }, { - "epoch": 2.67, - "grad_norm": 19.605792999267578, - "learning_rate": 2.1950486118071563e-06, - "loss": 1.1445, + "epoch": 1.11, + "grad_norm": 9.177318572998047, + "learning_rate": 1.2568296866502115e-05, + "loss": 1.5359, "step": 8883 }, { - "epoch": 2.67, - "grad_norm": 62.97642517089844, - "learning_rate": 2.1930440012027664e-06, - "loss": 3.0308, + "epoch": 1.11, + "grad_norm": 22.191919326782227, + "learning_rate": 1.2567460151445425e-05, + "loss": 1.4169, "step": 8884 }, { - "epoch": 2.67, - "grad_norm": 34.175453186035156, - "learning_rate": 2.1910393905983765e-06, - "loss": 1.2063, + "epoch": 1.12, + "grad_norm": 16.08186149597168, + "learning_rate": 1.2566623436388739e-05, + "loss": 0.5535, "step": 8885 }, { - "epoch": 2.67, - "grad_norm": 41.114051818847656, - "learning_rate": 2.189034779993986e-06, - "loss": 1.2233, + "epoch": 1.12, + "grad_norm": 20.439388275146484, + "learning_rate": 1.2565786721332051e-05, + "loss": 3.2665, "step": 8886 }, { - "epoch": 2.67, - "grad_norm": 22.9063663482666, - "learning_rate": 2.1870301693895962e-06, - "loss": 1.1179, + "epoch": 1.12, + "grad_norm": 12.846830368041992, + "learning_rate": 1.2564950006275365e-05, + "loss": 1.4284, "step": 8887 }, { - "epoch": 2.67, - "grad_norm": 12.28271770477295, - "learning_rate": 2.1850255587852063e-06, - "loss": 1.1284, + "epoch": 1.12, + "grad_norm": 36.724246978759766, + "learning_rate": 1.2564113291218677e-05, + "loss": 2.6891, "step": 8888 }, { - "epoch": 2.67, - "grad_norm": 18.61870765686035, - "learning_rate": 2.183020948180816e-06, - "loss": 1.1018, + "epoch": 1.12, + "grad_norm": 31.265914916992188, + "learning_rate": 1.2563276576161988e-05, + "loss": 1.4702, "step": 8889 }, { - "epoch": 2.67, - "grad_norm": 14.14230728149414, - "learning_rate": 2.181016337576426e-06, - "loss": 2.1612, + "epoch": 1.12, + "grad_norm": 19.516441345214844, + "learning_rate": 1.2562439861105302e-05, + "loss": 1.3568, "step": 8890 }, { - "epoch": 2.67, - "grad_norm": 17.895893096923828, - "learning_rate": 2.179011726972036e-06, - "loss": 2.1376, + "epoch": 1.12, + "grad_norm": 22.5244140625, + "learning_rate": 1.2561603146048612e-05, + "loss": 1.7873, "step": 8891 }, { - "epoch": 2.67, - "grad_norm": 10.05941390991211, - "learning_rate": 2.177007116367646e-06, - "loss": 1.9443, + "epoch": 1.12, + "grad_norm": 7.6699113845825195, + "learning_rate": 1.2560766430991926e-05, + "loss": 1.5238, "step": 8892 }, { - "epoch": 2.67, - "grad_norm": 17.179000854492188, - "learning_rate": 2.1750025057632555e-06, - "loss": 1.3958, + "epoch": 1.12, + "grad_norm": 31.9620418548584, + "learning_rate": 1.255992971593524e-05, + "loss": 2.0262, "step": 8893 }, { - "epoch": 2.67, - "grad_norm": 18.431400299072266, - "learning_rate": 2.1729978951588656e-06, - "loss": 1.2758, + "epoch": 1.12, + "grad_norm": 8.691798210144043, + "learning_rate": 1.2559093000878553e-05, + "loss": 0.5248, "step": 8894 }, { - "epoch": 2.67, - "grad_norm": 18.084781646728516, - "learning_rate": 2.1709932845544752e-06, - "loss": 1.7674, + "epoch": 1.12, + "grad_norm": 9.342169761657715, + "learning_rate": 1.2558256285821864e-05, + "loss": 0.8525, "step": 8895 }, { - "epoch": 2.67, - "grad_norm": 15.693217277526855, - "learning_rate": 2.1689886739500853e-06, - "loss": 1.0816, + "epoch": 1.12, + "grad_norm": 6.691696643829346, + "learning_rate": 1.2557419570765177e-05, + "loss": 1.1909, "step": 8896 }, { - "epoch": 2.67, - "grad_norm": 18.634521484375, - "learning_rate": 2.1669840633456954e-06, - "loss": 1.1063, + "epoch": 1.12, + "grad_norm": 9.33571720123291, + "learning_rate": 1.2556582855708491e-05, + "loss": 0.4905, "step": 8897 }, { - "epoch": 2.68, - "grad_norm": 11.23396110534668, - "learning_rate": 2.164979452741305e-06, - "loss": 0.6603, + "epoch": 1.12, + "grad_norm": 4.31030797958374, + "learning_rate": 1.2555746140651801e-05, + "loss": 0.5247, "step": 8898 }, { - "epoch": 2.68, - "grad_norm": 8.881089210510254, - "learning_rate": 2.162974842136915e-06, - "loss": 0.7917, + "epoch": 1.12, + "grad_norm": 9.584003448486328, + "learning_rate": 1.2554909425595115e-05, + "loss": 0.3755, "step": 8899 }, { - "epoch": 2.68, - "grad_norm": 6.563168525695801, - "learning_rate": 2.1609702315325252e-06, - "loss": 0.7384, + "epoch": 1.12, + "grad_norm": 13.662652015686035, + "learning_rate": 1.2554072710538427e-05, + "loss": 1.4579, "step": 8900 }, { - "epoch": 2.68, - "grad_norm": 45.68901824951172, - "learning_rate": 2.158965620928135e-06, - "loss": 2.6647, + "epoch": 1.12, + "grad_norm": 32.94351577758789, + "learning_rate": 1.255323599548174e-05, + "loss": 3.6991, "step": 8901 }, { - "epoch": 2.68, - "grad_norm": 10.925772666931152, - "learning_rate": 2.156961010323745e-06, - "loss": 0.5719, + "epoch": 1.12, + "grad_norm": 26.51783561706543, + "learning_rate": 1.2552399280425052e-05, + "loss": 4.1407, "step": 8902 }, { - "epoch": 2.68, - "grad_norm": 52.76468276977539, - "learning_rate": 2.1549563997193546e-06, - "loss": 1.5088, + "epoch": 1.12, + "grad_norm": 39.995025634765625, + "learning_rate": 1.2551562565368364e-05, + "loss": 1.6063, "step": 8903 }, { - "epoch": 2.68, - "grad_norm": 17.248638153076172, - "learning_rate": 2.1529517891149647e-06, - "loss": 1.2967, + "epoch": 1.12, + "grad_norm": 13.210186004638672, + "learning_rate": 1.2550725850311678e-05, + "loss": 0.9404, "step": 8904 }, { - "epoch": 2.68, - "grad_norm": 51.29323196411133, - "learning_rate": 2.1509471785105744e-06, - "loss": 1.6994, + "epoch": 1.12, + "grad_norm": 20.54436492919922, + "learning_rate": 1.2549889135254988e-05, + "loss": 0.942, "step": 8905 }, { - "epoch": 2.68, - "grad_norm": 42.033302307128906, - "learning_rate": 2.1489425679061845e-06, - "loss": 1.3593, + "epoch": 1.12, + "grad_norm": 8.649977684020996, + "learning_rate": 1.2549052420198302e-05, + "loss": 1.3768, "step": 8906 }, { - "epoch": 2.68, - "grad_norm": 9.53936767578125, - "learning_rate": 2.146937957301794e-06, - "loss": 0.7497, + "epoch": 1.12, + "grad_norm": 5.319600582122803, + "learning_rate": 1.2548215705141616e-05, + "loss": 0.2921, "step": 8907 }, { - "epoch": 2.68, - "grad_norm": 29.297077178955078, - "learning_rate": 2.144933346697404e-06, - "loss": 1.042, + "epoch": 1.12, + "grad_norm": 19.230091094970703, + "learning_rate": 1.254737899008493e-05, + "loss": 0.9048, "step": 8908 }, { - "epoch": 2.68, - "grad_norm": 37.91065216064453, - "learning_rate": 2.142928736093014e-06, - "loss": 2.1151, + "epoch": 1.12, + "grad_norm": 15.009637832641602, + "learning_rate": 1.254654227502824e-05, + "loss": 1.4137, "step": 8909 }, { - "epoch": 2.68, - "grad_norm": 24.55820655822754, - "learning_rate": 2.140924125488624e-06, - "loss": 1.9639, + "epoch": 1.12, + "grad_norm": 83.87217712402344, + "learning_rate": 1.2545705559971553e-05, + "loss": 2.7705, "step": 8910 }, { - "epoch": 2.68, - "grad_norm": 30.16676902770996, - "learning_rate": 2.138919514884234e-06, - "loss": 1.3401, + "epoch": 1.12, + "grad_norm": 5.943860054016113, + "learning_rate": 1.2544868844914865e-05, + "loss": 0.6931, "step": 8911 }, { - "epoch": 2.68, - "grad_norm": 47.085235595703125, - "learning_rate": 2.1369149042798437e-06, - "loss": 2.4905, + "epoch": 1.12, + "grad_norm": 14.90063190460205, + "learning_rate": 1.2544032129858177e-05, + "loss": 1.1149, "step": 8912 }, { - "epoch": 2.68, - "grad_norm": 19.413427352905273, - "learning_rate": 2.1349102936754538e-06, - "loss": 1.4043, + "epoch": 1.12, + "grad_norm": 25.426666259765625, + "learning_rate": 1.254319541480149e-05, + "loss": 0.6162, "step": 8913 }, { - "epoch": 2.68, - "grad_norm": 31.471092224121094, - "learning_rate": 2.132905683071064e-06, - "loss": 1.9347, + "epoch": 1.12, + "grad_norm": 24.75497055053711, + "learning_rate": 1.2542358699744803e-05, + "loss": 2.9471, "step": 8914 }, { - "epoch": 2.68, - "grad_norm": 26.705747604370117, - "learning_rate": 2.1309010724666735e-06, - "loss": 2.8265, + "epoch": 1.12, + "grad_norm": 17.341344833374023, + "learning_rate": 1.2541521984688116e-05, + "loss": 2.3469, "step": 8915 }, { - "epoch": 2.68, - "grad_norm": 43.82736587524414, - "learning_rate": 2.1288964618622836e-06, - "loss": 1.3732, + "epoch": 1.12, + "grad_norm": 5.796677589416504, + "learning_rate": 1.2540685269631428e-05, + "loss": 0.5815, "step": 8916 }, { - "epoch": 2.68, - "grad_norm": 12.801732063293457, - "learning_rate": 2.1268918512578933e-06, - "loss": 0.7379, + "epoch": 1.12, + "grad_norm": 10.825651168823242, + "learning_rate": 1.253984855457474e-05, + "loss": 0.6657, "step": 8917 }, { - "epoch": 2.68, - "grad_norm": 14.50540828704834, - "learning_rate": 2.1248872406535033e-06, - "loss": 1.2645, + "epoch": 1.12, + "grad_norm": 15.243544578552246, + "learning_rate": 1.2539011839518054e-05, + "loss": 1.2522, "step": 8918 }, { - "epoch": 2.68, - "grad_norm": 55.6925163269043, - "learning_rate": 2.122882630049113e-06, - "loss": 1.05, + "epoch": 1.12, + "grad_norm": 2.303821563720703, + "learning_rate": 1.2538175124461364e-05, + "loss": 0.0838, "step": 8919 }, { - "epoch": 2.68, - "grad_norm": 8.958430290222168, - "learning_rate": 2.120878019444723e-06, - "loss": 0.9973, + "epoch": 1.12, + "grad_norm": 20.525466918945312, + "learning_rate": 1.2537338409404678e-05, + "loss": 0.8314, "step": 8920 }, { - "epoch": 2.68, - "grad_norm": 12.112589836120605, - "learning_rate": 2.1188734088403328e-06, - "loss": 0.5669, + "epoch": 1.12, + "grad_norm": 6.544447898864746, + "learning_rate": 1.2536501694347991e-05, + "loss": 0.4977, "step": 8921 }, { - "epoch": 2.68, - "grad_norm": 9.585906982421875, - "learning_rate": 2.116868798235943e-06, - "loss": 1.455, + "epoch": 1.12, + "grad_norm": 15.428367614746094, + "learning_rate": 1.2535664979291305e-05, + "loss": 0.4506, "step": 8922 }, { - "epoch": 2.68, - "grad_norm": 10.292905807495117, - "learning_rate": 2.114864187631553e-06, - "loss": 0.822, + "epoch": 1.12, + "grad_norm": 29.606430053710938, + "learning_rate": 1.2534828264234615e-05, + "loss": 2.3348, "step": 8923 }, { - "epoch": 2.68, - "grad_norm": 12.411725997924805, - "learning_rate": 2.1128595770271626e-06, - "loss": 0.5915, + "epoch": 1.12, + "grad_norm": 22.486295700073242, + "learning_rate": 1.2533991549177929e-05, + "loss": 1.0795, "step": 8924 }, { - "epoch": 2.68, - "grad_norm": 95.50955200195312, - "learning_rate": 2.1108549664227727e-06, - "loss": 1.2842, + "epoch": 1.12, + "grad_norm": 12.869139671325684, + "learning_rate": 1.2533154834121241e-05, + "loss": 3.1516, "step": 8925 }, { - "epoch": 2.68, - "grad_norm": 15.629396438598633, - "learning_rate": 2.1088503558183823e-06, - "loss": 1.6067, + "epoch": 1.12, + "grad_norm": 18.72137451171875, + "learning_rate": 1.2532318119064553e-05, + "loss": 1.5385, "step": 8926 }, { - "epoch": 2.68, - "grad_norm": 15.7233304977417, - "learning_rate": 2.1068457452139924e-06, - "loss": 0.9173, + "epoch": 1.12, + "grad_norm": 4.132017135620117, + "learning_rate": 1.2531481404007866e-05, + "loss": 0.1411, "step": 8927 }, { - "epoch": 2.68, - "grad_norm": 30.911314010620117, - "learning_rate": 2.1048411346096025e-06, - "loss": 2.3595, + "epoch": 1.12, + "grad_norm": 6.2769293785095215, + "learning_rate": 1.2530644688951178e-05, + "loss": 0.9071, "step": 8928 }, { - "epoch": 2.68, - "grad_norm": 12.196179389953613, - "learning_rate": 2.102836524005212e-06, - "loss": 1.0742, + "epoch": 1.12, + "grad_norm": 9.791159629821777, + "learning_rate": 1.2529807973894492e-05, + "loss": 0.9885, "step": 8929 }, { - "epoch": 2.68, - "grad_norm": 13.497405052185059, - "learning_rate": 2.1008319134008222e-06, - "loss": 1.1664, + "epoch": 1.12, + "grad_norm": 16.300458908081055, + "learning_rate": 1.2528971258837802e-05, + "loss": 1.1884, "step": 8930 }, { - "epoch": 2.69, - "grad_norm": 14.204902648925781, - "learning_rate": 2.098827302796432e-06, - "loss": 1.0476, + "epoch": 1.12, + "grad_norm": 9.561453819274902, + "learning_rate": 1.2528134543781116e-05, + "loss": 1.2753, "step": 8931 }, { - "epoch": 2.69, - "grad_norm": 21.779407501220703, - "learning_rate": 2.0968226921920416e-06, - "loss": 1.0798, + "epoch": 1.12, + "grad_norm": 68.78498077392578, + "learning_rate": 1.252729782872443e-05, + "loss": 4.0715, "step": 8932 }, { - "epoch": 2.69, - "grad_norm": 30.279451370239258, - "learning_rate": 2.0948180815876516e-06, - "loss": 1.2667, + "epoch": 1.12, + "grad_norm": 8.061681747436523, + "learning_rate": 1.252646111366774e-05, + "loss": 0.4884, "step": 8933 }, { - "epoch": 2.69, - "grad_norm": 13.741867065429688, - "learning_rate": 2.0928134709832617e-06, - "loss": 0.93, + "epoch": 1.12, + "grad_norm": 26.667295455932617, + "learning_rate": 1.2525624398611054e-05, + "loss": 3.0331, "step": 8934 }, { - "epoch": 2.69, - "grad_norm": 9.646475791931152, - "learning_rate": 2.0908088603788714e-06, - "loss": 1.304, + "epoch": 1.12, + "grad_norm": 15.182604789733887, + "learning_rate": 1.2524787683554367e-05, + "loss": 0.8844, "step": 8935 }, { - "epoch": 2.69, - "grad_norm": 13.321015357971191, - "learning_rate": 2.0888042497744815e-06, - "loss": 1.1108, + "epoch": 1.12, + "grad_norm": 3.172250270843506, + "learning_rate": 1.2523950968497681e-05, + "loss": 0.4641, "step": 8936 }, { - "epoch": 2.69, - "grad_norm": 15.75296688079834, - "learning_rate": 2.0867996391700916e-06, - "loss": 0.9929, + "epoch": 1.12, + "grad_norm": 14.022339820861816, + "learning_rate": 1.2523114253440991e-05, + "loss": 2.2339, "step": 8937 }, { - "epoch": 2.69, - "grad_norm": 24.70446014404297, - "learning_rate": 2.0847950285657012e-06, - "loss": 1.2706, + "epoch": 1.12, + "grad_norm": 12.469718933105469, + "learning_rate": 1.2522277538384305e-05, + "loss": 1.5523, "step": 8938 }, { - "epoch": 2.69, - "grad_norm": 13.372381210327148, - "learning_rate": 2.0827904179613113e-06, - "loss": 0.6046, + "epoch": 1.12, + "grad_norm": 15.62543773651123, + "learning_rate": 1.2521440823327617e-05, + "loss": 2.1353, "step": 8939 }, { - "epoch": 2.69, - "grad_norm": 40.783939361572266, - "learning_rate": 2.0807858073569214e-06, - "loss": 1.5435, + "epoch": 1.12, + "grad_norm": 17.858619689941406, + "learning_rate": 1.2520604108270929e-05, + "loss": 2.2041, "step": 8940 }, { - "epoch": 2.69, - "grad_norm": 8.127727508544922, - "learning_rate": 2.078781196752531e-06, - "loss": 0.8914, + "epoch": 1.12, + "grad_norm": 5.224055290222168, + "learning_rate": 1.2519767393214242e-05, + "loss": 0.2525, "step": 8941 }, { - "epoch": 2.69, - "grad_norm": 16.346284866333008, - "learning_rate": 2.076776586148141e-06, - "loss": 1.139, + "epoch": 1.12, + "grad_norm": 6.102902889251709, + "learning_rate": 1.2518930678157554e-05, + "loss": 0.4351, "step": 8942 }, { - "epoch": 2.69, - "grad_norm": 83.08602142333984, - "learning_rate": 2.074771975543751e-06, - "loss": 2.625, + "epoch": 1.12, + "grad_norm": 17.269641876220703, + "learning_rate": 1.2518093963100868e-05, + "loss": 1.5761, "step": 8943 }, { - "epoch": 2.69, - "grad_norm": 13.884016036987305, - "learning_rate": 2.0727673649393605e-06, - "loss": 1.0258, + "epoch": 1.12, + "grad_norm": 9.63320541381836, + "learning_rate": 1.2517257248044178e-05, + "loss": 0.6898, "step": 8944 }, { - "epoch": 2.69, - "grad_norm": 11.885502815246582, - "learning_rate": 2.0707627543349705e-06, - "loss": 0.8579, + "epoch": 1.12, + "grad_norm": 4.770727157592773, + "learning_rate": 1.2516420532987492e-05, + "loss": 0.3419, "step": 8945 }, { - "epoch": 2.69, - "grad_norm": 9.849443435668945, - "learning_rate": 2.06875814373058e-06, - "loss": 0.9329, + "epoch": 1.12, + "grad_norm": 7.858762264251709, + "learning_rate": 1.2515583817930805e-05, + "loss": 0.9, "step": 8946 }, { - "epoch": 2.69, - "grad_norm": 9.109235763549805, - "learning_rate": 2.0667535331261903e-06, - "loss": 0.9577, + "epoch": 1.12, + "grad_norm": 8.816036224365234, + "learning_rate": 1.2514747102874116e-05, + "loss": 1.6838, "step": 8947 }, { - "epoch": 2.69, - "grad_norm": 10.755121231079102, - "learning_rate": 2.0647489225218004e-06, - "loss": 0.6667, + "epoch": 1.12, + "grad_norm": 20.732986450195312, + "learning_rate": 1.251391038781743e-05, + "loss": 1.6266, "step": 8948 }, { - "epoch": 2.69, - "grad_norm": 10.410956382751465, - "learning_rate": 2.06274431191741e-06, - "loss": 0.4745, + "epoch": 1.12, + "grad_norm": 17.330759048461914, + "learning_rate": 1.2513073672760743e-05, + "loss": 1.7112, "step": 8949 }, { - "epoch": 2.69, - "grad_norm": 10.473151206970215, - "learning_rate": 2.06073970131302e-06, - "loss": 1.2898, + "epoch": 1.12, + "grad_norm": 18.552534103393555, + "learning_rate": 1.2512236957704057e-05, + "loss": 0.8954, "step": 8950 }, { - "epoch": 2.69, - "grad_norm": 14.992378234863281, - "learning_rate": 2.05873509070863e-06, - "loss": 1.5699, + "epoch": 1.12, + "grad_norm": 9.809928894042969, + "learning_rate": 1.2511400242647367e-05, + "loss": 0.8482, "step": 8951 }, { - "epoch": 2.69, - "grad_norm": 9.013774871826172, - "learning_rate": 2.05673048010424e-06, - "loss": 1.1274, + "epoch": 1.12, + "grad_norm": 14.423686027526855, + "learning_rate": 1.251056352759068e-05, + "loss": 1.1748, "step": 8952 }, { - "epoch": 2.69, - "grad_norm": 18.34583854675293, - "learning_rate": 2.05472586949985e-06, - "loss": 1.1243, + "epoch": 1.12, + "grad_norm": 15.003713607788086, + "learning_rate": 1.2509726812533993e-05, + "loss": 0.8353, "step": 8953 }, { - "epoch": 2.69, - "grad_norm": 9.34281063079834, - "learning_rate": 2.0527212588954596e-06, - "loss": 0.977, + "epoch": 1.12, + "grad_norm": 32.19587707519531, + "learning_rate": 1.2508890097477305e-05, + "loss": 1.1426, "step": 8954 }, { - "epoch": 2.69, - "grad_norm": 19.33155632019043, - "learning_rate": 2.0507166482910697e-06, - "loss": 0.7991, + "epoch": 1.12, + "grad_norm": 26.24030303955078, + "learning_rate": 1.2508053382420618e-05, + "loss": 1.2817, "step": 8955 }, { - "epoch": 2.69, - "grad_norm": 62.87168502807617, - "learning_rate": 2.0487120376866794e-06, - "loss": 1.8296, + "epoch": 1.12, + "grad_norm": 31.24359703063965, + "learning_rate": 1.250721666736393e-05, + "loss": 3.232, "step": 8956 }, { - "epoch": 2.69, - "grad_norm": 14.451416015625, - "learning_rate": 2.0467074270822894e-06, - "loss": 0.9752, + "epoch": 1.12, + "grad_norm": 7.957561492919922, + "learning_rate": 1.2506379952307244e-05, + "loss": 0.8478, "step": 8957 }, { - "epoch": 2.69, - "grad_norm": 13.052489280700684, - "learning_rate": 2.044702816477899e-06, - "loss": 1.0093, + "epoch": 1.12, + "grad_norm": 26.59343910217285, + "learning_rate": 1.2505543237250554e-05, + "loss": 1.7395, "step": 8958 }, { - "epoch": 2.69, - "grad_norm": 10.156307220458984, - "learning_rate": 2.042698205873509e-06, - "loss": 1.142, + "epoch": 1.12, + "grad_norm": 216.5959014892578, + "learning_rate": 1.2504706522193868e-05, + "loss": 1.4375, "step": 8959 }, { - "epoch": 2.69, - "grad_norm": 32.259788513183594, - "learning_rate": 2.0406935952691193e-06, - "loss": 2.6409, + "epoch": 1.12, + "grad_norm": 12.135117530822754, + "learning_rate": 1.2503869807137181e-05, + "loss": 0.4616, "step": 8960 }, { - "epoch": 2.69, - "grad_norm": 35.24033737182617, - "learning_rate": 2.038688984664729e-06, - "loss": 1.7325, + "epoch": 1.12, + "grad_norm": 34.77619934082031, + "learning_rate": 1.2503033092080492e-05, + "loss": 1.1826, "step": 8961 }, { - "epoch": 2.69, - "grad_norm": 15.073776245117188, - "learning_rate": 2.036684374060339e-06, - "loss": 0.7844, + "epoch": 1.12, + "grad_norm": 6.7146430015563965, + "learning_rate": 1.2502196377023805e-05, + "loss": 1.1367, "step": 8962 }, { - "epoch": 2.69, - "grad_norm": 26.254253387451172, - "learning_rate": 2.034679763455949e-06, - "loss": 0.9144, + "epoch": 1.12, + "grad_norm": 15.25345230102539, + "learning_rate": 1.2501359661967119e-05, + "loss": 1.2281, "step": 8963 }, { - "epoch": 2.7, - "grad_norm": 10.655972480773926, - "learning_rate": 2.0326751528515588e-06, - "loss": 1.5722, + "epoch": 1.12, + "grad_norm": 20.43260383605957, + "learning_rate": 1.250052294691043e-05, + "loss": 1.3328, "step": 8964 }, { - "epoch": 2.7, - "grad_norm": 15.8784761428833, - "learning_rate": 2.030670542247169e-06, - "loss": 1.5385, + "epoch": 1.13, + "grad_norm": 7.702591896057129, + "learning_rate": 1.2499686231853743e-05, + "loss": 0.6018, "step": 8965 }, { - "epoch": 2.7, - "grad_norm": 75.25648498535156, - "learning_rate": 2.0286659316427785e-06, - "loss": 1.2864, + "epoch": 1.13, + "grad_norm": 13.08178424835205, + "learning_rate": 1.2498849516797056e-05, + "loss": 1.6774, "step": 8966 }, { - "epoch": 2.7, - "grad_norm": 12.602245330810547, - "learning_rate": 2.0266613210383886e-06, - "loss": 1.6121, + "epoch": 1.13, + "grad_norm": 27.949283599853516, + "learning_rate": 1.2498012801740368e-05, + "loss": 1.9005, "step": 8967 }, { - "epoch": 2.7, - "grad_norm": 26.045513153076172, - "learning_rate": 2.0246567104339982e-06, - "loss": 1.1818, + "epoch": 1.13, + "grad_norm": 21.547008514404297, + "learning_rate": 1.249717608668368e-05, + "loss": 2.2632, "step": 8968 }, { - "epoch": 2.7, - "grad_norm": 15.361552238464355, - "learning_rate": 2.0226520998296083e-06, - "loss": 0.8867, + "epoch": 1.13, + "grad_norm": 28.614538192749023, + "learning_rate": 1.2496339371626994e-05, + "loss": 1.5634, "step": 8969 }, { - "epoch": 2.7, - "grad_norm": 52.4283561706543, - "learning_rate": 2.020647489225218e-06, - "loss": 2.3197, + "epoch": 1.13, + "grad_norm": 9.390619277954102, + "learning_rate": 1.2495502656570306e-05, + "loss": 1.8472, "step": 8970 }, { - "epoch": 2.7, - "grad_norm": 10.277100563049316, - "learning_rate": 2.018642878620828e-06, - "loss": 1.2036, + "epoch": 1.13, + "grad_norm": 68.93138885498047, + "learning_rate": 1.249466594151362e-05, + "loss": 0.6978, "step": 8971 }, { - "epoch": 2.7, - "grad_norm": 56.391624450683594, - "learning_rate": 2.0166382680164377e-06, - "loss": 1.6374, + "epoch": 1.13, + "grad_norm": 16.693071365356445, + "learning_rate": 1.249382922645693e-05, + "loss": 0.9754, "step": 8972 }, { - "epoch": 2.7, - "grad_norm": 16.56217384338379, - "learning_rate": 2.014633657412048e-06, - "loss": 1.2358, + "epoch": 1.13, + "grad_norm": 16.193252563476562, + "learning_rate": 1.2492992511400244e-05, + "loss": 2.1549, "step": 8973 }, { - "epoch": 2.7, - "grad_norm": 30.0115966796875, - "learning_rate": 2.012629046807658e-06, - "loss": 1.2294, + "epoch": 1.13, + "grad_norm": 11.039328575134277, + "learning_rate": 1.2492155796343557e-05, + "loss": 1.1695, "step": 8974 }, { - "epoch": 2.7, - "grad_norm": 10.70960807800293, - "learning_rate": 2.0106244362032676e-06, - "loss": 1.2672, + "epoch": 1.13, + "grad_norm": 15.025418281555176, + "learning_rate": 1.2491319081286867e-05, + "loss": 1.1417, "step": 8975 }, { - "epoch": 2.7, - "grad_norm": 12.158578872680664, - "learning_rate": 2.0086198255988776e-06, - "loss": 0.794, + "epoch": 1.13, + "grad_norm": 4.358430862426758, + "learning_rate": 1.2490482366230181e-05, + "loss": 0.8119, "step": 8976 }, { - "epoch": 2.7, - "grad_norm": 6.594705104827881, - "learning_rate": 2.0066152149944877e-06, - "loss": 0.964, + "epoch": 1.13, + "grad_norm": 12.447125434875488, + "learning_rate": 1.2489645651173495e-05, + "loss": 2.9231, "step": 8977 }, { - "epoch": 2.7, - "grad_norm": 18.69150161743164, - "learning_rate": 2.0046106043900974e-06, - "loss": 1.1616, + "epoch": 1.13, + "grad_norm": 17.1337833404541, + "learning_rate": 1.2488808936116807e-05, + "loss": 1.0778, "step": 8978 }, { - "epoch": 2.7, - "grad_norm": 19.576065063476562, - "learning_rate": 2.0026059937857075e-06, - "loss": 0.9053, + "epoch": 1.13, + "grad_norm": 6.489715576171875, + "learning_rate": 1.2487972221060119e-05, + "loss": 0.3476, "step": 8979 }, { - "epoch": 2.7, - "grad_norm": 30.62327003479004, - "learning_rate": 2.000601383181317e-06, - "loss": 1.0141, + "epoch": 1.13, + "grad_norm": 7.586145877838135, + "learning_rate": 1.2487135506003432e-05, + "loss": 1.4568, "step": 8980 }, { - "epoch": 2.7, - "grad_norm": 16.1602840423584, - "learning_rate": 1.9985967725769272e-06, - "loss": 1.0071, + "epoch": 1.13, + "grad_norm": 36.23131561279297, + "learning_rate": 1.2486298790946744e-05, + "loss": 1.7715, "step": 8981 }, { - "epoch": 2.7, - "grad_norm": 16.035024642944336, - "learning_rate": 1.996592161972537e-06, - "loss": 1.9886, + "epoch": 1.13, + "grad_norm": 10.976786613464355, + "learning_rate": 1.2485462075890056e-05, + "loss": 0.4711, "step": 8982 }, { - "epoch": 2.7, - "grad_norm": 32.672157287597656, - "learning_rate": 1.994587551368147e-06, - "loss": 1.9142, + "epoch": 1.13, + "grad_norm": 10.669611930847168, + "learning_rate": 1.2484625360833368e-05, + "loss": 1.7268, "step": 8983 }, { - "epoch": 2.7, - "grad_norm": 34.7219352722168, - "learning_rate": 1.9925829407637566e-06, - "loss": 1.6279, + "epoch": 1.13, + "grad_norm": 10.771439552307129, + "learning_rate": 1.2483788645776682e-05, + "loss": 1.6016, "step": 8984 }, { - "epoch": 2.7, - "grad_norm": 38.268131256103516, - "learning_rate": 1.9905783301593667e-06, - "loss": 1.3426, + "epoch": 1.13, + "grad_norm": 16.643007278442383, + "learning_rate": 1.2482951930719995e-05, + "loss": 1.1995, "step": 8985 }, { - "epoch": 2.7, - "grad_norm": 10.52354621887207, - "learning_rate": 1.988573719554977e-06, - "loss": 1.3539, + "epoch": 1.13, + "grad_norm": 8.137476921081543, + "learning_rate": 1.2482115215663306e-05, + "loss": 1.8578, "step": 8986 }, { - "epoch": 2.7, - "grad_norm": 19.19072914123535, - "learning_rate": 1.9865691089505865e-06, - "loss": 1.7263, + "epoch": 1.13, + "grad_norm": 11.106561660766602, + "learning_rate": 1.248127850060662e-05, + "loss": 0.9672, "step": 8987 }, { - "epoch": 2.7, - "grad_norm": 35.17476272583008, - "learning_rate": 1.9845644983461965e-06, - "loss": 1.2377, + "epoch": 1.13, + "grad_norm": 56.206581115722656, + "learning_rate": 1.2480441785549933e-05, + "loss": 0.7559, "step": 8988 }, { - "epoch": 2.7, - "grad_norm": 23.662670135498047, - "learning_rate": 1.982559887741806e-06, - "loss": 0.9037, + "epoch": 1.13, + "grad_norm": 12.278787612915039, + "learning_rate": 1.2479605070493243e-05, + "loss": 1.7761, "step": 8989 }, { - "epoch": 2.7, - "grad_norm": 25.2396297454834, - "learning_rate": 1.9805552771374163e-06, - "loss": 1.8102, + "epoch": 1.13, + "grad_norm": 50.02405548095703, + "learning_rate": 1.2478768355436557e-05, + "loss": 2.2794, "step": 8990 }, { - "epoch": 2.7, - "grad_norm": 8.776323318481445, - "learning_rate": 1.9785506665330264e-06, - "loss": 0.7533, + "epoch": 1.13, + "grad_norm": 58.88485336303711, + "learning_rate": 1.247793164037987e-05, + "loss": 3.1132, "step": 8991 }, { - "epoch": 2.7, - "grad_norm": 16.68621253967285, - "learning_rate": 1.976546055928636e-06, - "loss": 1.3865, + "epoch": 1.13, + "grad_norm": 9.378216743469238, + "learning_rate": 1.2477094925323183e-05, + "loss": 1.1562, "step": 8992 }, { - "epoch": 2.7, - "grad_norm": 17.21820640563965, - "learning_rate": 1.974541445324246e-06, - "loss": 1.0538, + "epoch": 1.13, + "grad_norm": 43.093109130859375, + "learning_rate": 1.2476258210266494e-05, + "loss": 2.3941, "step": 8993 }, { - "epoch": 2.7, - "grad_norm": 13.057286262512207, - "learning_rate": 1.9725368347198558e-06, - "loss": 1.1173, + "epoch": 1.13, + "grad_norm": 21.89565658569336, + "learning_rate": 1.2475421495209808e-05, + "loss": 1.4894, "step": 8994 }, { - "epoch": 2.7, - "grad_norm": 16.948637008666992, - "learning_rate": 1.9705322241154654e-06, - "loss": 0.5743, + "epoch": 1.13, + "grad_norm": 14.45287036895752, + "learning_rate": 1.247458478015312e-05, + "loss": 1.5325, "step": 8995 }, { - "epoch": 2.7, - "grad_norm": 11.57693099975586, - "learning_rate": 1.9685276135110755e-06, - "loss": 1.2016, + "epoch": 1.13, + "grad_norm": 10.38625431060791, + "learning_rate": 1.2473748065096432e-05, + "loss": 0.2503, "step": 8996 }, { - "epoch": 2.71, - "grad_norm": 25.845434188842773, - "learning_rate": 1.9665230029066856e-06, - "loss": 1.6141, + "epoch": 1.13, + "grad_norm": 17.61884307861328, + "learning_rate": 1.2472911350039744e-05, + "loss": 1.1376, "step": 8997 }, { - "epoch": 2.71, - "grad_norm": 11.109925270080566, - "learning_rate": 1.9645183923022953e-06, - "loss": 1.2974, + "epoch": 1.13, + "grad_norm": 17.80261993408203, + "learning_rate": 1.2472074634983058e-05, + "loss": 0.8628, "step": 8998 }, { - "epoch": 2.71, - "grad_norm": 17.424144744873047, - "learning_rate": 1.9625137816979054e-06, - "loss": 1.0136, + "epoch": 1.13, + "grad_norm": 23.364965438842773, + "learning_rate": 1.2471237919926371e-05, + "loss": 0.5915, "step": 8999 }, { - "epoch": 2.71, - "grad_norm": 13.590910911560059, - "learning_rate": 1.9605091710935154e-06, - "loss": 2.0274, - "step": 9000 - }, - { - "epoch": 2.71, - "eval_loss": 0.16217102110385895, - "eval_runtime": 43.386, - "eval_samples_per_second": 34.089, - "eval_steps_per_second": 34.089, + "epoch": 1.13, + "grad_norm": 9.120946884155273, + "learning_rate": 1.2470401204869682e-05, + "loss": 1.3486, "step": 9000 }, { - "epoch": 2.71, - "grad_norm": 16.488697052001953, - "learning_rate": 1.958504560489125e-06, - "loss": 1.3625, + "epoch": 1.13, + "grad_norm": 14.520109176635742, + "learning_rate": 1.2469564489812995e-05, + "loss": 1.9042, "step": 9001 }, { - "epoch": 2.71, - "grad_norm": 9.223793983459473, - "learning_rate": 1.956499949884735e-06, - "loss": 1.2811, + "epoch": 1.13, + "grad_norm": 7.26044225692749, + "learning_rate": 1.2468727774756309e-05, + "loss": 0.4613, "step": 9002 }, { - "epoch": 2.71, - "grad_norm": 13.224336624145508, - "learning_rate": 1.9544953392803453e-06, - "loss": 1.5226, + "epoch": 1.13, + "grad_norm": 4.86530065536499, + "learning_rate": 1.2467891059699619e-05, + "loss": 0.1759, "step": 9003 }, { - "epoch": 2.71, - "grad_norm": 39.96490478515625, - "learning_rate": 1.952490728675955e-06, - "loss": 0.9551, + "epoch": 1.13, + "grad_norm": 12.943403244018555, + "learning_rate": 1.2467054344642933e-05, + "loss": 1.1472, "step": 9004 }, { - "epoch": 2.71, - "grad_norm": 8.030670166015625, - "learning_rate": 1.9504861180715646e-06, - "loss": 0.5237, + "epoch": 1.13, + "grad_norm": 12.199639320373535, + "learning_rate": 1.2466217629586246e-05, + "loss": 0.5435, "step": 9005 }, { - "epoch": 2.71, - "grad_norm": 45.17795944213867, - "learning_rate": 1.9484815074671747e-06, - "loss": 1.1533, + "epoch": 1.13, + "grad_norm": 11.572731018066406, + "learning_rate": 1.2465380914529558e-05, + "loss": 2.8074, "step": 9006 }, { - "epoch": 2.71, - "grad_norm": 13.118334770202637, - "learning_rate": 1.9464768968627843e-06, - "loss": 1.3035, + "epoch": 1.13, + "grad_norm": 14.404424667358398, + "learning_rate": 1.246454419947287e-05, + "loss": 1.2332, "step": 9007 }, { - "epoch": 2.71, - "grad_norm": 13.178396224975586, - "learning_rate": 1.9444722862583944e-06, - "loss": 1.2787, + "epoch": 1.13, + "grad_norm": 16.319948196411133, + "learning_rate": 1.2463707484416184e-05, + "loss": 1.5866, "step": 9008 }, { - "epoch": 2.71, - "grad_norm": 12.746395111083984, - "learning_rate": 1.942467675654004e-06, - "loss": 1.3914, + "epoch": 1.13, + "grad_norm": 8.145049095153809, + "learning_rate": 1.2462870769359496e-05, + "loss": 0.8607, "step": 9009 }, { - "epoch": 2.71, - "grad_norm": 25.33348274230957, - "learning_rate": 1.940463065049614e-06, - "loss": 1.2344, + "epoch": 1.13, + "grad_norm": 57.9113883972168, + "learning_rate": 1.2462034054302808e-05, + "loss": 2.2547, "step": 9010 }, { - "epoch": 2.71, - "grad_norm": 18.61649513244629, - "learning_rate": 1.9384584544452242e-06, - "loss": 0.9396, + "epoch": 1.13, + "grad_norm": 34.40373229980469, + "learning_rate": 1.246119733924612e-05, + "loss": 1.0205, "step": 9011 }, { - "epoch": 2.71, - "grad_norm": 98.35498809814453, - "learning_rate": 1.936453843840834e-06, - "loss": 2.0006, + "epoch": 1.13, + "grad_norm": 6.570141315460205, + "learning_rate": 1.2460360624189433e-05, + "loss": 1.4808, "step": 9012 }, { - "epoch": 2.71, - "grad_norm": 12.018935203552246, - "learning_rate": 1.934449233236444e-06, - "loss": 1.1039, + "epoch": 1.13, + "grad_norm": 22.321439743041992, + "learning_rate": 1.2459523909132747e-05, + "loss": 1.64, "step": 9013 }, { - "epoch": 2.71, - "grad_norm": 18.714567184448242, - "learning_rate": 1.932444622632054e-06, - "loss": 1.2751, + "epoch": 1.13, + "grad_norm": 24.136550903320312, + "learning_rate": 1.2458687194076057e-05, + "loss": 1.4336, "step": 9014 }, { - "epoch": 2.71, - "grad_norm": 41.53782653808594, - "learning_rate": 1.9304400120276637e-06, - "loss": 1.7534, + "epoch": 1.13, + "grad_norm": 21.779146194458008, + "learning_rate": 1.2457850479019371e-05, + "loss": 1.6992, "step": 9015 }, { - "epoch": 2.71, - "grad_norm": 37.50891876220703, - "learning_rate": 1.928435401423274e-06, - "loss": 1.1007, + "epoch": 1.13, + "grad_norm": 16.919523239135742, + "learning_rate": 1.2457013763962685e-05, + "loss": 1.242, "step": 9016 }, { - "epoch": 2.71, - "grad_norm": 14.983277320861816, - "learning_rate": 1.9264307908188835e-06, - "loss": 1.6123, + "epoch": 1.13, + "grad_norm": 22.42137336730957, + "learning_rate": 1.2456177048905995e-05, + "loss": 0.4977, "step": 9017 }, { - "epoch": 2.71, - "grad_norm": 13.673733711242676, - "learning_rate": 1.9244261802144936e-06, - "loss": 1.0661, + "epoch": 1.13, + "grad_norm": 26.349485397338867, + "learning_rate": 1.2455340333849309e-05, + "loss": 0.5325, "step": 9018 }, { - "epoch": 2.71, - "grad_norm": 9.97967529296875, - "learning_rate": 1.9224215696101032e-06, - "loss": 1.5572, + "epoch": 1.13, + "grad_norm": 5.285884380340576, + "learning_rate": 1.2454503618792622e-05, + "loss": 1.6786, "step": 9019 }, { - "epoch": 2.71, - "grad_norm": 12.354124069213867, - "learning_rate": 1.9204169590057133e-06, - "loss": 1.2915, + "epoch": 1.13, + "grad_norm": 29.1854190826416, + "learning_rate": 1.2453666903735934e-05, + "loss": 1.9452, "step": 9020 }, { - "epoch": 2.71, - "grad_norm": 11.94011116027832, - "learning_rate": 1.918412348401323e-06, - "loss": 1.3119, + "epoch": 1.13, + "grad_norm": 47.3242301940918, + "learning_rate": 1.2452830188679246e-05, + "loss": 2.2183, "step": 9021 }, { - "epoch": 2.71, - "grad_norm": 11.489762306213379, - "learning_rate": 1.916407737796933e-06, - "loss": 1.1241, + "epoch": 1.13, + "grad_norm": 11.776655197143555, + "learning_rate": 1.245199347362256e-05, + "loss": 1.815, "step": 9022 }, { - "epoch": 2.71, - "grad_norm": 26.69053840637207, - "learning_rate": 1.914403127192543e-06, - "loss": 1.3275, + "epoch": 1.13, + "grad_norm": 16.114843368530273, + "learning_rate": 1.2451156758565872e-05, + "loss": 1.3109, "step": 9023 }, { - "epoch": 2.71, - "grad_norm": 15.625397682189941, - "learning_rate": 1.912398516588153e-06, - "loss": 1.2162, + "epoch": 1.13, + "grad_norm": 6.5335211753845215, + "learning_rate": 1.2450320043509184e-05, + "loss": 1.0004, "step": 9024 }, { - "epoch": 2.71, - "grad_norm": 23.008207321166992, - "learning_rate": 1.910393905983763e-06, - "loss": 1.4855, + "epoch": 1.13, + "grad_norm": 22.013607025146484, + "learning_rate": 1.2449483328452496e-05, + "loss": 1.4589, "step": 9025 }, { - "epoch": 2.71, - "grad_norm": 122.37059783935547, - "learning_rate": 1.908389295379373e-06, - "loss": 1.5003, + "epoch": 1.13, + "grad_norm": 11.52690601348877, + "learning_rate": 1.244864661339581e-05, + "loss": 0.9607, "step": 9026 }, { - "epoch": 2.71, - "grad_norm": 47.32773208618164, - "learning_rate": 1.9063846847749826e-06, - "loss": 2.3885, + "epoch": 1.13, + "grad_norm": 13.754624366760254, + "learning_rate": 1.2447809898339123e-05, + "loss": 1.4942, "step": 9027 }, { - "epoch": 2.71, - "grad_norm": 10.500248908996582, - "learning_rate": 1.9043800741705925e-06, - "loss": 1.3469, + "epoch": 1.13, + "grad_norm": 11.588741302490234, + "learning_rate": 1.2446973183282433e-05, + "loss": 1.6175, "step": 9028 }, { - "epoch": 2.71, - "grad_norm": 12.691232681274414, - "learning_rate": 1.9023754635662024e-06, - "loss": 1.5982, + "epoch": 1.13, + "grad_norm": 12.320586204528809, + "learning_rate": 1.2446136468225747e-05, + "loss": 0.9224, "step": 9029 }, { - "epoch": 2.71, - "grad_norm": 15.978590965270996, - "learning_rate": 1.9003708529618122e-06, - "loss": 0.9491, + "epoch": 1.13, + "grad_norm": 7.940244197845459, + "learning_rate": 1.244529975316906e-05, + "loss": 1.0349, "step": 9030 }, { - "epoch": 2.72, - "grad_norm": 22.821592330932617, - "learning_rate": 1.8983662423574223e-06, - "loss": 1.3128, + "epoch": 1.13, + "grad_norm": 12.41991901397705, + "learning_rate": 1.244446303811237e-05, + "loss": 0.7951, "step": 9031 }, { - "epoch": 2.72, - "grad_norm": 33.413455963134766, - "learning_rate": 1.896361631753032e-06, - "loss": 2.5185, + "epoch": 1.13, + "grad_norm": 15.359567642211914, + "learning_rate": 1.2443626323055684e-05, + "loss": 0.455, "step": 9032 }, { - "epoch": 2.72, - "grad_norm": 14.448841094970703, - "learning_rate": 1.894357021148642e-06, - "loss": 1.7291, + "epoch": 1.13, + "grad_norm": 9.817809104919434, + "learning_rate": 1.2442789607998998e-05, + "loss": 0.5092, "step": 9033 }, { - "epoch": 2.72, - "grad_norm": 17.098773956298828, - "learning_rate": 1.892352410544252e-06, - "loss": 0.7803, + "epoch": 1.13, + "grad_norm": 15.863314628601074, + "learning_rate": 1.244195289294231e-05, + "loss": 1.0122, "step": 9034 }, { - "epoch": 2.72, - "grad_norm": 22.17733383178711, - "learning_rate": 1.8903477999398618e-06, - "loss": 2.054, + "epoch": 1.13, + "grad_norm": 39.14425277709961, + "learning_rate": 1.2441116177885622e-05, + "loss": 3.0413, "step": 9035 }, { - "epoch": 2.72, - "grad_norm": 18.0053768157959, - "learning_rate": 1.8883431893354717e-06, - "loss": 0.9891, + "epoch": 1.13, + "grad_norm": 24.42272186279297, + "learning_rate": 1.2440279462828934e-05, + "loss": 2.5408, "step": 9036 }, { - "epoch": 2.72, - "grad_norm": 7.076298713684082, - "learning_rate": 1.8863385787310818e-06, - "loss": 0.6043, + "epoch": 1.13, + "grad_norm": 12.801360130310059, + "learning_rate": 1.2439442747772248e-05, + "loss": 1.0546, "step": 9037 }, { - "epoch": 2.72, - "grad_norm": 10.31680965423584, - "learning_rate": 1.8843339681266914e-06, - "loss": 0.6896, + "epoch": 1.13, + "grad_norm": 16.35565948486328, + "learning_rate": 1.243860603271556e-05, + "loss": 0.3435, "step": 9038 }, { - "epoch": 2.72, - "grad_norm": 34.10591125488281, - "learning_rate": 1.8823293575223015e-06, - "loss": 1.6691, + "epoch": 1.13, + "grad_norm": 29.577281951904297, + "learning_rate": 1.2437769317658871e-05, + "loss": 1.5476, "step": 9039 }, { - "epoch": 2.72, - "grad_norm": 19.84099006652832, - "learning_rate": 1.8803247469179114e-06, - "loss": 1.7946, + "epoch": 1.13, + "grad_norm": 13.342458724975586, + "learning_rate": 1.2436932602602185e-05, + "loss": 2.3566, "step": 9040 }, { - "epoch": 2.72, - "grad_norm": 16.761245727539062, - "learning_rate": 1.8783201363135213e-06, - "loss": 1.7972, + "epoch": 1.13, + "grad_norm": 30.523876190185547, + "learning_rate": 1.2436095887545499e-05, + "loss": 1.688, "step": 9041 }, { - "epoch": 2.72, - "grad_norm": 19.77780532836914, - "learning_rate": 1.8763155257091311e-06, - "loss": 1.0711, + "epoch": 1.13, + "grad_norm": 23.766157150268555, + "learning_rate": 1.2435259172488809e-05, + "loss": 2.5348, "step": 9042 }, { - "epoch": 2.72, - "grad_norm": 12.905445098876953, - "learning_rate": 1.8743109151047412e-06, - "loss": 0.79, + "epoch": 1.13, + "grad_norm": 19.39542007446289, + "learning_rate": 1.2434422457432123e-05, + "loss": 2.1838, "step": 9043 }, { - "epoch": 2.72, - "grad_norm": 34.32182312011719, - "learning_rate": 1.8723063045003509e-06, - "loss": 1.5714, + "epoch": 1.14, + "grad_norm": 26.63437271118164, + "learning_rate": 1.2433585742375436e-05, + "loss": 2.7722, "step": 9044 }, { - "epoch": 2.72, - "grad_norm": 15.384803771972656, - "learning_rate": 1.870301693895961e-06, - "loss": 0.9761, + "epoch": 1.14, + "grad_norm": 9.547733306884766, + "learning_rate": 1.2432749027318747e-05, + "loss": 0.6108, "step": 9045 }, { - "epoch": 2.72, - "grad_norm": 10.162015914916992, - "learning_rate": 1.8682970832915708e-06, - "loss": 1.0334, + "epoch": 1.14, + "grad_norm": 4.224358081817627, + "learning_rate": 1.243191231226206e-05, + "loss": 0.2862, "step": 9046 }, { - "epoch": 2.72, - "grad_norm": 21.561569213867188, - "learning_rate": 1.8662924726871807e-06, - "loss": 0.7688, + "epoch": 1.14, + "grad_norm": 13.092987060546875, + "learning_rate": 1.2431075597205374e-05, + "loss": 1.6516, "step": 9047 }, { - "epoch": 2.72, - "grad_norm": 8.59749698638916, - "learning_rate": 1.8642878620827906e-06, - "loss": 0.533, + "epoch": 1.14, + "grad_norm": 10.587092399597168, + "learning_rate": 1.2430238882148686e-05, + "loss": 1.8526, "step": 9048 }, { - "epoch": 2.72, - "grad_norm": 17.311405181884766, - "learning_rate": 1.8622832514784007e-06, - "loss": 0.9795, + "epoch": 1.14, + "grad_norm": 19.686052322387695, + "learning_rate": 1.2429402167091998e-05, + "loss": 1.1136, "step": 9049 }, { - "epoch": 2.72, - "grad_norm": 67.3037109375, - "learning_rate": 1.8602786408740103e-06, - "loss": 2.6467, + "epoch": 1.14, + "grad_norm": 8.99689769744873, + "learning_rate": 1.242856545203531e-05, + "loss": 2.2862, "step": 9050 }, { - "epoch": 2.72, - "grad_norm": 12.96740436553955, - "learning_rate": 1.8582740302696204e-06, - "loss": 1.1722, + "epoch": 1.14, + "grad_norm": 12.086695671081543, + "learning_rate": 1.2427728736978623e-05, + "loss": 0.4117, "step": 9051 }, { - "epoch": 2.72, - "grad_norm": 16.440349578857422, - "learning_rate": 1.85626941966523e-06, - "loss": 0.6379, + "epoch": 1.14, + "grad_norm": 18.265026092529297, + "learning_rate": 1.2426892021921935e-05, + "loss": 0.6975, "step": 9052 }, { - "epoch": 2.72, - "grad_norm": 35.48670196533203, - "learning_rate": 1.85426480906084e-06, - "loss": 1.4359, + "epoch": 1.14, + "grad_norm": 25.84575653076172, + "learning_rate": 1.2426055306865247e-05, + "loss": 0.4512, "step": 9053 }, { - "epoch": 2.72, - "grad_norm": 55.38397216796875, - "learning_rate": 1.85226019845645e-06, - "loss": 2.2976, + "epoch": 1.14, + "grad_norm": 20.959501266479492, + "learning_rate": 1.2425218591808561e-05, + "loss": 1.0482, "step": 9054 }, { - "epoch": 2.72, - "grad_norm": 20.49527931213379, - "learning_rate": 1.8502555878520597e-06, - "loss": 1.1748, + "epoch": 1.14, + "grad_norm": 13.340580940246582, + "learning_rate": 1.2424381876751875e-05, + "loss": 0.9483, "step": 9055 }, { - "epoch": 2.72, - "grad_norm": 11.808248519897461, - "learning_rate": 1.8482509772476698e-06, - "loss": 1.6608, + "epoch": 1.14, + "grad_norm": 9.03878116607666, + "learning_rate": 1.2423545161695185e-05, + "loss": 0.34, "step": 9056 }, { - "epoch": 2.72, - "grad_norm": 25.69436264038086, - "learning_rate": 1.8462463666432799e-06, - "loss": 1.0124, + "epoch": 1.14, + "grad_norm": 5.133035182952881, + "learning_rate": 1.2422708446638499e-05, + "loss": 0.5256, "step": 9057 }, { - "epoch": 2.72, - "grad_norm": 27.099803924560547, - "learning_rate": 1.8442417560388895e-06, - "loss": 1.4334, + "epoch": 1.14, + "grad_norm": 8.312919616699219, + "learning_rate": 1.2421871731581812e-05, + "loss": 2.1823, "step": 9058 }, { - "epoch": 2.72, - "grad_norm": 16.551921844482422, - "learning_rate": 1.8422371454344994e-06, - "loss": 1.4909, + "epoch": 1.14, + "grad_norm": 10.040514945983887, + "learning_rate": 1.2421035016525122e-05, + "loss": 0.9566, "step": 9059 }, { - "epoch": 2.72, - "grad_norm": 14.500848770141602, - "learning_rate": 1.8402325348301095e-06, - "loss": 1.1442, + "epoch": 1.14, + "grad_norm": 26.158926010131836, + "learning_rate": 1.2420198301468436e-05, + "loss": 1.1192, "step": 9060 }, { - "epoch": 2.72, - "grad_norm": 14.154791831970215, - "learning_rate": 1.8382279242257191e-06, - "loss": 1.8214, + "epoch": 1.14, + "grad_norm": 18.3576717376709, + "learning_rate": 1.241936158641175e-05, + "loss": 1.3846, "step": 9061 }, { - "epoch": 2.72, - "grad_norm": 28.4224796295166, - "learning_rate": 1.8362233136213292e-06, - "loss": 1.2986, + "epoch": 1.14, + "grad_norm": 106.99333953857422, + "learning_rate": 1.2418524871355062e-05, + "loss": 1.9476, "step": 9062 }, { - "epoch": 2.72, - "grad_norm": 14.508337020874023, - "learning_rate": 1.8342187030169393e-06, - "loss": 1.1846, + "epoch": 1.14, + "grad_norm": 19.000123977661133, + "learning_rate": 1.2417688156298374e-05, + "loss": 2.4688, "step": 9063 }, { - "epoch": 2.73, - "grad_norm": 17.251667022705078, - "learning_rate": 1.832214092412549e-06, - "loss": 1.2499, + "epoch": 1.14, + "grad_norm": 7.971542835235596, + "learning_rate": 1.2416851441241686e-05, + "loss": 1.0279, "step": 9064 }, { - "epoch": 2.73, - "grad_norm": 37.404449462890625, - "learning_rate": 1.8302094818081588e-06, - "loss": 0.9916, + "epoch": 1.14, + "grad_norm": 33.608123779296875, + "learning_rate": 1.2416014726185e-05, + "loss": 2.7088, "step": 9065 }, { - "epoch": 2.73, - "grad_norm": 13.81652545928955, - "learning_rate": 1.828204871203769e-06, - "loss": 1.2755, + "epoch": 1.14, + "grad_norm": 38.017791748046875, + "learning_rate": 1.2415178011128311e-05, + "loss": 2.9421, "step": 9066 }, { - "epoch": 2.73, - "grad_norm": 21.725704193115234, - "learning_rate": 1.8262002605993786e-06, - "loss": 1.1399, + "epoch": 1.14, + "grad_norm": 17.621917724609375, + "learning_rate": 1.2414341296071623e-05, + "loss": 1.0552, "step": 9067 }, { - "epoch": 2.73, - "grad_norm": 10.298942565917969, - "learning_rate": 1.8241956499949887e-06, - "loss": 0.7941, + "epoch": 1.14, + "grad_norm": 11.231819152832031, + "learning_rate": 1.2413504581014937e-05, + "loss": 1.1976, "step": 9068 }, { - "epoch": 2.73, - "grad_norm": 25.29086685180664, - "learning_rate": 1.8221910393905988e-06, - "loss": 1.3804, + "epoch": 1.14, + "grad_norm": 17.117918014526367, + "learning_rate": 1.241266786595825e-05, + "loss": 1.6858, "step": 9069 }, { - "epoch": 2.73, - "grad_norm": 18.70489501953125, - "learning_rate": 1.8201864287862084e-06, - "loss": 1.1317, + "epoch": 1.14, + "grad_norm": 16.66362953186035, + "learning_rate": 1.241183115090156e-05, + "loss": 0.8505, "step": 9070 }, { - "epoch": 2.73, - "grad_norm": 11.406014442443848, - "learning_rate": 1.8181818181818183e-06, - "loss": 0.4713, + "epoch": 1.14, + "grad_norm": 13.61717700958252, + "learning_rate": 1.2410994435844874e-05, + "loss": 3.3554, "step": 9071 }, { - "epoch": 2.73, - "grad_norm": 22.935609817504883, - "learning_rate": 1.8161772075774282e-06, - "loss": 1.6272, + "epoch": 1.14, + "grad_norm": 8.80383586883545, + "learning_rate": 1.2410157720788188e-05, + "loss": 0.3408, "step": 9072 }, { - "epoch": 2.73, - "grad_norm": 33.32746124267578, - "learning_rate": 1.814172596973038e-06, - "loss": 1.3237, + "epoch": 1.14, + "grad_norm": 9.178657531738281, + "learning_rate": 1.2409321005731498e-05, + "loss": 0.6548, "step": 9073 }, { - "epoch": 2.73, - "grad_norm": 8.96728801727295, - "learning_rate": 1.8121679863686481e-06, - "loss": 0.556, + "epoch": 1.14, + "grad_norm": 50.2661247253418, + "learning_rate": 1.2408484290674812e-05, + "loss": 2.264, "step": 9074 }, { - "epoch": 2.73, - "grad_norm": 23.04669761657715, - "learning_rate": 1.8101633757642578e-06, - "loss": 1.5039, + "epoch": 1.14, + "grad_norm": 17.373971939086914, + "learning_rate": 1.2407647575618124e-05, + "loss": 1.2542, "step": 9075 }, { - "epoch": 2.73, - "grad_norm": 10.999143600463867, - "learning_rate": 1.8081587651598679e-06, - "loss": 1.7855, + "epoch": 1.14, + "grad_norm": 9.856574058532715, + "learning_rate": 1.2406810860561438e-05, + "loss": 0.8595, "step": 9076 }, { - "epoch": 2.73, - "grad_norm": 21.91204261779785, - "learning_rate": 1.8061541545554777e-06, - "loss": 1.4705, + "epoch": 1.14, + "grad_norm": 27.904253005981445, + "learning_rate": 1.240597414550475e-05, + "loss": 2.2831, "step": 9077 }, { - "epoch": 2.73, - "grad_norm": 31.69646644592285, - "learning_rate": 1.8041495439510876e-06, - "loss": 1.5004, + "epoch": 1.14, + "grad_norm": 30.6309814453125, + "learning_rate": 1.2405137430448061e-05, + "loss": 1.7967, "step": 9078 }, { - "epoch": 2.73, - "grad_norm": 8.526796340942383, - "learning_rate": 1.8021449333466975e-06, - "loss": 0.8363, + "epoch": 1.14, + "grad_norm": 5.129134178161621, + "learning_rate": 1.2404300715391375e-05, + "loss": 1.6209, "step": 9079 }, { - "epoch": 2.73, - "grad_norm": 91.82820129394531, - "learning_rate": 1.8001403227423076e-06, - "loss": 2.5966, + "epoch": 1.14, + "grad_norm": 15.763965606689453, + "learning_rate": 1.2403464000334687e-05, + "loss": 3.2273, "step": 9080 }, { - "epoch": 2.73, - "grad_norm": 62.776432037353516, - "learning_rate": 1.7981357121379172e-06, - "loss": 2.0551, + "epoch": 1.14, + "grad_norm": 7.9168267250061035, + "learning_rate": 1.2402627285277999e-05, + "loss": 1.4169, "step": 9081 }, { - "epoch": 2.73, - "grad_norm": 7.502096652984619, - "learning_rate": 1.7961311015335273e-06, - "loss": 0.9674, + "epoch": 1.14, + "grad_norm": 9.040215492248535, + "learning_rate": 1.2401790570221313e-05, + "loss": 2.3169, "step": 9082 }, { - "epoch": 2.73, - "grad_norm": 13.519929885864258, - "learning_rate": 1.7941264909291372e-06, - "loss": 0.9669, + "epoch": 1.14, + "grad_norm": 12.345662117004395, + "learning_rate": 1.2400953855164626e-05, + "loss": 0.7656, "step": 9083 }, { - "epoch": 2.73, - "grad_norm": 18.463733673095703, - "learning_rate": 1.792121880324747e-06, - "loss": 1.557, + "epoch": 1.14, + "grad_norm": 32.8636360168457, + "learning_rate": 1.2400117140107937e-05, + "loss": 3.5922, "step": 9084 }, { - "epoch": 2.73, - "grad_norm": 18.185571670532227, - "learning_rate": 1.790117269720357e-06, - "loss": 1.9059, + "epoch": 1.14, + "grad_norm": 31.467174530029297, + "learning_rate": 1.239928042505125e-05, + "loss": 4.4755, "step": 9085 }, { - "epoch": 2.73, - "grad_norm": 11.766922950744629, - "learning_rate": 1.788112659115967e-06, - "loss": 1.3468, + "epoch": 1.14, + "grad_norm": 16.0479679107666, + "learning_rate": 1.2398443709994564e-05, + "loss": 1.2256, "step": 9086 }, { - "epoch": 2.73, - "grad_norm": 22.52054786682129, - "learning_rate": 1.7861080485115767e-06, - "loss": 0.9551, + "epoch": 1.14, + "grad_norm": 15.858236312866211, + "learning_rate": 1.2397606994937874e-05, + "loss": 1.4377, "step": 9087 }, { - "epoch": 2.73, - "grad_norm": 8.950528144836426, - "learning_rate": 1.7841034379071868e-06, - "loss": 0.6039, + "epoch": 1.14, + "grad_norm": 19.002588272094727, + "learning_rate": 1.2396770279881188e-05, + "loss": 1.7684, "step": 9088 }, { - "epoch": 2.73, - "grad_norm": 19.103931427001953, - "learning_rate": 1.7820988273027966e-06, - "loss": 1.0202, + "epoch": 1.14, + "grad_norm": 5.8403215408325195, + "learning_rate": 1.23959335648245e-05, + "loss": 0.4953, "step": 9089 }, { - "epoch": 2.73, - "grad_norm": 14.955410957336426, - "learning_rate": 1.7800942166984065e-06, - "loss": 1.4588, + "epoch": 1.14, + "grad_norm": 10.064393043518066, + "learning_rate": 1.2395096849767813e-05, + "loss": 0.698, "step": 9090 }, { - "epoch": 2.73, - "grad_norm": 49.647361755371094, - "learning_rate": 1.7780896060940164e-06, - "loss": 3.6037, + "epoch": 1.14, + "grad_norm": 14.260061264038086, + "learning_rate": 1.2394260134711125e-05, + "loss": 1.5792, "step": 9091 }, { - "epoch": 2.73, - "grad_norm": 111.04601287841797, - "learning_rate": 1.7760849954896263e-06, - "loss": 1.5603, + "epoch": 1.14, + "grad_norm": 24.716506958007812, + "learning_rate": 1.2393423419654437e-05, + "loss": 1.5605, "step": 9092 }, { - "epoch": 2.73, - "grad_norm": 49.5386962890625, - "learning_rate": 1.7740803848852361e-06, - "loss": 2.5773, + "epoch": 1.14, + "grad_norm": 8.207243919372559, + "learning_rate": 1.2392586704597751e-05, + "loss": 0.3092, "step": 9093 }, { - "epoch": 2.73, - "grad_norm": 12.374490737915039, - "learning_rate": 1.7720757742808462e-06, - "loss": 1.1015, + "epoch": 1.14, + "grad_norm": 12.12687873840332, + "learning_rate": 1.2391749989541061e-05, + "loss": 1.008, "step": 9094 }, { - "epoch": 2.73, - "grad_norm": 9.451532363891602, - "learning_rate": 1.7700711636764559e-06, - "loss": 0.4628, + "epoch": 1.14, + "grad_norm": 13.592269897460938, + "learning_rate": 1.2390913274484375e-05, + "loss": 1.4859, "step": 9095 }, { - "epoch": 2.73, - "grad_norm": 13.187180519104004, - "learning_rate": 1.768066553072066e-06, - "loss": 0.8442, + "epoch": 1.14, + "grad_norm": 27.075054168701172, + "learning_rate": 1.2390076559427688e-05, + "loss": 2.4585, "step": 9096 }, { - "epoch": 2.74, - "grad_norm": 12.320247650146484, - "learning_rate": 1.7660619424676758e-06, - "loss": 0.88, + "epoch": 1.14, + "grad_norm": 13.686470031738281, + "learning_rate": 1.2389239844371002e-05, + "loss": 1.3728, "step": 9097 }, { - "epoch": 2.74, - "grad_norm": 8.388055801391602, - "learning_rate": 1.7640573318632857e-06, - "loss": 0.7241, + "epoch": 1.14, + "grad_norm": 16.724834442138672, + "learning_rate": 1.2388403129314312e-05, + "loss": 1.2046, "step": 9098 }, { - "epoch": 2.74, - "grad_norm": 11.43036937713623, - "learning_rate": 1.7620527212588956e-06, - "loss": 0.8333, + "epoch": 1.14, + "grad_norm": 53.69195556640625, + "learning_rate": 1.2387566414257626e-05, + "loss": 1.6093, "step": 9099 }, { - "epoch": 2.74, - "grad_norm": 8.495409965515137, - "learning_rate": 1.7600481106545057e-06, - "loss": 1.1459, + "epoch": 1.14, + "grad_norm": 6.855299949645996, + "learning_rate": 1.238672969920094e-05, + "loss": 0.5229, "step": 9100 }, { - "epoch": 2.74, - "grad_norm": 11.421164512634277, - "learning_rate": 1.7580435000501153e-06, - "loss": 1.0815, + "epoch": 1.14, + "grad_norm": 19.45718002319336, + "learning_rate": 1.238589298414425e-05, + "loss": 1.5701, "step": 9101 }, { - "epoch": 2.74, - "grad_norm": 17.927734375, - "learning_rate": 1.7560388894457254e-06, - "loss": 1.1274, + "epoch": 1.14, + "grad_norm": 19.386138916015625, + "learning_rate": 1.2385056269087564e-05, + "loss": 0.9297, "step": 9102 }, { - "epoch": 2.74, - "grad_norm": 22.090795516967773, - "learning_rate": 1.7540342788413353e-06, - "loss": 0.8337, + "epoch": 1.14, + "grad_norm": 17.744094848632812, + "learning_rate": 1.2384219554030876e-05, + "loss": 1.6128, "step": 9103 }, { - "epoch": 2.74, - "grad_norm": 11.036092758178711, - "learning_rate": 1.752029668236945e-06, - "loss": 1.3825, + "epoch": 1.14, + "grad_norm": 8.07080364227295, + "learning_rate": 1.2383382838974188e-05, + "loss": 0.6567, "step": 9104 }, { - "epoch": 2.74, - "grad_norm": 10.002640724182129, - "learning_rate": 1.750025057632555e-06, - "loss": 1.174, + "epoch": 1.14, + "grad_norm": 23.605592727661133, + "learning_rate": 1.2382546123917501e-05, + "loss": 3.0387, "step": 9105 }, { - "epoch": 2.74, - "grad_norm": 13.688326835632324, - "learning_rate": 1.748020447028165e-06, - "loss": 1.0106, + "epoch": 1.14, + "grad_norm": 10.705403327941895, + "learning_rate": 1.2381709408860813e-05, + "loss": 0.4005, "step": 9106 }, { - "epoch": 2.74, - "grad_norm": 18.911972045898438, - "learning_rate": 1.7460158364237748e-06, - "loss": 1.3007, + "epoch": 1.14, + "grad_norm": 9.318358421325684, + "learning_rate": 1.2380872693804127e-05, + "loss": 0.8697, "step": 9107 }, { - "epoch": 2.74, - "grad_norm": 28.944271087646484, - "learning_rate": 1.7440112258193848e-06, - "loss": 1.5335, + "epoch": 1.14, + "grad_norm": 12.265621185302734, + "learning_rate": 1.2380035978747437e-05, + "loss": 0.9076, "step": 9108 }, { - "epoch": 2.74, - "grad_norm": 17.66625213623047, - "learning_rate": 1.7420066152149947e-06, - "loss": 0.7189, + "epoch": 1.14, + "grad_norm": 8.256604194641113, + "learning_rate": 1.237919926369075e-05, + "loss": 2.0401, "step": 9109 }, { - "epoch": 2.74, - "grad_norm": 28.595500946044922, - "learning_rate": 1.7400020046106044e-06, - "loss": 1.4251, + "epoch": 1.14, + "grad_norm": 12.32669734954834, + "learning_rate": 1.2378362548634064e-05, + "loss": 1.6128, "step": 9110 }, { - "epoch": 2.74, - "grad_norm": 11.355489730834961, - "learning_rate": 1.7379973940062145e-06, - "loss": 0.8031, + "epoch": 1.14, + "grad_norm": 10.947827339172363, + "learning_rate": 1.2377525833577375e-05, + "loss": 1.6705, "step": 9111 }, { - "epoch": 2.74, - "grad_norm": 20.97258949279785, - "learning_rate": 1.7359927834018241e-06, - "loss": 0.8911, + "epoch": 1.14, + "grad_norm": 29.249637603759766, + "learning_rate": 1.2376689118520688e-05, + "loss": 1.42, "step": 9112 }, { - "epoch": 2.74, - "grad_norm": 21.422672271728516, - "learning_rate": 1.7339881727974342e-06, - "loss": 1.5909, + "epoch": 1.14, + "grad_norm": 5.854145526885986, + "learning_rate": 1.2375852403464002e-05, + "loss": 0.3727, "step": 9113 }, { - "epoch": 2.74, - "grad_norm": 16.778446197509766, - "learning_rate": 1.7319835621930443e-06, - "loss": 0.5871, + "epoch": 1.14, + "grad_norm": 8.388299942016602, + "learning_rate": 1.2375015688407316e-05, + "loss": 0.7125, "step": 9114 }, { - "epoch": 2.74, - "grad_norm": 14.981059074401855, - "learning_rate": 1.729978951588654e-06, - "loss": 0.938, + "epoch": 1.14, + "grad_norm": 5.731067180633545, + "learning_rate": 1.2374178973350626e-05, + "loss": 0.5314, "step": 9115 }, { - "epoch": 2.74, - "grad_norm": 17.02187156677246, - "learning_rate": 1.7279743409842638e-06, - "loss": 1.4252, + "epoch": 1.14, + "grad_norm": 7.892755031585693, + "learning_rate": 1.237334225829394e-05, + "loss": 0.8244, "step": 9116 }, { - "epoch": 2.74, - "grad_norm": 20.162086486816406, - "learning_rate": 1.725969730379874e-06, - "loss": 0.9187, + "epoch": 1.14, + "grad_norm": 20.525279998779297, + "learning_rate": 1.2372505543237251e-05, + "loss": 2.1919, "step": 9117 }, { - "epoch": 2.74, - "grad_norm": 22.232948303222656, - "learning_rate": 1.7239651197754836e-06, - "loss": 1.1912, + "epoch": 1.14, + "grad_norm": 13.459932327270508, + "learning_rate": 1.2371668828180563e-05, + "loss": 1.0175, "step": 9118 }, { - "epoch": 2.74, - "grad_norm": 21.19387435913086, - "learning_rate": 1.7219605091710937e-06, - "loss": 0.7006, + "epoch": 1.14, + "grad_norm": 6.225473880767822, + "learning_rate": 1.2370832113123877e-05, + "loss": 0.7936, "step": 9119 }, { - "epoch": 2.74, - "grad_norm": 11.553994178771973, - "learning_rate": 1.7199558985667037e-06, - "loss": 0.7409, + "epoch": 1.14, + "grad_norm": 19.290508270263672, + "learning_rate": 1.2369995398067189e-05, + "loss": 2.8995, "step": 9120 }, { - "epoch": 2.74, - "eval_loss": 0.16245336830615997, - "eval_runtime": 43.6231, - "eval_samples_per_second": 33.904, - "eval_steps_per_second": 33.904, - "step": 9120 + "epoch": 1.14, + "grad_norm": 19.17864227294922, + "learning_rate": 1.2369158683010503e-05, + "loss": 0.7154, + "step": 9121 + }, + { + "epoch": 1.14, + "grad_norm": 6.110776424407959, + "learning_rate": 1.2368321967953813e-05, + "loss": 0.2638, + "step": 9122 + }, + { + "epoch": 1.14, + "grad_norm": 13.87114143371582, + "learning_rate": 1.2367485252897127e-05, + "loss": 1.0967, + "step": 9123 + }, + { + "epoch": 1.15, + "grad_norm": 8.471000671386719, + "learning_rate": 1.236664853784044e-05, + "loss": 1.3324, + "step": 9124 + }, + { + "epoch": 1.15, + "grad_norm": 7.963351726531982, + "learning_rate": 1.236581182278375e-05, + "loss": 1.545, + "step": 9125 + }, + { + "epoch": 1.15, + "grad_norm": 7.431926727294922, + "learning_rate": 1.2364975107727064e-05, + "loss": 0.3971, + "step": 9126 + }, + { + "epoch": 1.15, + "grad_norm": 26.25963592529297, + "learning_rate": 1.2364138392670378e-05, + "loss": 2.6638, + "step": 9127 + }, + { + "epoch": 1.15, + "grad_norm": 6.95812463760376, + "learning_rate": 1.236330167761369e-05, + "loss": 1.4096, + "step": 9128 + }, + { + "epoch": 1.15, + "grad_norm": 24.770366668701172, + "learning_rate": 1.2362464962557002e-05, + "loss": 2.2656, + "step": 9129 + }, + { + "epoch": 1.15, + "grad_norm": 10.823984146118164, + "learning_rate": 1.2361628247500315e-05, + "loss": 1.6969, + "step": 9130 + }, + { + "epoch": 1.15, + "grad_norm": 7.5049519538879395, + "learning_rate": 1.2360791532443627e-05, + "loss": 2.208, + "step": 9131 + }, + { + "epoch": 1.15, + "grad_norm": 18.726926803588867, + "learning_rate": 1.235995481738694e-05, + "loss": 1.0615, + "step": 9132 + }, + { + "epoch": 1.15, + "grad_norm": 18.43231964111328, + "learning_rate": 1.2359118102330253e-05, + "loss": 1.283, + "step": 9133 + }, + { + "epoch": 1.15, + "grad_norm": 20.206207275390625, + "learning_rate": 1.2358281387273565e-05, + "loss": 1.5557, + "step": 9134 + }, + { + "epoch": 1.15, + "grad_norm": 9.573138236999512, + "learning_rate": 1.2357444672216878e-05, + "loss": 0.3381, + "step": 9135 + }, + { + "epoch": 1.15, + "grad_norm": 7.645369529724121, + "learning_rate": 1.2356607957160189e-05, + "loss": 1.2922, + "step": 9136 + }, + { + "epoch": 1.15, + "grad_norm": 49.07716369628906, + "learning_rate": 1.2355771242103502e-05, + "loss": 3.4945, + "step": 9137 + }, + { + "epoch": 1.15, + "grad_norm": 4.765970706939697, + "learning_rate": 1.2354934527046816e-05, + "loss": 0.4129, + "step": 9138 + }, + { + "epoch": 1.15, + "grad_norm": 34.59571075439453, + "learning_rate": 1.2354097811990126e-05, + "loss": 2.1463, + "step": 9139 + }, + { + "epoch": 1.15, + "grad_norm": 25.068452835083008, + "learning_rate": 1.235326109693344e-05, + "loss": 1.5953, + "step": 9140 + }, + { + "epoch": 1.15, + "grad_norm": 18.312746047973633, + "learning_rate": 1.2352424381876754e-05, + "loss": 2.3889, + "step": 9141 + }, + { + "epoch": 1.15, + "grad_norm": 9.369312286376953, + "learning_rate": 1.2351587666820066e-05, + "loss": 0.1455, + "step": 9142 + }, + { + "epoch": 1.15, + "grad_norm": 13.723098754882812, + "learning_rate": 1.2350750951763377e-05, + "loss": 0.9427, + "step": 9143 + }, + { + "epoch": 1.15, + "grad_norm": 40.40791320800781, + "learning_rate": 1.2349914236706691e-05, + "loss": 1.8195, + "step": 9144 + }, + { + "epoch": 1.15, + "grad_norm": 20.710609436035156, + "learning_rate": 1.2349077521650003e-05, + "loss": 2.6318, + "step": 9145 + }, + { + "epoch": 1.15, + "grad_norm": 7.522031307220459, + "learning_rate": 1.2348240806593315e-05, + "loss": 0.2665, + "step": 9146 + }, + { + "epoch": 1.15, + "grad_norm": 5.169207572937012, + "learning_rate": 1.2347404091536627e-05, + "loss": 0.6525, + "step": 9147 + }, + { + "epoch": 1.15, + "grad_norm": 14.694474220275879, + "learning_rate": 1.234656737647994e-05, + "loss": 0.767, + "step": 9148 + }, + { + "epoch": 1.15, + "grad_norm": 19.8188533782959, + "learning_rate": 1.2345730661423254e-05, + "loss": 2.2416, + "step": 9149 + }, + { + "epoch": 1.15, + "grad_norm": 24.8552188873291, + "learning_rate": 1.2344893946366565e-05, + "loss": 1.8361, + "step": 9150 + }, + { + "epoch": 1.15, + "grad_norm": 22.294574737548828, + "learning_rate": 1.2344057231309878e-05, + "loss": 1.0202, + "step": 9151 + }, + { + "epoch": 1.15, + "grad_norm": 23.81131362915039, + "learning_rate": 1.2343220516253192e-05, + "loss": 2.1269, + "step": 9152 + }, + { + "epoch": 1.15, + "grad_norm": 19.063304901123047, + "learning_rate": 1.2342383801196502e-05, + "loss": 1.2329, + "step": 9153 + }, + { + "epoch": 1.15, + "grad_norm": 10.666409492492676, + "learning_rate": 1.2341547086139816e-05, + "loss": 1.3701, + "step": 9154 + }, + { + "epoch": 1.15, + "grad_norm": 9.337384223937988, + "learning_rate": 1.234071037108313e-05, + "loss": 0.4143, + "step": 9155 + }, + { + "epoch": 1.15, + "grad_norm": 39.719398498535156, + "learning_rate": 1.2339873656026441e-05, + "loss": 1.3722, + "step": 9156 + }, + { + "epoch": 1.15, + "grad_norm": 13.493542671203613, + "learning_rate": 1.2339036940969753e-05, + "loss": 1.4881, + "step": 9157 + }, + { + "epoch": 1.15, + "grad_norm": 10.449769973754883, + "learning_rate": 1.2338200225913067e-05, + "loss": 0.882, + "step": 9158 + }, + { + "epoch": 1.15, + "grad_norm": 14.406185150146484, + "learning_rate": 1.2337363510856379e-05, + "loss": 1.1041, + "step": 9159 + }, + { + "epoch": 1.15, + "grad_norm": 11.488336563110352, + "learning_rate": 1.2336526795799691e-05, + "loss": 0.9631, + "step": 9160 + }, + { + "epoch": 1.15, + "grad_norm": 21.735170364379883, + "learning_rate": 1.2335690080743003e-05, + "loss": 2.287, + "step": 9161 + }, + { + "epoch": 1.15, + "grad_norm": 19.89383316040039, + "learning_rate": 1.2334853365686316e-05, + "loss": 1.5426, + "step": 9162 + }, + { + "epoch": 1.15, + "grad_norm": 142.1265106201172, + "learning_rate": 1.233401665062963e-05, + "loss": 2.0707, + "step": 9163 + }, + { + "epoch": 1.15, + "grad_norm": 9.750329971313477, + "learning_rate": 1.233317993557294e-05, + "loss": 1.1023, + "step": 9164 + }, + { + "epoch": 1.15, + "grad_norm": 12.105993270874023, + "learning_rate": 1.2332343220516254e-05, + "loss": 1.6093, + "step": 9165 + }, + { + "epoch": 1.15, + "grad_norm": 13.682340621948242, + "learning_rate": 1.2331506505459568e-05, + "loss": 0.7277, + "step": 9166 + }, + { + "epoch": 1.15, + "grad_norm": 14.506525993347168, + "learning_rate": 1.2330669790402878e-05, + "loss": 1.3144, + "step": 9167 + }, + { + "epoch": 1.15, + "grad_norm": 8.809659004211426, + "learning_rate": 1.2329833075346192e-05, + "loss": 0.4419, + "step": 9168 + }, + { + "epoch": 1.15, + "grad_norm": 35.33649826049805, + "learning_rate": 1.2328996360289505e-05, + "loss": 0.847, + "step": 9169 + }, + { + "epoch": 1.15, + "grad_norm": 14.611030578613281, + "learning_rate": 1.2328159645232817e-05, + "loss": 1.426, + "step": 9170 + }, + { + "epoch": 1.15, + "grad_norm": 22.434511184692383, + "learning_rate": 1.2327322930176129e-05, + "loss": 1.5308, + "step": 9171 + }, + { + "epoch": 1.15, + "grad_norm": 6.115286827087402, + "learning_rate": 1.2326486215119443e-05, + "loss": 0.9465, + "step": 9172 + }, + { + "epoch": 1.15, + "grad_norm": 12.794254302978516, + "learning_rate": 1.2325649500062755e-05, + "loss": 1.4792, + "step": 9173 + }, + { + "epoch": 1.15, + "grad_norm": 10.999887466430664, + "learning_rate": 1.2324812785006067e-05, + "loss": 1.8432, + "step": 9174 + }, + { + "epoch": 1.15, + "grad_norm": 5.971545696258545, + "learning_rate": 1.2323976069949379e-05, + "loss": 0.3954, + "step": 9175 + }, + { + "epoch": 1.15, + "grad_norm": 11.400793075561523, + "learning_rate": 1.2323139354892692e-05, + "loss": 1.5969, + "step": 9176 + }, + { + "epoch": 1.15, + "grad_norm": 12.92943000793457, + "learning_rate": 1.2322302639836006e-05, + "loss": 2.1288, + "step": 9177 + }, + { + "epoch": 1.15, + "grad_norm": 6.143886089324951, + "learning_rate": 1.2321465924779316e-05, + "loss": 1.2216, + "step": 9178 + }, + { + "epoch": 1.15, + "grad_norm": 7.842630386352539, + "learning_rate": 1.232062920972263e-05, + "loss": 0.4645, + "step": 9179 + }, + { + "epoch": 1.15, + "grad_norm": 27.85214614868164, + "learning_rate": 1.2319792494665944e-05, + "loss": 1.5454, + "step": 9180 + }, + { + "epoch": 1.15, + "grad_norm": 14.218128204345703, + "learning_rate": 1.2318955779609254e-05, + "loss": 2.8992, + "step": 9181 + }, + { + "epoch": 1.15, + "grad_norm": 14.0691556930542, + "learning_rate": 1.2318119064552567e-05, + "loss": 1.0916, + "step": 9182 + }, + { + "epoch": 1.15, + "grad_norm": 24.324790954589844, + "learning_rate": 1.2317282349495881e-05, + "loss": 2.3248, + "step": 9183 + }, + { + "epoch": 1.15, + "grad_norm": 33.42656707763672, + "learning_rate": 1.2316445634439193e-05, + "loss": 3.127, + "step": 9184 + }, + { + "epoch": 1.15, + "grad_norm": 6.54421329498291, + "learning_rate": 1.2315608919382505e-05, + "loss": 2.6659, + "step": 9185 + }, + { + "epoch": 1.15, + "grad_norm": 9.281428337097168, + "learning_rate": 1.2314772204325817e-05, + "loss": 1.0513, + "step": 9186 + }, + { + "epoch": 1.15, + "grad_norm": 12.191452980041504, + "learning_rate": 1.231393548926913e-05, + "loss": 0.6355, + "step": 9187 + }, + { + "epoch": 1.15, + "grad_norm": 16.978662490844727, + "learning_rate": 1.2313098774212443e-05, + "loss": 1.0362, + "step": 9188 + }, + { + "epoch": 1.15, + "grad_norm": 40.00162887573242, + "learning_rate": 1.2312262059155754e-05, + "loss": 1.1716, + "step": 9189 + }, + { + "epoch": 1.15, + "grad_norm": 14.924356460571289, + "learning_rate": 1.2311425344099068e-05, + "loss": 1.7509, + "step": 9190 + }, + { + "epoch": 1.15, + "grad_norm": 9.85426139831543, + "learning_rate": 1.2310588629042382e-05, + "loss": 0.3192, + "step": 9191 + }, + { + "epoch": 1.15, + "grad_norm": 11.907785415649414, + "learning_rate": 1.2309751913985692e-05, + "loss": 1.013, + "step": 9192 + }, + { + "epoch": 1.15, + "grad_norm": 8.675817489624023, + "learning_rate": 1.2308915198929006e-05, + "loss": 0.9179, + "step": 9193 + }, + { + "epoch": 1.15, + "grad_norm": 42.699310302734375, + "learning_rate": 1.230807848387232e-05, + "loss": 2.2475, + "step": 9194 + }, + { + "epoch": 1.15, + "grad_norm": 20.156429290771484, + "learning_rate": 1.230724176881563e-05, + "loss": 2.1368, + "step": 9195 + }, + { + "epoch": 1.15, + "grad_norm": 12.853020668029785, + "learning_rate": 1.2306405053758943e-05, + "loss": 0.9512, + "step": 9196 + }, + { + "epoch": 1.15, + "grad_norm": 8.198304176330566, + "learning_rate": 1.2305568338702257e-05, + "loss": 0.6033, + "step": 9197 + }, + { + "epoch": 1.15, + "grad_norm": 11.125846862792969, + "learning_rate": 1.2304731623645569e-05, + "loss": 0.7922, + "step": 9198 + }, + { + "epoch": 1.15, + "grad_norm": 34.10848617553711, + "learning_rate": 1.230389490858888e-05, + "loss": 1.4218, + "step": 9199 + }, + { + "epoch": 1.15, + "grad_norm": 11.90444564819336, + "learning_rate": 1.2303058193532193e-05, + "loss": 1.9948, + "step": 9200 + }, + { + "epoch": 1.15, + "eval_loss": 0.09518160670995712, + "eval_runtime": 94.306, + "eval_samples_per_second": 37.559, + "eval_steps_per_second": 37.559, + "step": 9200 + }, + { + "epoch": 1.15, + "grad_norm": 7.9429755210876465, + "learning_rate": 1.2302221478475506e-05, + "loss": 0.9086, + "step": 9201 + }, + { + "epoch": 1.15, + "grad_norm": 23.948022842407227, + "learning_rate": 1.2301384763418818e-05, + "loss": 1.653, + "step": 9202 + }, + { + "epoch": 1.15, + "grad_norm": 27.833898544311523, + "learning_rate": 1.230054804836213e-05, + "loss": 1.4532, + "step": 9203 + }, + { + "epoch": 1.16, + "grad_norm": 10.013521194458008, + "learning_rate": 1.2299711333305444e-05, + "loss": 0.9502, + "step": 9204 + }, + { + "epoch": 1.16, + "grad_norm": 6.075160503387451, + "learning_rate": 1.2298874618248758e-05, + "loss": 1.1342, + "step": 9205 + }, + { + "epoch": 1.16, + "grad_norm": 30.68280792236328, + "learning_rate": 1.2298037903192068e-05, + "loss": 1.3133, + "step": 9206 + }, + { + "epoch": 1.16, + "grad_norm": 11.82780933380127, + "learning_rate": 1.2297201188135382e-05, + "loss": 1.5006, + "step": 9207 + }, + { + "epoch": 1.16, + "grad_norm": 5.6558427810668945, + "learning_rate": 1.2296364473078695e-05, + "loss": 0.924, + "step": 9208 + }, + { + "epoch": 1.16, + "grad_norm": 5.863631725311279, + "learning_rate": 1.2295527758022005e-05, + "loss": 0.4676, + "step": 9209 + }, + { + "epoch": 1.16, + "grad_norm": 19.110694885253906, + "learning_rate": 1.2294691042965319e-05, + "loss": 1.1357, + "step": 9210 + }, + { + "epoch": 1.16, + "grad_norm": 66.21870422363281, + "learning_rate": 1.2293854327908633e-05, + "loss": 1.5988, + "step": 9211 + }, + { + "epoch": 1.16, + "grad_norm": 11.78620719909668, + "learning_rate": 1.2293017612851945e-05, + "loss": 1.1191, + "step": 9212 + }, + { + "epoch": 1.16, + "grad_norm": 12.453399658203125, + "learning_rate": 1.2292180897795257e-05, + "loss": 0.5343, + "step": 9213 + }, + { + "epoch": 1.16, + "grad_norm": 19.970155715942383, + "learning_rate": 1.2291344182738569e-05, + "loss": 0.6287, + "step": 9214 + }, + { + "epoch": 1.16, + "grad_norm": 9.808201789855957, + "learning_rate": 1.2290507467681882e-05, + "loss": 1.2625, + "step": 9215 + }, + { + "epoch": 1.16, + "grad_norm": 16.423688888549805, + "learning_rate": 1.2289670752625194e-05, + "loss": 0.4894, + "step": 9216 + }, + { + "epoch": 1.16, + "grad_norm": 14.505757331848145, + "learning_rate": 1.2288834037568506e-05, + "loss": 1.3869, + "step": 9217 + }, + { + "epoch": 1.16, + "grad_norm": 3.26084303855896, + "learning_rate": 1.228799732251182e-05, + "loss": 0.1848, + "step": 9218 + }, + { + "epoch": 1.16, + "grad_norm": 96.6875228881836, + "learning_rate": 1.2287160607455133e-05, + "loss": 1.6699, + "step": 9219 + }, + { + "epoch": 1.16, + "grad_norm": 131.69342041015625, + "learning_rate": 1.2286323892398444e-05, + "loss": 2.1287, + "step": 9220 + }, + { + "epoch": 1.16, + "grad_norm": 3.78143310546875, + "learning_rate": 1.2285487177341757e-05, + "loss": 0.4735, + "step": 9221 + }, + { + "epoch": 1.16, + "grad_norm": 20.10757827758789, + "learning_rate": 1.2284650462285071e-05, + "loss": 2.2554, + "step": 9222 + }, + { + "epoch": 1.16, + "grad_norm": 7.9812092781066895, + "learning_rate": 1.2283813747228381e-05, + "loss": 0.4739, + "step": 9223 + }, + { + "epoch": 1.16, + "grad_norm": 3.661271810531616, + "learning_rate": 1.2282977032171695e-05, + "loss": 0.2297, + "step": 9224 + }, + { + "epoch": 1.16, + "grad_norm": 16.200122833251953, + "learning_rate": 1.2282140317115009e-05, + "loss": 3.4471, + "step": 9225 + }, + { + "epoch": 1.16, + "grad_norm": 9.316689491271973, + "learning_rate": 1.228130360205832e-05, + "loss": 0.4403, + "step": 9226 + }, + { + "epoch": 1.16, + "grad_norm": 7.576048374176025, + "learning_rate": 1.2280466887001632e-05, + "loss": 0.1168, + "step": 9227 + }, + { + "epoch": 1.16, + "grad_norm": 6.368162155151367, + "learning_rate": 1.2279630171944944e-05, + "loss": 0.5202, + "step": 9228 + }, + { + "epoch": 1.16, + "grad_norm": 102.35724639892578, + "learning_rate": 1.2278793456888258e-05, + "loss": 1.4843, + "step": 9229 + }, + { + "epoch": 1.16, + "grad_norm": 12.44650650024414, + "learning_rate": 1.227795674183157e-05, + "loss": 2.0575, + "step": 9230 + }, + { + "epoch": 1.16, + "grad_norm": 9.62086296081543, + "learning_rate": 1.2277120026774882e-05, + "loss": 1.6411, + "step": 9231 + }, + { + "epoch": 1.16, + "grad_norm": 35.608333587646484, + "learning_rate": 1.2276283311718196e-05, + "loss": 0.4515, + "step": 9232 + }, + { + "epoch": 1.16, + "grad_norm": 29.228580474853516, + "learning_rate": 1.227544659666151e-05, + "loss": 2.278, + "step": 9233 + }, + { + "epoch": 1.16, + "grad_norm": 14.064435005187988, + "learning_rate": 1.227460988160482e-05, + "loss": 2.7028, + "step": 9234 + }, + { + "epoch": 1.16, + "grad_norm": 11.022710800170898, + "learning_rate": 1.2273773166548133e-05, + "loss": 0.677, + "step": 9235 + }, + { + "epoch": 1.16, + "grad_norm": 11.585491180419922, + "learning_rate": 1.2272936451491447e-05, + "loss": 1.2076, + "step": 9236 + }, + { + "epoch": 1.16, + "grad_norm": 12.864282608032227, + "learning_rate": 1.2272099736434757e-05, + "loss": 1.8201, + "step": 9237 + }, + { + "epoch": 1.16, + "grad_norm": 35.674781799316406, + "learning_rate": 1.227126302137807e-05, + "loss": 1.6032, + "step": 9238 + }, + { + "epoch": 1.16, + "grad_norm": 20.760175704956055, + "learning_rate": 1.2270426306321383e-05, + "loss": 2.4856, + "step": 9239 + }, + { + "epoch": 1.16, + "grad_norm": 10.651836395263672, + "learning_rate": 1.2269589591264696e-05, + "loss": 0.824, + "step": 9240 + }, + { + "epoch": 1.16, + "grad_norm": 15.389017105102539, + "learning_rate": 1.2268752876208008e-05, + "loss": 1.4099, + "step": 9241 + }, + { + "epoch": 1.16, + "grad_norm": 15.191420555114746, + "learning_rate": 1.226791616115132e-05, + "loss": 2.6674, + "step": 9242 + }, + { + "epoch": 1.16, + "grad_norm": 23.26456069946289, + "learning_rate": 1.2267079446094634e-05, + "loss": 2.9811, + "step": 9243 + }, + { + "epoch": 1.16, + "grad_norm": 8.608368873596191, + "learning_rate": 1.2266242731037946e-05, + "loss": 1.2524, + "step": 9244 + }, + { + "epoch": 1.16, + "grad_norm": 14.599865913391113, + "learning_rate": 1.2265406015981258e-05, + "loss": 0.48, + "step": 9245 + }, + { + "epoch": 1.16, + "grad_norm": 10.090410232543945, + "learning_rate": 1.2264569300924571e-05, + "loss": 0.8899, + "step": 9246 + }, + { + "epoch": 1.16, + "grad_norm": 23.541912078857422, + "learning_rate": 1.2263732585867885e-05, + "loss": 1.5031, + "step": 9247 + }, + { + "epoch": 1.16, + "grad_norm": 17.355106353759766, + "learning_rate": 1.2262895870811195e-05, + "loss": 1.131, + "step": 9248 + }, + { + "epoch": 1.16, + "grad_norm": 7.4488019943237305, + "learning_rate": 1.2262059155754509e-05, + "loss": 0.4096, + "step": 9249 + }, + { + "epoch": 1.16, + "grad_norm": 37.859893798828125, + "learning_rate": 1.2261222440697823e-05, + "loss": 1.2356, + "step": 9250 + }, + { + "epoch": 1.16, + "grad_norm": 8.11927604675293, + "learning_rate": 1.2260385725641133e-05, + "loss": 0.4152, + "step": 9251 + }, + { + "epoch": 1.16, + "grad_norm": 107.45669555664062, + "learning_rate": 1.2259549010584447e-05, + "loss": 2.2546, + "step": 9252 + }, + { + "epoch": 1.16, + "grad_norm": 19.212947845458984, + "learning_rate": 1.2258712295527759e-05, + "loss": 1.3481, + "step": 9253 + }, + { + "epoch": 1.16, + "grad_norm": 15.844950675964355, + "learning_rate": 1.2257875580471072e-05, + "loss": 0.9506, + "step": 9254 + }, + { + "epoch": 1.16, + "grad_norm": 32.015132904052734, + "learning_rate": 1.2257038865414384e-05, + "loss": 1.5036, + "step": 9255 + }, + { + "epoch": 1.16, + "grad_norm": 27.190282821655273, + "learning_rate": 1.2256202150357696e-05, + "loss": 2.8504, + "step": 9256 + }, + { + "epoch": 1.16, + "grad_norm": 11.14805793762207, + "learning_rate": 1.225536543530101e-05, + "loss": 0.7194, + "step": 9257 + }, + { + "epoch": 1.16, + "grad_norm": 15.691932678222656, + "learning_rate": 1.225452872024432e-05, + "loss": 1.5595, + "step": 9258 + }, + { + "epoch": 1.16, + "grad_norm": 15.650479316711426, + "learning_rate": 1.2253692005187634e-05, + "loss": 0.4911, + "step": 9259 + }, + { + "epoch": 1.16, + "grad_norm": 4.42494535446167, + "learning_rate": 1.2252855290130947e-05, + "loss": 0.4337, + "step": 9260 + }, + { + "epoch": 1.16, + "grad_norm": 19.028564453125, + "learning_rate": 1.2252018575074261e-05, + "loss": 0.6831, + "step": 9261 + }, + { + "epoch": 1.16, + "grad_norm": 4.2690110206604, + "learning_rate": 1.2251181860017571e-05, + "loss": 0.4737, + "step": 9262 + }, + { + "epoch": 1.16, + "grad_norm": 26.352842330932617, + "learning_rate": 1.2250345144960885e-05, + "loss": 1.4589, + "step": 9263 + }, + { + "epoch": 1.16, + "grad_norm": 31.72899627685547, + "learning_rate": 1.2249508429904199e-05, + "loss": 1.9815, + "step": 9264 + }, + { + "epoch": 1.16, + "grad_norm": 33.27836608886719, + "learning_rate": 1.2248671714847509e-05, + "loss": 2.9694, + "step": 9265 + }, + { + "epoch": 1.16, + "grad_norm": 17.309558868408203, + "learning_rate": 1.2247834999790822e-05, + "loss": 1.6879, + "step": 9266 + }, + { + "epoch": 1.16, + "grad_norm": 8.14188003540039, + "learning_rate": 1.2246998284734134e-05, + "loss": 1.1734, + "step": 9267 + }, + { + "epoch": 1.16, + "grad_norm": 36.28446578979492, + "learning_rate": 1.2246161569677448e-05, + "loss": 2.6889, + "step": 9268 + }, + { + "epoch": 1.16, + "grad_norm": 8.997748374938965, + "learning_rate": 1.224532485462076e-05, + "loss": 1.4469, + "step": 9269 + }, + { + "epoch": 1.16, + "grad_norm": 5.327282428741455, + "learning_rate": 1.2244488139564072e-05, + "loss": 0.3503, + "step": 9270 + }, + { + "epoch": 1.16, + "grad_norm": 11.748549461364746, + "learning_rate": 1.2243651424507386e-05, + "loss": 0.467, + "step": 9271 + }, + { + "epoch": 1.16, + "grad_norm": 46.473114013671875, + "learning_rate": 1.2242814709450696e-05, + "loss": 1.4334, + "step": 9272 + }, + { + "epoch": 1.16, + "grad_norm": 15.982047080993652, + "learning_rate": 1.224197799439401e-05, + "loss": 2.7342, + "step": 9273 + }, + { + "epoch": 1.16, + "grad_norm": 9.6621732711792, + "learning_rate": 1.2241141279337323e-05, + "loss": 1.6504, + "step": 9274 + }, + { + "epoch": 1.16, + "grad_norm": 63.71792984008789, + "learning_rate": 1.2240304564280637e-05, + "loss": 1.4364, + "step": 9275 + }, + { + "epoch": 1.16, + "grad_norm": 9.43286418914795, + "learning_rate": 1.2239467849223947e-05, + "loss": 0.2358, + "step": 9276 + }, + { + "epoch": 1.16, + "grad_norm": 88.21047973632812, + "learning_rate": 1.223863113416726e-05, + "loss": 0.829, + "step": 9277 + }, + { + "epoch": 1.16, + "grad_norm": 11.336211204528809, + "learning_rate": 1.2237794419110574e-05, + "loss": 0.9357, + "step": 9278 + }, + { + "epoch": 1.16, + "grad_norm": 10.616588592529297, + "learning_rate": 1.2236957704053885e-05, + "loss": 0.3243, + "step": 9279 + }, + { + "epoch": 1.16, + "grad_norm": 9.622396469116211, + "learning_rate": 1.2236120988997198e-05, + "loss": 0.981, + "step": 9280 + }, + { + "epoch": 1.16, + "grad_norm": 10.186108589172363, + "learning_rate": 1.223528427394051e-05, + "loss": 1.0945, + "step": 9281 + }, + { + "epoch": 1.16, + "grad_norm": 7.330602169036865, + "learning_rate": 1.2234447558883824e-05, + "loss": 0.5163, + "step": 9282 + }, + { + "epoch": 1.16, + "grad_norm": 7.188134670257568, + "learning_rate": 1.2233610843827136e-05, + "loss": 0.4402, + "step": 9283 + }, + { + "epoch": 1.17, + "grad_norm": 51.132850646972656, + "learning_rate": 1.2232774128770448e-05, + "loss": 3.2306, + "step": 9284 + }, + { + "epoch": 1.17, + "grad_norm": 5.105656623840332, + "learning_rate": 1.2231937413713761e-05, + "loss": 0.5543, + "step": 9285 + }, + { + "epoch": 1.17, + "grad_norm": 17.768465042114258, + "learning_rate": 1.2231100698657072e-05, + "loss": 2.3156, + "step": 9286 + }, + { + "epoch": 1.17, + "grad_norm": 19.271150588989258, + "learning_rate": 1.2230263983600385e-05, + "loss": 0.8452, + "step": 9287 + }, + { + "epoch": 1.17, + "grad_norm": 134.21510314941406, + "learning_rate": 1.2229427268543699e-05, + "loss": 3.0031, + "step": 9288 + }, + { + "epoch": 1.17, + "grad_norm": 12.793252944946289, + "learning_rate": 1.2228590553487013e-05, + "loss": 2.1216, + "step": 9289 + }, + { + "epoch": 1.17, + "grad_norm": 42.16161346435547, + "learning_rate": 1.2227753838430323e-05, + "loss": 0.4732, + "step": 9290 + }, + { + "epoch": 1.17, + "grad_norm": 10.013446807861328, + "learning_rate": 1.2226917123373637e-05, + "loss": 1.3425, + "step": 9291 + }, + { + "epoch": 1.17, + "grad_norm": 24.325328826904297, + "learning_rate": 1.2226080408316949e-05, + "loss": 2.1346, + "step": 9292 + }, + { + "epoch": 1.17, + "grad_norm": 15.697874069213867, + "learning_rate": 1.222524369326026e-05, + "loss": 1.4914, + "step": 9293 + }, + { + "epoch": 1.17, + "grad_norm": 4.737473487854004, + "learning_rate": 1.2224406978203574e-05, + "loss": 1.0438, + "step": 9294 + }, + { + "epoch": 1.17, + "grad_norm": 35.751625061035156, + "learning_rate": 1.2223570263146886e-05, + "loss": 2.2221, + "step": 9295 + }, + { + "epoch": 1.17, + "grad_norm": 120.206787109375, + "learning_rate": 1.22227335480902e-05, + "loss": 3.6724, + "step": 9296 + }, + { + "epoch": 1.17, + "grad_norm": 36.27464294433594, + "learning_rate": 1.222189683303351e-05, + "loss": 1.2908, + "step": 9297 + }, + { + "epoch": 1.17, + "grad_norm": 9.949959754943848, + "learning_rate": 1.2221060117976824e-05, + "loss": 1.3326, + "step": 9298 + }, + { + "epoch": 1.17, + "grad_norm": 15.542511940002441, + "learning_rate": 1.2220223402920137e-05, + "loss": 1.6399, + "step": 9299 + }, + { + "epoch": 1.17, + "grad_norm": 31.601863861083984, + "learning_rate": 1.2219386687863448e-05, + "loss": 1.898, + "step": 9300 + }, + { + "epoch": 1.17, + "grad_norm": 5.258601665496826, + "learning_rate": 1.2218549972806761e-05, + "loss": 0.6207, + "step": 9301 + }, + { + "epoch": 1.17, + "grad_norm": 7.814626693725586, + "learning_rate": 1.2217713257750075e-05, + "loss": 1.6168, + "step": 9302 + }, + { + "epoch": 1.17, + "grad_norm": 16.59807777404785, + "learning_rate": 1.2216876542693388e-05, + "loss": 1.2187, + "step": 9303 + }, + { + "epoch": 1.17, + "grad_norm": 21.79674530029297, + "learning_rate": 1.2216039827636699e-05, + "loss": 2.087, + "step": 9304 + }, + { + "epoch": 1.17, + "grad_norm": 4.025087356567383, + "learning_rate": 1.2215203112580012e-05, + "loss": 0.156, + "step": 9305 + }, + { + "epoch": 1.17, + "grad_norm": 16.624496459960938, + "learning_rate": 1.2214366397523324e-05, + "loss": 1.5409, + "step": 9306 + }, + { + "epoch": 1.17, + "grad_norm": 46.0156135559082, + "learning_rate": 1.2213529682466636e-05, + "loss": 0.845, + "step": 9307 + }, + { + "epoch": 1.17, + "grad_norm": 6.28947114944458, + "learning_rate": 1.221269296740995e-05, + "loss": 1.1145, + "step": 9308 + }, + { + "epoch": 1.17, + "grad_norm": 7.662294864654541, + "learning_rate": 1.2211856252353262e-05, + "loss": 0.4594, + "step": 9309 + }, + { + "epoch": 1.17, + "grad_norm": 8.82614517211914, + "learning_rate": 1.2211019537296576e-05, + "loss": 0.4761, + "step": 9310 + }, + { + "epoch": 1.17, + "grad_norm": 10.320185661315918, + "learning_rate": 1.2210182822239886e-05, + "loss": 1.7961, + "step": 9311 + }, + { + "epoch": 1.17, + "grad_norm": 24.989471435546875, + "learning_rate": 1.22093461071832e-05, + "loss": 1.0871, + "step": 9312 + }, + { + "epoch": 1.17, + "grad_norm": 70.48097229003906, + "learning_rate": 1.2208509392126513e-05, + "loss": 1.5549, + "step": 9313 + }, + { + "epoch": 1.17, + "grad_norm": 9.001704216003418, + "learning_rate": 1.2207672677069823e-05, + "loss": 1.3008, + "step": 9314 + }, + { + "epoch": 1.17, + "grad_norm": 38.65049743652344, + "learning_rate": 1.2206835962013137e-05, + "loss": 1.9781, + "step": 9315 + }, + { + "epoch": 1.17, + "grad_norm": 22.112384796142578, + "learning_rate": 1.220599924695645e-05, + "loss": 0.8403, + "step": 9316 + }, + { + "epoch": 1.17, + "grad_norm": 12.990418434143066, + "learning_rate": 1.2205162531899764e-05, + "loss": 1.2436, + "step": 9317 + }, + { + "epoch": 1.17, + "grad_norm": 23.103830337524414, + "learning_rate": 1.2204325816843075e-05, + "loss": 1.6826, + "step": 9318 + }, + { + "epoch": 1.17, + "grad_norm": 30.30733871459961, + "learning_rate": 1.2203489101786388e-05, + "loss": 2.2576, + "step": 9319 + }, + { + "epoch": 1.17, + "grad_norm": 14.125082015991211, + "learning_rate": 1.22026523867297e-05, + "loss": 2.0127, + "step": 9320 + }, + { + "epoch": 1.17, + "grad_norm": 21.839176177978516, + "learning_rate": 1.2201815671673012e-05, + "loss": 1.1863, + "step": 9321 + }, + { + "epoch": 1.17, + "grad_norm": 8.785724639892578, + "learning_rate": 1.2200978956616326e-05, + "loss": 0.6927, + "step": 9322 + }, + { + "epoch": 1.17, + "grad_norm": 11.992521286010742, + "learning_rate": 1.2200142241559638e-05, + "loss": 1.8727, + "step": 9323 + }, + { + "epoch": 1.17, + "grad_norm": 10.777341842651367, + "learning_rate": 1.2199305526502951e-05, + "loss": 0.4468, + "step": 9324 + }, + { + "epoch": 1.17, + "grad_norm": 9.365225791931152, + "learning_rate": 1.2198468811446262e-05, + "loss": 0.5203, + "step": 9325 + }, + { + "epoch": 1.17, + "grad_norm": 9.874039649963379, + "learning_rate": 1.2197632096389575e-05, + "loss": 1.1379, + "step": 9326 + }, + { + "epoch": 1.17, + "grad_norm": 27.9597225189209, + "learning_rate": 1.2196795381332889e-05, + "loss": 1.5369, + "step": 9327 + }, + { + "epoch": 1.17, + "grad_norm": 16.303752899169922, + "learning_rate": 1.21959586662762e-05, + "loss": 1.3029, + "step": 9328 + }, + { + "epoch": 1.17, + "grad_norm": 14.0924072265625, + "learning_rate": 1.2195121951219513e-05, + "loss": 2.3521, + "step": 9329 + }, + { + "epoch": 1.17, + "grad_norm": 21.292030334472656, + "learning_rate": 1.2194285236162827e-05, + "loss": 1.3894, + "step": 9330 + }, + { + "epoch": 1.17, + "grad_norm": 14.060272216796875, + "learning_rate": 1.2193448521106138e-05, + "loss": 1.4786, + "step": 9331 + }, + { + "epoch": 1.17, + "grad_norm": 19.819843292236328, + "learning_rate": 1.219261180604945e-05, + "loss": 1.2166, + "step": 9332 + }, + { + "epoch": 1.17, + "grad_norm": 3.6164567470550537, + "learning_rate": 1.2191775090992764e-05, + "loss": 0.1673, + "step": 9333 + }, + { + "epoch": 1.17, + "grad_norm": 36.086036682128906, + "learning_rate": 1.2190938375936076e-05, + "loss": 0.7871, + "step": 9334 + }, + { + "epoch": 1.17, + "grad_norm": 34.218055725097656, + "learning_rate": 1.2190101660879388e-05, + "loss": 1.5674, + "step": 9335 + }, + { + "epoch": 1.17, + "grad_norm": 13.336847305297852, + "learning_rate": 1.2189264945822702e-05, + "loss": 0.7522, + "step": 9336 + }, + { + "epoch": 1.17, + "grad_norm": 7.8779616355896, + "learning_rate": 1.2188428230766014e-05, + "loss": 0.9476, + "step": 9337 + }, + { + "epoch": 1.17, + "grad_norm": 16.02096176147461, + "learning_rate": 1.2187591515709327e-05, + "loss": 1.2687, + "step": 9338 + }, + { + "epoch": 1.17, + "grad_norm": 7.479913234710693, + "learning_rate": 1.2186754800652637e-05, + "loss": 0.2904, + "step": 9339 + }, + { + "epoch": 1.17, + "grad_norm": 13.346236228942871, + "learning_rate": 1.2185918085595951e-05, + "loss": 1.0085, + "step": 9340 + }, + { + "epoch": 1.17, + "grad_norm": 20.374624252319336, + "learning_rate": 1.2185081370539265e-05, + "loss": 1.7728, + "step": 9341 + }, + { + "epoch": 1.17, + "grad_norm": 23.75489044189453, + "learning_rate": 1.2184244655482575e-05, + "loss": 0.9994, + "step": 9342 + }, + { + "epoch": 1.17, + "grad_norm": 15.159979820251465, + "learning_rate": 1.2183407940425889e-05, + "loss": 0.9954, + "step": 9343 + }, + { + "epoch": 1.17, + "grad_norm": 18.356822967529297, + "learning_rate": 1.2182571225369202e-05, + "loss": 0.8046, + "step": 9344 + }, + { + "epoch": 1.17, + "grad_norm": 9.173781394958496, + "learning_rate": 1.2181734510312514e-05, + "loss": 1.5786, + "step": 9345 + }, + { + "epoch": 1.17, + "grad_norm": 12.518787384033203, + "learning_rate": 1.2180897795255826e-05, + "loss": 2.3228, + "step": 9346 + }, + { + "epoch": 1.17, + "grad_norm": 5.2768473625183105, + "learning_rate": 1.218006108019914e-05, + "loss": 0.2989, + "step": 9347 + }, + { + "epoch": 1.17, + "grad_norm": 20.644575119018555, + "learning_rate": 1.2179224365142452e-05, + "loss": 1.0783, + "step": 9348 + }, + { + "epoch": 1.17, + "grad_norm": 7.835660934448242, + "learning_rate": 1.2178387650085764e-05, + "loss": 1.0289, + "step": 9349 + }, + { + "epoch": 1.17, + "grad_norm": 15.82724666595459, + "learning_rate": 1.2177550935029076e-05, + "loss": 1.1451, + "step": 9350 + }, + { + "epoch": 1.17, + "grad_norm": 24.29384994506836, + "learning_rate": 1.217671421997239e-05, + "loss": 1.8794, + "step": 9351 + }, + { + "epoch": 1.17, + "grad_norm": 10.641297340393066, + "learning_rate": 1.2175877504915703e-05, + "loss": 2.2985, + "step": 9352 + }, + { + "epoch": 1.17, + "grad_norm": 9.606209754943848, + "learning_rate": 1.2175040789859013e-05, + "loss": 0.808, + "step": 9353 + }, + { + "epoch": 1.17, + "grad_norm": 23.063278198242188, + "learning_rate": 1.2174204074802327e-05, + "loss": 1.5276, + "step": 9354 + }, + { + "epoch": 1.17, + "grad_norm": 13.640690803527832, + "learning_rate": 1.217336735974564e-05, + "loss": 0.839, + "step": 9355 + }, + { + "epoch": 1.17, + "grad_norm": 16.292888641357422, + "learning_rate": 1.2172530644688951e-05, + "loss": 2.0711, + "step": 9356 + }, + { + "epoch": 1.17, + "grad_norm": 15.702163696289062, + "learning_rate": 1.2171693929632265e-05, + "loss": 0.8678, + "step": 9357 + }, + { + "epoch": 1.17, + "grad_norm": 8.12215518951416, + "learning_rate": 1.2170857214575578e-05, + "loss": 0.5314, + "step": 9358 + }, + { + "epoch": 1.17, + "grad_norm": 30.536754608154297, + "learning_rate": 1.217002049951889e-05, + "loss": 3.6003, + "step": 9359 + }, + { + "epoch": 1.17, + "grad_norm": 12.415929794311523, + "learning_rate": 1.2169183784462202e-05, + "loss": 1.9438, + "step": 9360 + }, + { + "epoch": 1.17, + "grad_norm": 76.1987075805664, + "learning_rate": 1.2168347069405516e-05, + "loss": 2.6336, + "step": 9361 + }, + { + "epoch": 1.17, + "grad_norm": 12.82883071899414, + "learning_rate": 1.2167510354348828e-05, + "loss": 2.2039, + "step": 9362 + }, + { + "epoch": 1.18, + "grad_norm": 5.220084190368652, + "learning_rate": 1.216667363929214e-05, + "loss": 0.2783, + "step": 9363 + }, + { + "epoch": 1.18, + "grad_norm": 3.6379334926605225, + "learning_rate": 1.2165836924235452e-05, + "loss": 0.4021, + "step": 9364 + }, + { + "epoch": 1.18, + "grad_norm": 6.193840026855469, + "learning_rate": 1.2165000209178765e-05, + "loss": 1.1253, + "step": 9365 + }, + { + "epoch": 1.18, + "grad_norm": 22.884403228759766, + "learning_rate": 1.2164163494122079e-05, + "loss": 2.1672, + "step": 9366 + }, + { + "epoch": 1.18, + "grad_norm": 22.401716232299805, + "learning_rate": 1.216332677906539e-05, + "loss": 1.6772, + "step": 9367 + }, + { + "epoch": 1.18, + "grad_norm": 27.59799575805664, + "learning_rate": 1.2162490064008703e-05, + "loss": 1.5655, + "step": 9368 + }, + { + "epoch": 1.18, + "grad_norm": 18.193872451782227, + "learning_rate": 1.2161653348952016e-05, + "loss": 1.5153, + "step": 9369 + }, + { + "epoch": 1.18, + "grad_norm": 18.48906135559082, + "learning_rate": 1.2160816633895327e-05, + "loss": 1.3137, + "step": 9370 + }, + { + "epoch": 1.18, + "grad_norm": 8.570785522460938, + "learning_rate": 1.215997991883864e-05, + "loss": 0.4115, + "step": 9371 + }, + { + "epoch": 1.18, + "grad_norm": 29.38730239868164, + "learning_rate": 1.2159143203781954e-05, + "loss": 0.9199, + "step": 9372 + }, + { + "epoch": 1.18, + "grad_norm": 34.051998138427734, + "learning_rate": 1.2158306488725266e-05, + "loss": 1.2682, + "step": 9373 + }, + { + "epoch": 1.18, + "grad_norm": 11.954904556274414, + "learning_rate": 1.2157469773668578e-05, + "loss": 1.2436, + "step": 9374 + }, + { + "epoch": 1.18, + "grad_norm": 16.466739654541016, + "learning_rate": 1.2156633058611892e-05, + "loss": 0.7362, + "step": 9375 + }, + { + "epoch": 1.18, + "grad_norm": 20.73897933959961, + "learning_rate": 1.2155796343555204e-05, + "loss": 1.3595, + "step": 9376 + }, + { + "epoch": 1.18, + "grad_norm": 23.741695404052734, + "learning_rate": 1.2154959628498515e-05, + "loss": 2.058, + "step": 9377 + }, + { + "epoch": 1.18, + "grad_norm": 21.302831649780273, + "learning_rate": 1.2154122913441827e-05, + "loss": 0.7095, + "step": 9378 + }, + { + "epoch": 1.18, + "grad_norm": 6.31294584274292, + "learning_rate": 1.2153286198385141e-05, + "loss": 0.6791, + "step": 9379 + }, + { + "epoch": 1.18, + "grad_norm": 33.14387130737305, + "learning_rate": 1.2152449483328455e-05, + "loss": 1.9076, + "step": 9380 + }, + { + "epoch": 1.18, + "grad_norm": 33.726531982421875, + "learning_rate": 1.2151612768271765e-05, + "loss": 2.0859, + "step": 9381 + }, + { + "epoch": 1.18, + "grad_norm": 19.91851234436035, + "learning_rate": 1.2150776053215079e-05, + "loss": 1.0878, + "step": 9382 + }, + { + "epoch": 1.18, + "grad_norm": 6.069289684295654, + "learning_rate": 1.2149939338158392e-05, + "loss": 0.5945, + "step": 9383 + }, + { + "epoch": 1.18, + "grad_norm": 24.550518035888672, + "learning_rate": 1.2149102623101703e-05, + "loss": 1.9977, + "step": 9384 + }, + { + "epoch": 1.18, + "grad_norm": 23.480838775634766, + "learning_rate": 1.2148265908045016e-05, + "loss": 1.4497, + "step": 9385 + }, + { + "epoch": 1.18, + "grad_norm": 9.521613121032715, + "learning_rate": 1.214742919298833e-05, + "loss": 0.4875, + "step": 9386 + }, + { + "epoch": 1.18, + "grad_norm": 30.82222557067871, + "learning_rate": 1.2146592477931642e-05, + "loss": 1.4363, + "step": 9387 + }, + { + "epoch": 1.18, + "grad_norm": 8.662206649780273, + "learning_rate": 1.2145755762874954e-05, + "loss": 1.2419, + "step": 9388 + }, + { + "epoch": 1.18, + "grad_norm": 5.719494342803955, + "learning_rate": 1.2144919047818267e-05, + "loss": 0.6059, + "step": 9389 + }, + { + "epoch": 1.18, + "grad_norm": 9.777727127075195, + "learning_rate": 1.214408233276158e-05, + "loss": 1.2383, + "step": 9390 + }, + { + "epoch": 1.18, + "grad_norm": 24.78661346435547, + "learning_rate": 1.2143245617704891e-05, + "loss": 1.2971, + "step": 9391 + }, + { + "epoch": 1.18, + "grad_norm": 8.432952880859375, + "learning_rate": 1.2142408902648203e-05, + "loss": 1.4945, + "step": 9392 + }, + { + "epoch": 1.18, + "grad_norm": 86.90373229980469, + "learning_rate": 1.2141572187591517e-05, + "loss": 2.8384, + "step": 9393 + }, + { + "epoch": 1.18, + "grad_norm": 16.666542053222656, + "learning_rate": 1.214073547253483e-05, + "loss": 0.9968, + "step": 9394 + }, + { + "epoch": 1.18, + "grad_norm": 17.281185150146484, + "learning_rate": 1.213989875747814e-05, + "loss": 1.9701, + "step": 9395 + }, + { + "epoch": 1.18, + "grad_norm": 14.964309692382812, + "learning_rate": 1.2139062042421454e-05, + "loss": 1.5485, + "step": 9396 + }, + { + "epoch": 1.18, + "grad_norm": 12.134135246276855, + "learning_rate": 1.2138225327364768e-05, + "loss": 1.7547, + "step": 9397 + }, + { + "epoch": 1.18, + "grad_norm": 48.08967971801758, + "learning_rate": 1.2137388612308078e-05, + "loss": 1.4255, + "step": 9398 + }, + { + "epoch": 1.18, + "grad_norm": 50.927547454833984, + "learning_rate": 1.2136551897251392e-05, + "loss": 2.8541, + "step": 9399 + }, + { + "epoch": 1.18, + "grad_norm": 28.605714797973633, + "learning_rate": 1.2135715182194706e-05, + "loss": 2.351, + "step": 9400 + }, + { + "epoch": 1.18, + "grad_norm": 13.128934860229492, + "learning_rate": 1.2134878467138018e-05, + "loss": 1.8754, + "step": 9401 + }, + { + "epoch": 1.18, + "grad_norm": 7.692806243896484, + "learning_rate": 1.213404175208133e-05, + "loss": 0.3688, + "step": 9402 + }, + { + "epoch": 1.18, + "grad_norm": 5.0777764320373535, + "learning_rate": 1.2133205037024642e-05, + "loss": 0.4806, + "step": 9403 + }, + { + "epoch": 1.18, + "grad_norm": 7.016156196594238, + "learning_rate": 1.2132368321967955e-05, + "loss": 1.9269, + "step": 9404 + }, + { + "epoch": 1.18, + "grad_norm": 11.855449676513672, + "learning_rate": 1.2131531606911267e-05, + "loss": 1.3325, + "step": 9405 + }, + { + "epoch": 1.18, + "grad_norm": 33.3604736328125, + "learning_rate": 1.2130694891854579e-05, + "loss": 1.5821, + "step": 9406 + }, + { + "epoch": 1.18, + "grad_norm": 15.44910717010498, + "learning_rate": 1.2129858176797893e-05, + "loss": 1.9013, + "step": 9407 + }, + { + "epoch": 1.18, + "grad_norm": 14.292071342468262, + "learning_rate": 1.2129021461741206e-05, + "loss": 0.676, + "step": 9408 + }, + { + "epoch": 1.18, + "grad_norm": 17.352420806884766, + "learning_rate": 1.2128184746684517e-05, + "loss": 2.2619, + "step": 9409 + }, + { + "epoch": 1.18, + "grad_norm": 15.946002006530762, + "learning_rate": 1.212734803162783e-05, + "loss": 1.85, + "step": 9410 + }, + { + "epoch": 1.18, + "grad_norm": 20.615232467651367, + "learning_rate": 1.2126511316571144e-05, + "loss": 1.3353, + "step": 9411 + }, + { + "epoch": 1.18, + "grad_norm": 17.23834800720215, + "learning_rate": 1.2125674601514454e-05, + "loss": 1.5281, + "step": 9412 + }, + { + "epoch": 1.18, + "grad_norm": 35.31224822998047, + "learning_rate": 1.2124837886457768e-05, + "loss": 1.5222, + "step": 9413 + }, + { + "epoch": 1.18, + "grad_norm": 13.358646392822266, + "learning_rate": 1.2124001171401082e-05, + "loss": 1.0086, + "step": 9414 + }, + { + "epoch": 1.18, + "grad_norm": 16.674345016479492, + "learning_rate": 1.2123164456344393e-05, + "loss": 1.2741, + "step": 9415 + }, + { + "epoch": 1.18, + "grad_norm": 24.218284606933594, + "learning_rate": 1.2122327741287705e-05, + "loss": 1.7174, + "step": 9416 + }, + { + "epoch": 1.18, + "grad_norm": 16.73403549194336, + "learning_rate": 1.2121491026231017e-05, + "loss": 0.8684, + "step": 9417 + }, + { + "epoch": 1.18, + "grad_norm": 16.773893356323242, + "learning_rate": 1.2120654311174331e-05, + "loss": 2.454, + "step": 9418 + }, + { + "epoch": 1.18, + "grad_norm": 11.357705116271973, + "learning_rate": 1.2119817596117643e-05, + "loss": 1.5169, + "step": 9419 + }, + { + "epoch": 1.18, + "grad_norm": 8.991296768188477, + "learning_rate": 1.2118980881060955e-05, + "loss": 1.1927, + "step": 9420 + }, + { + "epoch": 1.18, + "grad_norm": 33.26502990722656, + "learning_rate": 1.2118144166004269e-05, + "loss": 1.6418, + "step": 9421 + }, + { + "epoch": 1.18, + "grad_norm": 9.168608665466309, + "learning_rate": 1.2117307450947582e-05, + "loss": 1.3493, + "step": 9422 + }, + { + "epoch": 1.18, + "grad_norm": 7.570466041564941, + "learning_rate": 1.2116470735890893e-05, + "loss": 0.6953, + "step": 9423 + }, + { + "epoch": 1.18, + "grad_norm": 14.572734832763672, + "learning_rate": 1.2115634020834206e-05, + "loss": 1.4042, + "step": 9424 + }, + { + "epoch": 1.18, + "grad_norm": 7.8805694580078125, + "learning_rate": 1.211479730577752e-05, + "loss": 0.6732, + "step": 9425 + }, + { + "epoch": 1.18, + "grad_norm": 15.376143455505371, + "learning_rate": 1.211396059072083e-05, + "loss": 1.7732, + "step": 9426 + }, + { + "epoch": 1.18, + "grad_norm": 24.803756713867188, + "learning_rate": 1.2113123875664144e-05, + "loss": 1.6964, + "step": 9427 + }, + { + "epoch": 1.18, + "grad_norm": 8.024435997009277, + "learning_rate": 1.2112287160607457e-05, + "loss": 1.6161, + "step": 9428 + }, + { + "epoch": 1.18, + "grad_norm": 7.506372928619385, + "learning_rate": 1.211145044555077e-05, + "loss": 0.4802, + "step": 9429 + }, + { + "epoch": 1.18, + "grad_norm": 9.939638137817383, + "learning_rate": 1.2110613730494081e-05, + "loss": 1.49, + "step": 9430 + }, + { + "epoch": 1.18, + "grad_norm": 14.319657325744629, + "learning_rate": 1.2109777015437393e-05, + "loss": 3.0862, + "step": 9431 + }, + { + "epoch": 1.18, + "grad_norm": 17.68967628479004, + "learning_rate": 1.2108940300380707e-05, + "loss": 3.3128, + "step": 9432 + }, + { + "epoch": 1.18, + "grad_norm": 67.42790222167969, + "learning_rate": 1.2108103585324019e-05, + "loss": 1.6133, + "step": 9433 + }, + { + "epoch": 1.18, + "grad_norm": 13.851888656616211, + "learning_rate": 1.210726687026733e-05, + "loss": 1.8041, + "step": 9434 + }, + { + "epoch": 1.18, + "grad_norm": 17.65105438232422, + "learning_rate": 1.2106430155210644e-05, + "loss": 1.6052, + "step": 9435 + }, + { + "epoch": 1.18, + "grad_norm": 21.24810791015625, + "learning_rate": 1.2105593440153958e-05, + "loss": 2.0339, + "step": 9436 + }, + { + "epoch": 1.18, + "grad_norm": 17.26531410217285, + "learning_rate": 1.2104756725097268e-05, + "loss": 1.6868, + "step": 9437 + }, + { + "epoch": 1.18, + "grad_norm": 31.289440155029297, + "learning_rate": 1.2103920010040582e-05, + "loss": 2.7377, + "step": 9438 + }, + { + "epoch": 1.18, + "grad_norm": 10.300957679748535, + "learning_rate": 1.2103083294983896e-05, + "loss": 1.3814, + "step": 9439 + }, + { + "epoch": 1.18, + "grad_norm": 9.7875394821167, + "learning_rate": 1.2102246579927206e-05, + "loss": 0.6515, + "step": 9440 + }, + { + "epoch": 1.18, + "grad_norm": 25.404930114746094, + "learning_rate": 1.210140986487052e-05, + "loss": 0.9794, + "step": 9441 + }, + { + "epoch": 1.18, + "grad_norm": 44.835391998291016, + "learning_rate": 1.2100573149813832e-05, + "loss": 2.3566, + "step": 9442 + }, + { + "epoch": 1.19, + "grad_norm": 23.619457244873047, + "learning_rate": 1.2099736434757145e-05, + "loss": 1.4021, + "step": 9443 + }, + { + "epoch": 1.19, + "grad_norm": 11.631579399108887, + "learning_rate": 1.2098899719700457e-05, + "loss": 0.956, + "step": 9444 + }, + { + "epoch": 1.19, + "grad_norm": 11.371979713439941, + "learning_rate": 1.2098063004643769e-05, + "loss": 1.1925, + "step": 9445 + }, + { + "epoch": 1.19, + "grad_norm": 9.25599479675293, + "learning_rate": 1.2097226289587083e-05, + "loss": 0.718, + "step": 9446 + }, + { + "epoch": 1.19, + "grad_norm": 16.738479614257812, + "learning_rate": 1.2096389574530395e-05, + "loss": 1.8042, + "step": 9447 + }, + { + "epoch": 1.19, + "grad_norm": 5.86751651763916, + "learning_rate": 1.2095552859473707e-05, + "loss": 0.4073, + "step": 9448 + }, + { + "epoch": 1.19, + "grad_norm": 8.155218124389648, + "learning_rate": 1.209471614441702e-05, + "loss": 0.9582, + "step": 9449 + }, + { + "epoch": 1.19, + "grad_norm": 10.982494354248047, + "learning_rate": 1.2093879429360334e-05, + "loss": 0.7085, + "step": 9450 + }, + { + "epoch": 1.19, + "grad_norm": 11.960037231445312, + "learning_rate": 1.2093042714303644e-05, + "loss": 0.6946, + "step": 9451 + }, + { + "epoch": 1.19, + "grad_norm": 13.039807319641113, + "learning_rate": 1.2092205999246958e-05, + "loss": 2.2661, + "step": 9452 + }, + { + "epoch": 1.19, + "grad_norm": 19.76038932800293, + "learning_rate": 1.2091369284190271e-05, + "loss": 1.2509, + "step": 9453 + }, + { + "epoch": 1.19, + "grad_norm": 41.41776657104492, + "learning_rate": 1.2090532569133582e-05, + "loss": 2.9388, + "step": 9454 + }, + { + "epoch": 1.19, + "grad_norm": 29.097850799560547, + "learning_rate": 1.2089695854076895e-05, + "loss": 1.1569, + "step": 9455 + }, + { + "epoch": 1.19, + "grad_norm": 24.557373046875, + "learning_rate": 1.2088859139020207e-05, + "loss": 2.6758, + "step": 9456 + }, + { + "epoch": 1.19, + "grad_norm": 17.72818946838379, + "learning_rate": 1.208802242396352e-05, + "loss": 1.5893, + "step": 9457 + }, + { + "epoch": 1.19, + "grad_norm": 10.774362564086914, + "learning_rate": 1.2087185708906833e-05, + "loss": 0.3926, + "step": 9458 + }, + { + "epoch": 1.19, + "grad_norm": 27.049278259277344, + "learning_rate": 1.2086348993850145e-05, + "loss": 2.1748, + "step": 9459 + }, + { + "epoch": 1.19, + "grad_norm": 2.9520184993743896, + "learning_rate": 1.2085512278793459e-05, + "loss": 0.1981, + "step": 9460 + }, + { + "epoch": 1.19, + "grad_norm": 13.119494438171387, + "learning_rate": 1.2084675563736769e-05, + "loss": 0.9479, + "step": 9461 + }, + { + "epoch": 1.19, + "grad_norm": 62.712379455566406, + "learning_rate": 1.2083838848680082e-05, + "loss": 1.5795, + "step": 9462 + }, + { + "epoch": 1.19, + "grad_norm": 21.442840576171875, + "learning_rate": 1.2083002133623396e-05, + "loss": 0.9436, + "step": 9463 + }, + { + "epoch": 1.19, + "grad_norm": 16.34206771850586, + "learning_rate": 1.2082165418566706e-05, + "loss": 1.7083, + "step": 9464 + }, + { + "epoch": 1.19, + "grad_norm": 2.9206340312957764, + "learning_rate": 1.208132870351002e-05, + "loss": 0.1217, + "step": 9465 + }, + { + "epoch": 1.19, + "grad_norm": 12.184715270996094, + "learning_rate": 1.2080491988453334e-05, + "loss": 1.1073, + "step": 9466 + }, + { + "epoch": 1.19, + "grad_norm": 14.369974136352539, + "learning_rate": 1.2079655273396647e-05, + "loss": 2.6136, + "step": 9467 + }, + { + "epoch": 1.19, + "grad_norm": 11.854850769042969, + "learning_rate": 1.2078818558339958e-05, + "loss": 2.6624, + "step": 9468 + }, + { + "epoch": 1.19, + "grad_norm": 13.329398155212402, + "learning_rate": 1.2077981843283271e-05, + "loss": 1.7601, + "step": 9469 + }, + { + "epoch": 1.19, + "grad_norm": 15.483675956726074, + "learning_rate": 1.2077145128226583e-05, + "loss": 1.6243, + "step": 9470 + }, + { + "epoch": 1.19, + "grad_norm": 9.290410995483398, + "learning_rate": 1.2076308413169895e-05, + "loss": 1.1646, + "step": 9471 + }, + { + "epoch": 1.19, + "grad_norm": 14.221891403198242, + "learning_rate": 1.2075471698113209e-05, + "loss": 1.1798, + "step": 9472 + }, + { + "epoch": 1.19, + "grad_norm": 13.933663368225098, + "learning_rate": 1.207463498305652e-05, + "loss": 1.2359, + "step": 9473 + }, + { + "epoch": 1.19, + "grad_norm": 64.45641326904297, + "learning_rate": 1.2073798267999834e-05, + "loss": 2.1569, + "step": 9474 + }, + { + "epoch": 1.19, + "grad_norm": 8.408763885498047, + "learning_rate": 1.2072961552943145e-05, + "loss": 1.7103, + "step": 9475 + }, + { + "epoch": 1.19, + "grad_norm": 12.310800552368164, + "learning_rate": 1.2072124837886458e-05, + "loss": 1.4251, + "step": 9476 + }, + { + "epoch": 1.19, + "grad_norm": 12.391608238220215, + "learning_rate": 1.2071288122829772e-05, + "loss": 1.4087, + "step": 9477 + }, + { + "epoch": 1.19, + "grad_norm": 10.71947193145752, + "learning_rate": 1.2070451407773082e-05, + "loss": 2.111, + "step": 9478 + }, + { + "epoch": 1.19, + "grad_norm": 14.64475154876709, + "learning_rate": 1.2069614692716396e-05, + "loss": 1.4027, + "step": 9479 + }, + { + "epoch": 1.19, + "grad_norm": 24.743968963623047, + "learning_rate": 1.206877797765971e-05, + "loss": 2.4821, + "step": 9480 + }, + { + "epoch": 1.19, + "grad_norm": 26.19554901123047, + "learning_rate": 1.2067941262603023e-05, + "loss": 1.9297, + "step": 9481 + }, + { + "epoch": 1.19, + "grad_norm": 9.869488716125488, + "learning_rate": 1.2067104547546333e-05, + "loss": 0.6276, + "step": 9482 + }, + { + "epoch": 1.19, + "grad_norm": 9.796403884887695, + "learning_rate": 1.2066267832489647e-05, + "loss": 1.121, + "step": 9483 + }, + { + "epoch": 1.19, + "grad_norm": 16.084341049194336, + "learning_rate": 1.2065431117432959e-05, + "loss": 1.5531, + "step": 9484 + }, + { + "epoch": 1.19, + "grad_norm": 16.72888946533203, + "learning_rate": 1.2064594402376271e-05, + "loss": 1.2476, + "step": 9485 + }, + { + "epoch": 1.19, + "grad_norm": 16.48817253112793, + "learning_rate": 1.2063757687319585e-05, + "loss": 2.8725, + "step": 9486 + }, + { + "epoch": 1.19, + "grad_norm": 16.455909729003906, + "learning_rate": 1.2062920972262897e-05, + "loss": 2.6381, + "step": 9487 + }, + { + "epoch": 1.19, + "grad_norm": 12.815619468688965, + "learning_rate": 1.206208425720621e-05, + "loss": 0.9785, + "step": 9488 + }, + { + "epoch": 1.19, + "grad_norm": 6.762529373168945, + "learning_rate": 1.206124754214952e-05, + "loss": 0.1354, + "step": 9489 + }, + { + "epoch": 1.19, + "grad_norm": 30.014179229736328, + "learning_rate": 1.2060410827092834e-05, + "loss": 1.2246, + "step": 9490 + }, + { + "epoch": 1.19, + "grad_norm": 19.356752395629883, + "learning_rate": 1.2059574112036148e-05, + "loss": 0.9066, + "step": 9491 + }, + { + "epoch": 1.19, + "grad_norm": 8.490389823913574, + "learning_rate": 1.2058737396979458e-05, + "loss": 1.1231, + "step": 9492 + }, + { + "epoch": 1.19, + "grad_norm": 17.957721710205078, + "learning_rate": 1.2057900681922772e-05, + "loss": 1.0549, + "step": 9493 + }, + { + "epoch": 1.19, + "grad_norm": 10.60168170928955, + "learning_rate": 1.2057063966866085e-05, + "loss": 1.1987, + "step": 9494 + }, + { + "epoch": 1.19, + "grad_norm": 6.99045467376709, + "learning_rate": 1.2056227251809397e-05, + "loss": 0.9482, + "step": 9495 + }, + { + "epoch": 1.19, + "grad_norm": 15.194043159484863, + "learning_rate": 1.205539053675271e-05, + "loss": 2.5112, + "step": 9496 + }, + { + "epoch": 1.19, + "grad_norm": 24.683975219726562, + "learning_rate": 1.2054553821696023e-05, + "loss": 2.1607, + "step": 9497 + }, + { + "epoch": 1.19, + "grad_norm": 9.09386157989502, + "learning_rate": 1.2053717106639335e-05, + "loss": 0.9821, + "step": 9498 + }, + { + "epoch": 1.19, + "grad_norm": 11.077011108398438, + "learning_rate": 1.2052880391582647e-05, + "loss": 1.0397, + "step": 9499 + }, + { + "epoch": 1.19, + "grad_norm": 15.114931106567383, + "learning_rate": 1.205204367652596e-05, + "loss": 1.6384, + "step": 9500 + }, + { + "epoch": 1.19, + "grad_norm": 22.820463180541992, + "learning_rate": 1.2051206961469272e-05, + "loss": 1.9424, + "step": 9501 + }, + { + "epoch": 1.19, + "grad_norm": 8.954093933105469, + "learning_rate": 1.2050370246412586e-05, + "loss": 1.0824, + "step": 9502 + }, + { + "epoch": 1.19, + "grad_norm": 22.414491653442383, + "learning_rate": 1.2049533531355896e-05, + "loss": 1.0721, + "step": 9503 + }, + { + "epoch": 1.19, + "grad_norm": 6.525824069976807, + "learning_rate": 1.204869681629921e-05, + "loss": 0.3603, + "step": 9504 + }, + { + "epoch": 1.19, + "grad_norm": 30.8877010345459, + "learning_rate": 1.2047860101242524e-05, + "loss": 2.2824, + "step": 9505 + }, + { + "epoch": 1.19, + "grad_norm": 24.046707153320312, + "learning_rate": 1.2047023386185834e-05, + "loss": 0.8705, + "step": 9506 + }, + { + "epoch": 1.19, + "grad_norm": 9.529458045959473, + "learning_rate": 1.2046186671129148e-05, + "loss": 0.9239, + "step": 9507 + }, + { + "epoch": 1.19, + "grad_norm": 32.48001480102539, + "learning_rate": 1.2045349956072461e-05, + "loss": 2.0995, + "step": 9508 + }, + { + "epoch": 1.19, + "grad_norm": 16.195293426513672, + "learning_rate": 1.2044513241015773e-05, + "loss": 1.5304, + "step": 9509 + }, + { + "epoch": 1.19, + "grad_norm": 10.123017311096191, + "learning_rate": 1.2043676525959085e-05, + "loss": 0.7504, + "step": 9510 + }, + { + "epoch": 1.19, + "grad_norm": 24.254032135009766, + "learning_rate": 1.2042839810902399e-05, + "loss": 2.4331, + "step": 9511 + }, + { + "epoch": 1.19, + "grad_norm": 16.45452117919922, + "learning_rate": 1.204200309584571e-05, + "loss": 2.5155, + "step": 9512 + }, + { + "epoch": 1.19, + "grad_norm": 21.005456924438477, + "learning_rate": 1.2041166380789023e-05, + "loss": 1.4441, + "step": 9513 + }, + { + "epoch": 1.19, + "grad_norm": 20.228805541992188, + "learning_rate": 1.2040329665732335e-05, + "loss": 0.5575, + "step": 9514 + }, + { + "epoch": 1.19, + "grad_norm": 64.70474243164062, + "learning_rate": 1.2039492950675648e-05, + "loss": 2.7917, + "step": 9515 + }, + { + "epoch": 1.19, + "grad_norm": 38.86469650268555, + "learning_rate": 1.2038656235618962e-05, + "loss": 2.3465, + "step": 9516 + }, + { + "epoch": 1.19, + "grad_norm": 9.966197967529297, + "learning_rate": 1.2037819520562272e-05, + "loss": 0.9642, + "step": 9517 + }, + { + "epoch": 1.19, + "grad_norm": 18.563018798828125, + "learning_rate": 1.2036982805505586e-05, + "loss": 2.0094, + "step": 9518 + }, + { + "epoch": 1.19, + "grad_norm": 17.217992782592773, + "learning_rate": 1.20361460904489e-05, + "loss": 2.0467, + "step": 9519 + }, + { + "epoch": 1.19, + "grad_norm": 11.091466903686523, + "learning_rate": 1.203530937539221e-05, + "loss": 2.0162, + "step": 9520 + }, + { + "epoch": 1.19, + "grad_norm": 35.912391662597656, + "learning_rate": 1.2034472660335523e-05, + "loss": 2.7718, + "step": 9521 + }, + { + "epoch": 1.19, + "grad_norm": 7.927547454833984, + "learning_rate": 1.2033635945278837e-05, + "loss": 0.4048, + "step": 9522 + }, + { + "epoch": 1.2, + "grad_norm": 9.039243698120117, + "learning_rate": 1.2032799230222149e-05, + "loss": 1.4761, + "step": 9523 + }, + { + "epoch": 1.2, + "grad_norm": 12.982276916503906, + "learning_rate": 1.2031962515165461e-05, + "loss": 1.8997, + "step": 9524 + }, + { + "epoch": 1.2, + "grad_norm": 10.009096145629883, + "learning_rate": 1.2031125800108775e-05, + "loss": 1.811, + "step": 9525 + }, + { + "epoch": 1.2, + "grad_norm": 12.981189727783203, + "learning_rate": 1.2030289085052087e-05, + "loss": 2.2421, + "step": 9526 + }, + { + "epoch": 1.2, + "grad_norm": 7.837049961090088, + "learning_rate": 1.2029452369995398e-05, + "loss": 1.7047, + "step": 9527 + }, + { + "epoch": 1.2, + "grad_norm": 5.705050468444824, + "learning_rate": 1.202861565493871e-05, + "loss": 0.3916, + "step": 9528 + }, + { + "epoch": 1.2, + "grad_norm": 18.591602325439453, + "learning_rate": 1.2027778939882024e-05, + "loss": 0.877, + "step": 9529 + }, + { + "epoch": 1.2, + "grad_norm": 11.062577247619629, + "learning_rate": 1.2026942224825338e-05, + "loss": 0.8878, + "step": 9530 + }, + { + "epoch": 1.2, + "grad_norm": 18.63389015197754, + "learning_rate": 1.2026105509768648e-05, + "loss": 1.3153, + "step": 9531 + }, + { + "epoch": 1.2, + "grad_norm": 9.728708267211914, + "learning_rate": 1.2025268794711962e-05, + "loss": 2.1566, + "step": 9532 + }, + { + "epoch": 1.2, + "grad_norm": 3.1477296352386475, + "learning_rate": 1.2024432079655275e-05, + "loss": 0.1559, + "step": 9533 + }, + { + "epoch": 1.2, + "grad_norm": 5.642887592315674, + "learning_rate": 1.2023595364598586e-05, + "loss": 0.4317, + "step": 9534 + }, + { + "epoch": 1.2, + "grad_norm": 17.34537124633789, + "learning_rate": 1.20227586495419e-05, + "loss": 1.5061, + "step": 9535 + }, + { + "epoch": 1.2, + "grad_norm": 14.58328914642334, + "learning_rate": 1.2021921934485213e-05, + "loss": 1.1147, + "step": 9536 + }, + { + "epoch": 1.2, + "grad_norm": 11.474074363708496, + "learning_rate": 1.2021085219428525e-05, + "loss": 0.676, + "step": 9537 + }, + { + "epoch": 1.2, + "grad_norm": 26.583112716674805, + "learning_rate": 1.2020248504371837e-05, + "loss": 2.1511, + "step": 9538 + }, + { + "epoch": 1.2, + "grad_norm": 15.964679718017578, + "learning_rate": 1.201941178931515e-05, + "loss": 0.8863, + "step": 9539 + }, + { + "epoch": 1.2, + "grad_norm": 22.612957000732422, + "learning_rate": 1.2018575074258462e-05, + "loss": 3.0547, + "step": 9540 + }, + { + "epoch": 1.2, + "grad_norm": 8.257503509521484, + "learning_rate": 1.2017738359201774e-05, + "loss": 0.7493, + "step": 9541 + }, + { + "epoch": 1.2, + "grad_norm": 63.64929962158203, + "learning_rate": 1.2016901644145086e-05, + "loss": 0.9087, + "step": 9542 + }, + { + "epoch": 1.2, + "grad_norm": 12.743460655212402, + "learning_rate": 1.20160649290884e-05, + "loss": 1.8593, + "step": 9543 + }, + { + "epoch": 1.2, + "grad_norm": 16.666603088378906, + "learning_rate": 1.2015228214031714e-05, + "loss": 1.6155, + "step": 9544 + }, + { + "epoch": 1.2, + "grad_norm": 13.350123405456543, + "learning_rate": 1.2014391498975024e-05, + "loss": 2.1969, + "step": 9545 + }, + { + "epoch": 1.2, + "grad_norm": 4.722788333892822, + "learning_rate": 1.2013554783918337e-05, + "loss": 0.7194, + "step": 9546 + }, + { + "epoch": 1.2, + "grad_norm": 18.07695198059082, + "learning_rate": 1.2012718068861651e-05, + "loss": 0.9043, + "step": 9547 + }, + { + "epoch": 1.2, + "grad_norm": 8.8741455078125, + "learning_rate": 1.2011881353804961e-05, + "loss": 1.1142, + "step": 9548 + }, + { + "epoch": 1.2, + "grad_norm": 27.6228084564209, + "learning_rate": 1.2011044638748275e-05, + "loss": 2.4636, + "step": 9549 + }, + { + "epoch": 1.2, + "grad_norm": 21.675600051879883, + "learning_rate": 1.2010207923691589e-05, + "loss": 1.4064, + "step": 9550 + }, + { + "epoch": 1.2, + "grad_norm": 42.42505645751953, + "learning_rate": 1.20093712086349e-05, + "loss": 1.4687, + "step": 9551 + }, + { + "epoch": 1.2, + "grad_norm": 24.894182205200195, + "learning_rate": 1.2008534493578213e-05, + "loss": 0.8802, + "step": 9552 + }, + { + "epoch": 1.2, + "grad_norm": 6.725581169128418, + "learning_rate": 1.2007697778521525e-05, + "loss": 0.6847, + "step": 9553 + }, + { + "epoch": 1.2, + "grad_norm": 18.131277084350586, + "learning_rate": 1.2006861063464838e-05, + "loss": 0.911, + "step": 9554 + }, + { + "epoch": 1.2, + "grad_norm": 4.308471202850342, + "learning_rate": 1.200602434840815e-05, + "loss": 0.1425, + "step": 9555 + }, + { + "epoch": 1.2, + "grad_norm": 8.859646797180176, + "learning_rate": 1.2005187633351462e-05, + "loss": 1.3277, + "step": 9556 + }, + { + "epoch": 1.2, + "grad_norm": 12.082441329956055, + "learning_rate": 1.2004350918294776e-05, + "loss": 2.6566, + "step": 9557 + }, + { + "epoch": 1.2, + "grad_norm": 14.841496467590332, + "learning_rate": 1.200351420323809e-05, + "loss": 2.0174, + "step": 9558 + }, + { + "epoch": 1.2, + "grad_norm": 9.623128890991211, + "learning_rate": 1.20026774881814e-05, + "loss": 0.8134, + "step": 9559 + }, + { + "epoch": 1.2, + "grad_norm": 16.343841552734375, + "learning_rate": 1.2001840773124713e-05, + "loss": 1.8796, + "step": 9560 + }, + { + "epoch": 1.2, + "grad_norm": 9.812308311462402, + "learning_rate": 1.2001004058068027e-05, + "loss": 0.5763, + "step": 9561 + }, + { + "epoch": 1.2, + "grad_norm": 8.021820068359375, + "learning_rate": 1.2000167343011337e-05, + "loss": 0.3513, + "step": 9562 + }, + { + "epoch": 1.2, + "grad_norm": 8.692317008972168, + "learning_rate": 1.1999330627954651e-05, + "loss": 0.9401, + "step": 9563 + }, + { + "epoch": 1.2, + "grad_norm": 5.400107383728027, + "learning_rate": 1.1998493912897965e-05, + "loss": 0.2327, + "step": 9564 + }, + { + "epoch": 1.2, + "grad_norm": 25.09235382080078, + "learning_rate": 1.1997657197841276e-05, + "loss": 2.2308, + "step": 9565 + }, + { + "epoch": 1.2, + "grad_norm": 9.318791389465332, + "learning_rate": 1.1996820482784588e-05, + "loss": 0.8867, + "step": 9566 + }, + { + "epoch": 1.2, + "grad_norm": 21.22977066040039, + "learning_rate": 1.19959837677279e-05, + "loss": 1.4477, + "step": 9567 + }, + { + "epoch": 1.2, + "grad_norm": 12.80361270904541, + "learning_rate": 1.1995147052671214e-05, + "loss": 0.7557, + "step": 9568 + }, + { + "epoch": 1.2, + "grad_norm": 20.52949333190918, + "learning_rate": 1.1994310337614526e-05, + "loss": 1.7729, + "step": 9569 + }, + { + "epoch": 1.2, + "grad_norm": 12.310013771057129, + "learning_rate": 1.1993473622557838e-05, + "loss": 0.2901, + "step": 9570 + }, + { + "epoch": 1.2, + "grad_norm": 27.86255645751953, + "learning_rate": 1.1992636907501152e-05, + "loss": 0.8871, + "step": 9571 + }, + { + "epoch": 1.2, + "grad_norm": 11.100020408630371, + "learning_rate": 1.1991800192444465e-05, + "loss": 0.7504, + "step": 9572 + }, + { + "epoch": 1.2, + "grad_norm": 38.98814010620117, + "learning_rate": 1.1990963477387776e-05, + "loss": 1.8995, + "step": 9573 + }, + { + "epoch": 1.2, + "grad_norm": 7.79847526550293, + "learning_rate": 1.199012676233109e-05, + "loss": 0.8019, + "step": 9574 + }, + { + "epoch": 1.2, + "grad_norm": 14.081271171569824, + "learning_rate": 1.1989290047274403e-05, + "loss": 0.744, + "step": 9575 + }, + { + "epoch": 1.2, + "grad_norm": 64.49124145507812, + "learning_rate": 1.1988453332217713e-05, + "loss": 1.0467, + "step": 9576 + }, + { + "epoch": 1.2, + "grad_norm": 5.7524309158325195, + "learning_rate": 1.1987616617161027e-05, + "loss": 1.1582, + "step": 9577 + }, + { + "epoch": 1.2, + "grad_norm": 7.193423271179199, + "learning_rate": 1.198677990210434e-05, + "loss": 1.817, + "step": 9578 + }, + { + "epoch": 1.2, + "grad_norm": 10.854019165039062, + "learning_rate": 1.1985943187047652e-05, + "loss": 1.3564, + "step": 9579 + }, + { + "epoch": 1.2, + "grad_norm": 8.136621475219727, + "learning_rate": 1.1985106471990964e-05, + "loss": 1.3315, + "step": 9580 + }, + { + "epoch": 1.2, + "grad_norm": 21.077999114990234, + "learning_rate": 1.1984269756934276e-05, + "loss": 1.4171, + "step": 9581 + }, + { + "epoch": 1.2, + "grad_norm": 12.653505325317383, + "learning_rate": 1.198343304187759e-05, + "loss": 0.7738, + "step": 9582 + }, + { + "epoch": 1.2, + "grad_norm": 17.65804100036621, + "learning_rate": 1.1982596326820902e-05, + "loss": 0.7794, + "step": 9583 + }, + { + "epoch": 1.2, + "grad_norm": 20.522994995117188, + "learning_rate": 1.1981759611764214e-05, + "loss": 4.257, + "step": 9584 + }, + { + "epoch": 1.2, + "grad_norm": 42.09373092651367, + "learning_rate": 1.1980922896707527e-05, + "loss": 2.0501, + "step": 9585 + }, + { + "epoch": 1.2, + "grad_norm": 10.918661117553711, + "learning_rate": 1.1980086181650841e-05, + "loss": 1.1037, + "step": 9586 + }, + { + "epoch": 1.2, + "grad_norm": 6.394575595855713, + "learning_rate": 1.1979249466594151e-05, + "loss": 0.4025, + "step": 9587 + }, + { + "epoch": 1.2, + "grad_norm": 42.787078857421875, + "learning_rate": 1.1978412751537465e-05, + "loss": 2.5623, + "step": 9588 + }, + { + "epoch": 1.2, + "grad_norm": 21.253686904907227, + "learning_rate": 1.1977576036480779e-05, + "loss": 1.4255, + "step": 9589 + }, + { + "epoch": 1.2, + "grad_norm": 18.87506866455078, + "learning_rate": 1.1976739321424089e-05, + "loss": 1.337, + "step": 9590 + }, + { + "epoch": 1.2, + "grad_norm": 3.737640380859375, + "learning_rate": 1.1975902606367403e-05, + "loss": 0.3225, + "step": 9591 + }, + { + "epoch": 1.2, + "grad_norm": 32.349571228027344, + "learning_rate": 1.1975065891310716e-05, + "loss": 2.7442, + "step": 9592 + }, + { + "epoch": 1.2, + "grad_norm": 67.66178894042969, + "learning_rate": 1.1974229176254028e-05, + "loss": 1.4558, + "step": 9593 + }, + { + "epoch": 1.2, + "grad_norm": 75.0477066040039, + "learning_rate": 1.197339246119734e-05, + "loss": 1.3897, + "step": 9594 + }, + { + "epoch": 1.2, + "grad_norm": 15.68415641784668, + "learning_rate": 1.1972555746140652e-05, + "loss": 1.2878, + "step": 9595 + }, + { + "epoch": 1.2, + "grad_norm": 17.703025817871094, + "learning_rate": 1.1971719031083966e-05, + "loss": 0.7005, + "step": 9596 + }, + { + "epoch": 1.2, + "grad_norm": 15.474891662597656, + "learning_rate": 1.1970882316027278e-05, + "loss": 1.1582, + "step": 9597 + }, + { + "epoch": 1.2, + "grad_norm": 40.227596282958984, + "learning_rate": 1.197004560097059e-05, + "loss": 2.1653, + "step": 9598 + }, + { + "epoch": 1.2, + "grad_norm": 12.242630958557129, + "learning_rate": 1.1969208885913903e-05, + "loss": 0.6583, + "step": 9599 + }, + { + "epoch": 1.2, + "grad_norm": 10.278730392456055, + "learning_rate": 1.1968372170857217e-05, + "loss": 0.7933, + "step": 9600 + }, + { + "epoch": 1.2, + "eval_loss": 0.08709228038787842, + "eval_runtime": 94.6545, + "eval_samples_per_second": 37.42, + "eval_steps_per_second": 37.42, + "step": 9600 + }, + { + "epoch": 1.2, + "grad_norm": 10.507102966308594, + "learning_rate": 1.1967535455800527e-05, + "loss": 1.25, + "step": 9601 + }, + { + "epoch": 1.21, + "grad_norm": 44.48877716064453, + "learning_rate": 1.196669874074384e-05, + "loss": 1.0483, + "step": 9602 + }, + { + "epoch": 1.21, + "grad_norm": 9.914356231689453, + "learning_rate": 1.1965862025687154e-05, + "loss": 1.2343, + "step": 9603 + }, + { + "epoch": 1.21, + "grad_norm": 14.555281639099121, + "learning_rate": 1.1965025310630465e-05, + "loss": 2.6171, + "step": 9604 + }, + { + "epoch": 1.21, + "grad_norm": 40.05111312866211, + "learning_rate": 1.1964188595573778e-05, + "loss": 1.6383, + "step": 9605 + }, + { + "epoch": 1.21, + "grad_norm": 13.92563533782959, + "learning_rate": 1.196335188051709e-05, + "loss": 0.8991, + "step": 9606 + }, + { + "epoch": 1.21, + "grad_norm": 6.110177516937256, + "learning_rate": 1.1962515165460404e-05, + "loss": 0.4676, + "step": 9607 + }, + { + "epoch": 1.21, + "grad_norm": 7.3621063232421875, + "learning_rate": 1.1961678450403716e-05, + "loss": 0.866, + "step": 9608 + }, + { + "epoch": 1.21, + "grad_norm": 27.311689376831055, + "learning_rate": 1.1960841735347028e-05, + "loss": 0.966, + "step": 9609 + }, + { + "epoch": 1.21, + "grad_norm": 21.225568771362305, + "learning_rate": 1.1960005020290342e-05, + "loss": 0.9207, + "step": 9610 + }, + { + "epoch": 1.21, + "grad_norm": 5.978994369506836, + "learning_rate": 1.1959168305233654e-05, + "loss": 1.9674, + "step": 9611 + }, + { + "epoch": 1.21, + "grad_norm": 10.083782196044922, + "learning_rate": 1.1958331590176965e-05, + "loss": 1.1243, + "step": 9612 + }, + { + "epoch": 1.21, + "grad_norm": 16.697175979614258, + "learning_rate": 1.1957494875120279e-05, + "loss": 1.5605, + "step": 9613 + }, + { + "epoch": 1.21, + "grad_norm": 10.813374519348145, + "learning_rate": 1.1956658160063593e-05, + "loss": 1.3633, + "step": 9614 + }, + { + "epoch": 1.21, + "grad_norm": 19.021514892578125, + "learning_rate": 1.1955821445006903e-05, + "loss": 1.9111, + "step": 9615 + }, + { + "epoch": 1.21, + "grad_norm": 17.797969818115234, + "learning_rate": 1.1954984729950217e-05, + "loss": 1.2812, + "step": 9616 + }, + { + "epoch": 1.21, + "grad_norm": 11.785213470458984, + "learning_rate": 1.195414801489353e-05, + "loss": 1.0591, + "step": 9617 + }, + { + "epoch": 1.21, + "grad_norm": 7.536766052246094, + "learning_rate": 1.195331129983684e-05, + "loss": 0.3705, + "step": 9618 + }, + { + "epoch": 1.21, + "grad_norm": 7.87598180770874, + "learning_rate": 1.1952474584780154e-05, + "loss": 0.8564, + "step": 9619 + }, + { + "epoch": 1.21, + "grad_norm": 5.448538780212402, + "learning_rate": 1.1951637869723466e-05, + "loss": 0.4751, + "step": 9620 + }, + { + "epoch": 1.21, + "grad_norm": 4.328154563903809, + "learning_rate": 1.195080115466678e-05, + "loss": 0.3251, + "step": 9621 + }, + { + "epoch": 1.21, + "grad_norm": 10.848305702209473, + "learning_rate": 1.1949964439610092e-05, + "loss": 0.9825, + "step": 9622 + }, + { + "epoch": 1.21, + "grad_norm": 33.84737777709961, + "learning_rate": 1.1949127724553404e-05, + "loss": 1.1589, + "step": 9623 + }, + { + "epoch": 1.21, + "grad_norm": 54.30733871459961, + "learning_rate": 1.1948291009496717e-05, + "loss": 1.9604, + "step": 9624 + }, + { + "epoch": 1.21, + "grad_norm": 13.447796821594238, + "learning_rate": 1.1947454294440028e-05, + "loss": 0.7447, + "step": 9625 + }, + { + "epoch": 1.21, + "grad_norm": 46.959598541259766, + "learning_rate": 1.1946617579383341e-05, + "loss": 1.8551, + "step": 9626 + }, + { + "epoch": 1.21, + "grad_norm": 23.30669403076172, + "learning_rate": 1.1945780864326655e-05, + "loss": 2.3335, + "step": 9627 + }, + { + "epoch": 1.21, + "grad_norm": 7.455014228820801, + "learning_rate": 1.1944944149269969e-05, + "loss": 1.0215, + "step": 9628 + }, + { + "epoch": 1.21, + "grad_norm": 35.093746185302734, + "learning_rate": 1.1944107434213279e-05, + "loss": 1.5814, + "step": 9629 + }, + { + "epoch": 1.21, + "grad_norm": 7.487492084503174, + "learning_rate": 1.1943270719156593e-05, + "loss": 1.1403, + "step": 9630 + }, + { + "epoch": 1.21, + "grad_norm": 87.34281158447266, + "learning_rate": 1.1942434004099906e-05, + "loss": 2.2956, + "step": 9631 + }, + { + "epoch": 1.21, + "grad_norm": 9.11362361907959, + "learning_rate": 1.1941597289043216e-05, + "loss": 0.8279, + "step": 9632 + }, + { + "epoch": 1.21, + "grad_norm": 7.813751697540283, + "learning_rate": 1.194076057398653e-05, + "loss": 1.2104, + "step": 9633 + }, + { + "epoch": 1.21, + "grad_norm": 18.270851135253906, + "learning_rate": 1.1939923858929842e-05, + "loss": 0.8104, + "step": 9634 + }, + { + "epoch": 1.21, + "grad_norm": 19.307849884033203, + "learning_rate": 1.1939087143873156e-05, + "loss": 0.422, + "step": 9635 + }, + { + "epoch": 1.21, + "grad_norm": 15.76440143585205, + "learning_rate": 1.1938250428816468e-05, + "loss": 1.7508, + "step": 9636 + }, + { + "epoch": 1.21, + "grad_norm": 43.16948318481445, + "learning_rate": 1.193741371375978e-05, + "loss": 0.1653, + "step": 9637 + }, + { + "epoch": 1.21, + "grad_norm": 21.53644371032715, + "learning_rate": 1.1936576998703093e-05, + "loss": 1.9636, + "step": 9638 + }, + { + "epoch": 1.21, + "grad_norm": 5.95522403717041, + "learning_rate": 1.1935740283646403e-05, + "loss": 0.4349, + "step": 9639 + }, + { + "epoch": 1.21, + "grad_norm": 11.78096866607666, + "learning_rate": 1.1934903568589717e-05, + "loss": 1.6425, + "step": 9640 + }, + { + "epoch": 1.21, + "grad_norm": 49.823890686035156, + "learning_rate": 1.193406685353303e-05, + "loss": 2.4893, + "step": 9641 + }, + { + "epoch": 1.21, + "grad_norm": 12.305930137634277, + "learning_rate": 1.1933230138476344e-05, + "loss": 0.9311, + "step": 9642 + }, + { + "epoch": 1.21, + "grad_norm": 17.963987350463867, + "learning_rate": 1.1932393423419655e-05, + "loss": 0.686, + "step": 9643 + }, + { + "epoch": 1.21, + "grad_norm": 7.458902359008789, + "learning_rate": 1.1931556708362968e-05, + "loss": 0.6735, + "step": 9644 + }, + { + "epoch": 1.21, + "grad_norm": 26.11444091796875, + "learning_rate": 1.1930719993306282e-05, + "loss": 2.1362, + "step": 9645 + }, + { + "epoch": 1.21, + "grad_norm": 39.13798141479492, + "learning_rate": 1.1929883278249592e-05, + "loss": 2.6185, + "step": 9646 + }, + { + "epoch": 1.21, + "grad_norm": 11.085432052612305, + "learning_rate": 1.1929046563192906e-05, + "loss": 0.6792, + "step": 9647 + }, + { + "epoch": 1.21, + "grad_norm": 25.43505096435547, + "learning_rate": 1.1928209848136218e-05, + "loss": 0.9783, + "step": 9648 + }, + { + "epoch": 1.21, + "grad_norm": 11.517626762390137, + "learning_rate": 1.1927373133079532e-05, + "loss": 1.6657, + "step": 9649 + }, + { + "epoch": 1.21, + "grad_norm": 10.83224105834961, + "learning_rate": 1.1926536418022843e-05, + "loss": 2.2246, + "step": 9650 + }, + { + "epoch": 1.21, + "grad_norm": 33.41084671020508, + "learning_rate": 1.1925699702966155e-05, + "loss": 1.7027, + "step": 9651 + }, + { + "epoch": 1.21, + "grad_norm": 9.236109733581543, + "learning_rate": 1.1924862987909469e-05, + "loss": 0.8989, + "step": 9652 + }, + { + "epoch": 1.21, + "grad_norm": 32.25490188598633, + "learning_rate": 1.192402627285278e-05, + "loss": 0.4262, + "step": 9653 + }, + { + "epoch": 1.21, + "grad_norm": 22.86430549621582, + "learning_rate": 1.1923189557796093e-05, + "loss": 1.3529, + "step": 9654 + }, + { + "epoch": 1.21, + "grad_norm": 135.71849060058594, + "learning_rate": 1.1922352842739407e-05, + "loss": 3.4075, + "step": 9655 + }, + { + "epoch": 1.21, + "grad_norm": 3.792234420776367, + "learning_rate": 1.192151612768272e-05, + "loss": 0.2315, + "step": 9656 + }, + { + "epoch": 1.21, + "grad_norm": 18.394742965698242, + "learning_rate": 1.192067941262603e-05, + "loss": 1.0349, + "step": 9657 + }, + { + "epoch": 1.21, + "grad_norm": 6.221388816833496, + "learning_rate": 1.1919842697569344e-05, + "loss": 0.3315, + "step": 9658 + }, + { + "epoch": 1.21, + "grad_norm": 8.869363784790039, + "learning_rate": 1.1919005982512656e-05, + "loss": 0.4505, + "step": 9659 + }, + { + "epoch": 1.21, + "grad_norm": 9.412805557250977, + "learning_rate": 1.1918169267455968e-05, + "loss": 0.6926, + "step": 9660 + }, + { + "epoch": 1.21, + "grad_norm": 3.7438879013061523, + "learning_rate": 1.1917332552399282e-05, + "loss": 0.237, + "step": 9661 + }, + { + "epoch": 1.21, + "grad_norm": 19.818506240844727, + "learning_rate": 1.1916495837342594e-05, + "loss": 0.9026, + "step": 9662 + }, + { + "epoch": 1.21, + "grad_norm": 2.5379445552825928, + "learning_rate": 1.1915659122285907e-05, + "loss": 0.0703, + "step": 9663 + }, + { + "epoch": 1.21, + "grad_norm": 100.18061065673828, + "learning_rate": 1.191482240722922e-05, + "loss": 1.19, + "step": 9664 + }, + { + "epoch": 1.21, + "grad_norm": 4.609733581542969, + "learning_rate": 1.1913985692172531e-05, + "loss": 0.2924, + "step": 9665 + }, + { + "epoch": 1.21, + "grad_norm": 89.28193664550781, + "learning_rate": 1.1913148977115845e-05, + "loss": 2.1219, + "step": 9666 + }, + { + "epoch": 1.21, + "grad_norm": 44.11702346801758, + "learning_rate": 1.1912312262059155e-05, + "loss": 1.3472, + "step": 9667 + }, + { + "epoch": 1.21, + "grad_norm": 9.148381233215332, + "learning_rate": 1.1911475547002469e-05, + "loss": 1.6529, + "step": 9668 + }, + { + "epoch": 1.21, + "grad_norm": 60.04631042480469, + "learning_rate": 1.1910638831945782e-05, + "loss": 0.3966, + "step": 9669 + }, + { + "epoch": 1.21, + "grad_norm": 9.514388084411621, + "learning_rate": 1.1909802116889096e-05, + "loss": 0.8149, + "step": 9670 + }, + { + "epoch": 1.21, + "grad_norm": 13.282306671142578, + "learning_rate": 1.1908965401832406e-05, + "loss": 1.6097, + "step": 9671 + }, + { + "epoch": 1.21, + "grad_norm": 93.24066925048828, + "learning_rate": 1.190812868677572e-05, + "loss": 1.8783, + "step": 9672 + }, + { + "epoch": 1.21, + "grad_norm": 24.716209411621094, + "learning_rate": 1.1907291971719032e-05, + "loss": 3.0982, + "step": 9673 + }, + { + "epoch": 1.21, + "grad_norm": 8.042412757873535, + "learning_rate": 1.1906455256662344e-05, + "loss": 1.1715, + "step": 9674 + }, + { + "epoch": 1.21, + "grad_norm": 47.73003387451172, + "learning_rate": 1.1905618541605658e-05, + "loss": 1.9625, + "step": 9675 + }, + { + "epoch": 1.21, + "grad_norm": 19.045705795288086, + "learning_rate": 1.190478182654897e-05, + "loss": 1.1437, + "step": 9676 + }, + { + "epoch": 1.21, + "grad_norm": 13.466931343078613, + "learning_rate": 1.1903945111492283e-05, + "loss": 1.1626, + "step": 9677 + }, + { + "epoch": 1.21, + "grad_norm": 11.230758666992188, + "learning_rate": 1.1903108396435593e-05, + "loss": 1.8554, + "step": 9678 + }, + { + "epoch": 1.21, + "grad_norm": 28.8801326751709, + "learning_rate": 1.1902271681378907e-05, + "loss": 2.4584, + "step": 9679 + }, + { + "epoch": 1.21, + "grad_norm": 3.975315809249878, + "learning_rate": 1.190143496632222e-05, + "loss": 0.378, + "step": 9680 + }, + { + "epoch": 1.21, + "grad_norm": 8.131440162658691, + "learning_rate": 1.1900598251265531e-05, + "loss": 0.597, + "step": 9681 + }, + { + "epoch": 1.22, + "grad_norm": 9.377288818359375, + "learning_rate": 1.1899761536208845e-05, + "loss": 1.2148, + "step": 9682 + }, + { + "epoch": 1.22, + "grad_norm": 7.312132358551025, + "learning_rate": 1.1898924821152158e-05, + "loss": 0.8062, + "step": 9683 + }, + { + "epoch": 1.22, + "grad_norm": 8.498202323913574, + "learning_rate": 1.1898088106095472e-05, + "loss": 0.9118, + "step": 9684 + }, + { + "epoch": 1.22, + "grad_norm": 4.082993507385254, + "learning_rate": 1.1897251391038782e-05, + "loss": 1.4373, + "step": 9685 + }, + { + "epoch": 1.22, + "grad_norm": 24.017648696899414, + "learning_rate": 1.1896414675982096e-05, + "loss": 2.074, + "step": 9686 + }, + { + "epoch": 1.22, + "grad_norm": 25.38010025024414, + "learning_rate": 1.1895577960925408e-05, + "loss": 1.515, + "step": 9687 + }, + { + "epoch": 1.22, + "grad_norm": 13.79157543182373, + "learning_rate": 1.189474124586872e-05, + "loss": 0.6778, + "step": 9688 + }, + { + "epoch": 1.22, + "grad_norm": 30.164793014526367, + "learning_rate": 1.1893904530812033e-05, + "loss": 1.5578, + "step": 9689 + }, + { + "epoch": 1.22, + "grad_norm": 16.952709197998047, + "learning_rate": 1.1893067815755345e-05, + "loss": 1.8537, + "step": 9690 + }, + { + "epoch": 1.22, + "grad_norm": 9.253092765808105, + "learning_rate": 1.1892231100698659e-05, + "loss": 0.723, + "step": 9691 + }, + { + "epoch": 1.22, + "grad_norm": 59.83599853515625, + "learning_rate": 1.189139438564197e-05, + "loss": 2.5, + "step": 9692 + }, + { + "epoch": 1.22, + "grad_norm": 61.06582260131836, + "learning_rate": 1.1890557670585283e-05, + "loss": 1.3422, + "step": 9693 + }, + { + "epoch": 1.22, + "grad_norm": 11.885011672973633, + "learning_rate": 1.1889720955528597e-05, + "loss": 1.3652, + "step": 9694 + }, + { + "epoch": 1.22, + "grad_norm": 15.825852394104004, + "learning_rate": 1.1888884240471907e-05, + "loss": 3.3345, + "step": 9695 + }, + { + "epoch": 1.22, + "grad_norm": 19.11424446105957, + "learning_rate": 1.188804752541522e-05, + "loss": 1.8801, + "step": 9696 + }, + { + "epoch": 1.22, + "grad_norm": 36.316688537597656, + "learning_rate": 1.1887210810358534e-05, + "loss": 2.3157, + "step": 9697 + }, + { + "epoch": 1.22, + "grad_norm": 12.54317569732666, + "learning_rate": 1.1886374095301846e-05, + "loss": 1.6094, + "step": 9698 + }, + { + "epoch": 1.22, + "grad_norm": 66.97698211669922, + "learning_rate": 1.1885537380245158e-05, + "loss": 1.1063, + "step": 9699 + }, + { + "epoch": 1.22, + "grad_norm": 5.200563430786133, + "learning_rate": 1.1884700665188472e-05, + "loss": 0.6615, + "step": 9700 + }, + { + "epoch": 1.22, + "grad_norm": 18.189468383789062, + "learning_rate": 1.1883863950131784e-05, + "loss": 1.4355, + "step": 9701 + }, + { + "epoch": 1.22, + "grad_norm": 11.666252136230469, + "learning_rate": 1.1883027235075096e-05, + "loss": 0.8702, + "step": 9702 + }, + { + "epoch": 1.22, + "grad_norm": 12.471773147583008, + "learning_rate": 1.188219052001841e-05, + "loss": 0.8736, + "step": 9703 + }, + { + "epoch": 1.22, + "grad_norm": 15.201849937438965, + "learning_rate": 1.1881353804961721e-05, + "loss": 1.2114, + "step": 9704 + }, + { + "epoch": 1.22, + "grad_norm": 14.885287284851074, + "learning_rate": 1.1880517089905035e-05, + "loss": 0.916, + "step": 9705 + }, + { + "epoch": 1.22, + "grad_norm": 5.266726016998291, + "learning_rate": 1.1879680374848345e-05, + "loss": 0.4309, + "step": 9706 + }, + { + "epoch": 1.22, + "grad_norm": 7.320878505706787, + "learning_rate": 1.1878843659791659e-05, + "loss": 0.4919, + "step": 9707 + }, + { + "epoch": 1.22, + "grad_norm": 10.768665313720703, + "learning_rate": 1.1878006944734972e-05, + "loss": 1.098, + "step": 9708 + }, + { + "epoch": 1.22, + "grad_norm": 9.963107109069824, + "learning_rate": 1.1877170229678283e-05, + "loss": 0.6641, + "step": 9709 + }, + { + "epoch": 1.22, + "grad_norm": 14.743289947509766, + "learning_rate": 1.1876333514621596e-05, + "loss": 1.0066, + "step": 9710 + }, + { + "epoch": 1.22, + "grad_norm": 10.966519355773926, + "learning_rate": 1.187549679956491e-05, + "loss": 0.6277, + "step": 9711 + }, + { + "epoch": 1.22, + "grad_norm": 17.250471115112305, + "learning_rate": 1.1874660084508222e-05, + "loss": 1.2056, + "step": 9712 + }, + { + "epoch": 1.22, + "grad_norm": 18.81276512145996, + "learning_rate": 1.1873823369451534e-05, + "loss": 1.4703, + "step": 9713 + }, + { + "epoch": 1.22, + "grad_norm": 14.694525718688965, + "learning_rate": 1.1872986654394848e-05, + "loss": 1.6066, + "step": 9714 + }, + { + "epoch": 1.22, + "grad_norm": 24.63129425048828, + "learning_rate": 1.187214993933816e-05, + "loss": 3.3113, + "step": 9715 + }, + { + "epoch": 1.22, + "grad_norm": 16.88232421875, + "learning_rate": 1.1871313224281471e-05, + "loss": 0.742, + "step": 9716 + }, + { + "epoch": 1.22, + "grad_norm": 9.315510749816895, + "learning_rate": 1.1870476509224783e-05, + "loss": 0.6042, + "step": 9717 + }, + { + "epoch": 1.22, + "grad_norm": 10.096471786499023, + "learning_rate": 1.1869639794168097e-05, + "loss": 0.448, + "step": 9718 + }, + { + "epoch": 1.22, + "grad_norm": 16.892091751098633, + "learning_rate": 1.186880307911141e-05, + "loss": 1.3073, + "step": 9719 + }, + { + "epoch": 1.22, + "grad_norm": 7.081926345825195, + "learning_rate": 1.1867966364054721e-05, + "loss": 0.9993, + "step": 9720 + }, + { + "epoch": 1.22, + "grad_norm": 8.064104080200195, + "learning_rate": 1.1867129648998035e-05, + "loss": 0.731, + "step": 9721 + }, + { + "epoch": 1.22, + "grad_norm": 50.32485580444336, + "learning_rate": 1.1866292933941348e-05, + "loss": 1.3959, + "step": 9722 + }, + { + "epoch": 1.22, + "grad_norm": 26.517179489135742, + "learning_rate": 1.1865456218884659e-05, + "loss": 4.9971, + "step": 9723 + }, + { + "epoch": 1.22, + "grad_norm": 98.24976348876953, + "learning_rate": 1.1864619503827972e-05, + "loss": 1.5253, + "step": 9724 + }, + { + "epoch": 1.22, + "grad_norm": 36.602752685546875, + "learning_rate": 1.1863782788771286e-05, + "loss": 1.2071, + "step": 9725 + }, + { + "epoch": 1.22, + "grad_norm": 9.339238166809082, + "learning_rate": 1.1862946073714598e-05, + "loss": 1.1399, + "step": 9726 + }, + { + "epoch": 1.22, + "grad_norm": 17.146467208862305, + "learning_rate": 1.186210935865791e-05, + "loss": 0.4849, + "step": 9727 + }, + { + "epoch": 1.22, + "grad_norm": 9.974125862121582, + "learning_rate": 1.1861272643601223e-05, + "loss": 1.5309, + "step": 9728 + }, + { + "epoch": 1.22, + "grad_norm": 13.77196979522705, + "learning_rate": 1.1860435928544535e-05, + "loss": 0.5355, + "step": 9729 + }, + { + "epoch": 1.22, + "grad_norm": 7.349488735198975, + "learning_rate": 1.1859599213487847e-05, + "loss": 1.2569, + "step": 9730 + }, + { + "epoch": 1.22, + "grad_norm": 20.587677001953125, + "learning_rate": 1.185876249843116e-05, + "loss": 1.1589, + "step": 9731 + }, + { + "epoch": 1.22, + "grad_norm": 20.375568389892578, + "learning_rate": 1.1857925783374473e-05, + "loss": 2.3431, + "step": 9732 + }, + { + "epoch": 1.22, + "grad_norm": 8.66171932220459, + "learning_rate": 1.1857089068317787e-05, + "loss": 1.5323, + "step": 9733 + }, + { + "epoch": 1.22, + "grad_norm": 14.362919807434082, + "learning_rate": 1.1856252353261097e-05, + "loss": 1.5258, + "step": 9734 + }, + { + "epoch": 1.22, + "grad_norm": 18.112083435058594, + "learning_rate": 1.185541563820441e-05, + "loss": 2.2156, + "step": 9735 + }, + { + "epoch": 1.22, + "grad_norm": 7.53700590133667, + "learning_rate": 1.1854578923147724e-05, + "loss": 1.7609, + "step": 9736 + }, + { + "epoch": 1.22, + "grad_norm": 12.45809268951416, + "learning_rate": 1.1853742208091034e-05, + "loss": 1.2105, + "step": 9737 + }, + { + "epoch": 1.22, + "grad_norm": 99.39317321777344, + "learning_rate": 1.1852905493034348e-05, + "loss": 2.9131, + "step": 9738 + }, + { + "epoch": 1.22, + "grad_norm": 14.499650955200195, + "learning_rate": 1.1852068777977662e-05, + "loss": 1.1414, + "step": 9739 + }, + { + "epoch": 1.22, + "grad_norm": 9.993605613708496, + "learning_rate": 1.1851232062920974e-05, + "loss": 2.3243, + "step": 9740 + }, + { + "epoch": 1.22, + "grad_norm": 32.130714416503906, + "learning_rate": 1.1850395347864286e-05, + "loss": 0.9311, + "step": 9741 + }, + { + "epoch": 1.22, + "grad_norm": 7.0457258224487305, + "learning_rate": 1.18495586328076e-05, + "loss": 0.4676, + "step": 9742 + }, + { + "epoch": 1.22, + "grad_norm": 56.041038513183594, + "learning_rate": 1.1848721917750911e-05, + "loss": 1.9098, + "step": 9743 + }, + { + "epoch": 1.22, + "grad_norm": 19.361499786376953, + "learning_rate": 1.1847885202694223e-05, + "loss": 1.6205, + "step": 9744 + }, + { + "epoch": 1.22, + "grad_norm": 21.862525939941406, + "learning_rate": 1.1847048487637535e-05, + "loss": 1.8168, + "step": 9745 + }, + { + "epoch": 1.22, + "grad_norm": 10.574731826782227, + "learning_rate": 1.1846211772580849e-05, + "loss": 0.8181, + "step": 9746 + }, + { + "epoch": 1.22, + "grad_norm": 21.914243698120117, + "learning_rate": 1.1845375057524162e-05, + "loss": 1.3609, + "step": 9747 + }, + { + "epoch": 1.22, + "grad_norm": 10.41545581817627, + "learning_rate": 1.1844538342467473e-05, + "loss": 0.5635, + "step": 9748 + }, + { + "epoch": 1.22, + "grad_norm": 7.6245574951171875, + "learning_rate": 1.1843701627410786e-05, + "loss": 1.0122, + "step": 9749 + }, + { + "epoch": 1.22, + "grad_norm": 19.98027992248535, + "learning_rate": 1.18428649123541e-05, + "loss": 1.5541, + "step": 9750 + }, + { + "epoch": 1.22, + "grad_norm": 6.076691627502441, + "learning_rate": 1.184202819729741e-05, + "loss": 0.6404, + "step": 9751 + }, + { + "epoch": 1.22, + "grad_norm": 7.035378456115723, + "learning_rate": 1.1841191482240724e-05, + "loss": 0.839, + "step": 9752 + }, + { + "epoch": 1.22, + "grad_norm": 8.503005981445312, + "learning_rate": 1.1840354767184037e-05, + "loss": 1.9198, + "step": 9753 + }, + { + "epoch": 1.22, + "grad_norm": 19.694664001464844, + "learning_rate": 1.183951805212735e-05, + "loss": 1.6603, + "step": 9754 + }, + { + "epoch": 1.22, + "grad_norm": 8.645819664001465, + "learning_rate": 1.1838681337070661e-05, + "loss": 0.6592, + "step": 9755 + }, + { + "epoch": 1.22, + "grad_norm": 5.133186340332031, + "learning_rate": 1.1837844622013975e-05, + "loss": 0.3353, + "step": 9756 + }, + { + "epoch": 1.22, + "grad_norm": 21.087446212768555, + "learning_rate": 1.1837007906957287e-05, + "loss": 3.1026, + "step": 9757 + }, + { + "epoch": 1.22, + "grad_norm": 19.393712997436523, + "learning_rate": 1.1836171191900599e-05, + "loss": 1.9517, + "step": 9758 + }, + { + "epoch": 1.22, + "grad_norm": 77.39122772216797, + "learning_rate": 1.1835334476843911e-05, + "loss": 1.8786, + "step": 9759 + }, + { + "epoch": 1.22, + "grad_norm": 22.556842803955078, + "learning_rate": 1.1834497761787225e-05, + "loss": 0.9785, + "step": 9760 + }, + { + "epoch": 1.22, + "grad_norm": 12.212501525878906, + "learning_rate": 1.1833661046730538e-05, + "loss": 1.126, + "step": 9761 + }, + { + "epoch": 1.23, + "grad_norm": 12.766206741333008, + "learning_rate": 1.1832824331673848e-05, + "loss": 1.5055, + "step": 9762 + }, + { + "epoch": 1.23, + "grad_norm": 17.554147720336914, + "learning_rate": 1.1831987616617162e-05, + "loss": 0.9103, + "step": 9763 + }, + { + "epoch": 1.23, + "grad_norm": 23.220369338989258, + "learning_rate": 1.1831150901560476e-05, + "loss": 3.0983, + "step": 9764 + }, + { + "epoch": 1.23, + "grad_norm": 22.07280158996582, + "learning_rate": 1.1830314186503786e-05, + "loss": 1.9888, + "step": 9765 + }, + { + "epoch": 1.23, + "grad_norm": 8.112464904785156, + "learning_rate": 1.18294774714471e-05, + "loss": 0.3538, + "step": 9766 + }, + { + "epoch": 1.23, + "grad_norm": 11.294885635375977, + "learning_rate": 1.1828640756390413e-05, + "loss": 1.4769, + "step": 9767 + }, + { + "epoch": 1.23, + "grad_norm": 12.150555610656738, + "learning_rate": 1.1827804041333725e-05, + "loss": 2.2846, + "step": 9768 + }, + { + "epoch": 1.23, + "grad_norm": 5.326368808746338, + "learning_rate": 1.1826967326277037e-05, + "loss": 0.5095, + "step": 9769 + }, + { + "epoch": 1.23, + "grad_norm": 13.025184631347656, + "learning_rate": 1.182613061122035e-05, + "loss": 1.3693, + "step": 9770 + }, + { + "epoch": 1.23, + "grad_norm": 12.305625915527344, + "learning_rate": 1.1825293896163663e-05, + "loss": 1.7269, + "step": 9771 + }, + { + "epoch": 1.23, + "grad_norm": 16.152267456054688, + "learning_rate": 1.1824457181106975e-05, + "loss": 1.6834, + "step": 9772 + }, + { + "epoch": 1.23, + "grad_norm": 17.717496871948242, + "learning_rate": 1.1823620466050287e-05, + "loss": 1.5123, + "step": 9773 + }, + { + "epoch": 1.23, + "grad_norm": 6.071959972381592, + "learning_rate": 1.18227837509936e-05, + "loss": 0.5821, + "step": 9774 + }, + { + "epoch": 1.23, + "grad_norm": 35.208396911621094, + "learning_rate": 1.1821947035936914e-05, + "loss": 0.7074, + "step": 9775 + }, + { + "epoch": 1.23, + "grad_norm": 21.743438720703125, + "learning_rate": 1.1821110320880224e-05, + "loss": 1.6365, + "step": 9776 + }, + { + "epoch": 1.23, + "grad_norm": 15.401273727416992, + "learning_rate": 1.1820273605823538e-05, + "loss": 2.3279, + "step": 9777 + }, + { + "epoch": 1.23, + "grad_norm": 2.5009899139404297, + "learning_rate": 1.1819436890766852e-05, + "loss": 0.0616, + "step": 9778 + }, + { + "epoch": 1.23, + "grad_norm": 8.305074691772461, + "learning_rate": 1.1818600175710162e-05, + "loss": 0.8248, + "step": 9779 + }, + { + "epoch": 1.23, + "grad_norm": 8.066091537475586, + "learning_rate": 1.1817763460653476e-05, + "loss": 0.7442, + "step": 9780 + }, + { + "epoch": 1.23, + "grad_norm": 7.680723667144775, + "learning_rate": 1.181692674559679e-05, + "loss": 1.5972, + "step": 9781 + }, + { + "epoch": 1.23, + "grad_norm": 21.709930419921875, + "learning_rate": 1.1816090030540101e-05, + "loss": 1.9213, + "step": 9782 + }, + { + "epoch": 1.23, + "grad_norm": 8.652266502380371, + "learning_rate": 1.1815253315483413e-05, + "loss": 1.3114, + "step": 9783 + }, + { + "epoch": 1.23, + "grad_norm": 10.539308547973633, + "learning_rate": 1.1814416600426725e-05, + "loss": 1.4542, + "step": 9784 + }, + { + "epoch": 1.23, + "grad_norm": 6.910467147827148, + "learning_rate": 1.1813579885370039e-05, + "loss": 0.9087, + "step": 9785 + }, + { + "epoch": 1.23, + "grad_norm": 4.772772312164307, + "learning_rate": 1.181274317031335e-05, + "loss": 0.1523, + "step": 9786 + }, + { + "epoch": 1.23, + "grad_norm": 13.30754280090332, + "learning_rate": 1.1811906455256663e-05, + "loss": 1.1662, + "step": 9787 + }, + { + "epoch": 1.23, + "grad_norm": 17.31911277770996, + "learning_rate": 1.1811069740199976e-05, + "loss": 1.2639, + "step": 9788 + }, + { + "epoch": 1.23, + "grad_norm": 1.7666053771972656, + "learning_rate": 1.181023302514329e-05, + "loss": 0.0387, + "step": 9789 + }, + { + "epoch": 1.23, + "grad_norm": 11.537734985351562, + "learning_rate": 1.18093963100866e-05, + "loss": 1.752, + "step": 9790 + }, + { + "epoch": 1.23, + "grad_norm": 7.068645477294922, + "learning_rate": 1.1808559595029914e-05, + "loss": 1.4841, + "step": 9791 + }, + { + "epoch": 1.23, + "grad_norm": 14.287550926208496, + "learning_rate": 1.1807722879973227e-05, + "loss": 2.7825, + "step": 9792 + }, + { + "epoch": 1.23, + "grad_norm": 52.4526252746582, + "learning_rate": 1.1806886164916538e-05, + "loss": 2.5035, + "step": 9793 + }, + { + "epoch": 1.23, + "grad_norm": 9.312798500061035, + "learning_rate": 1.1806049449859851e-05, + "loss": 1.8516, + "step": 9794 + }, + { + "epoch": 1.23, + "grad_norm": 31.503602981567383, + "learning_rate": 1.1805212734803165e-05, + "loss": 1.1349, + "step": 9795 + }, + { + "epoch": 1.23, + "grad_norm": 3.658240556716919, + "learning_rate": 1.1804376019746477e-05, + "loss": 0.3557, + "step": 9796 + }, + { + "epoch": 1.23, + "grad_norm": 18.69940185546875, + "learning_rate": 1.1803539304689789e-05, + "loss": 1.3528, + "step": 9797 + }, + { + "epoch": 1.23, + "grad_norm": 17.545040130615234, + "learning_rate": 1.1802702589633101e-05, + "loss": 0.8369, + "step": 9798 + }, + { + "epoch": 1.23, + "grad_norm": 5.3122944831848145, + "learning_rate": 1.1801865874576415e-05, + "loss": 1.507, + "step": 9799 + }, + { + "epoch": 1.23, + "grad_norm": 17.907398223876953, + "learning_rate": 1.1801029159519726e-05, + "loss": 1.8676, + "step": 9800 + }, + { + "epoch": 1.23, + "grad_norm": 24.098581314086914, + "learning_rate": 1.1800192444463038e-05, + "loss": 2.6623, + "step": 9801 + }, + { + "epoch": 1.23, + "grad_norm": 15.955728530883789, + "learning_rate": 1.1799355729406352e-05, + "loss": 1.4016, + "step": 9802 + }, + { + "epoch": 1.23, + "grad_norm": 4.137416839599609, + "learning_rate": 1.1798519014349666e-05, + "loss": 0.114, + "step": 9803 + }, + { + "epoch": 1.23, + "grad_norm": 20.56151580810547, + "learning_rate": 1.1797682299292976e-05, + "loss": 1.1279, + "step": 9804 + }, + { + "epoch": 1.23, + "grad_norm": 18.528879165649414, + "learning_rate": 1.179684558423629e-05, + "loss": 1.7007, + "step": 9805 + }, + { + "epoch": 1.23, + "grad_norm": 11.145833969116211, + "learning_rate": 1.1796008869179603e-05, + "loss": 2.0885, + "step": 9806 + }, + { + "epoch": 1.23, + "grad_norm": 4.693984031677246, + "learning_rate": 1.1795172154122914e-05, + "loss": 1.038, + "step": 9807 + }, + { + "epoch": 1.23, + "grad_norm": 16.135169982910156, + "learning_rate": 1.1794335439066227e-05, + "loss": 1.1995, + "step": 9808 + }, + { + "epoch": 1.23, + "grad_norm": 13.0604248046875, + "learning_rate": 1.179349872400954e-05, + "loss": 3.9162, + "step": 9809 + }, + { + "epoch": 1.23, + "grad_norm": 7.696951389312744, + "learning_rate": 1.1792662008952853e-05, + "loss": 0.8307, + "step": 9810 + }, + { + "epoch": 1.23, + "grad_norm": 12.879096031188965, + "learning_rate": 1.1791825293896165e-05, + "loss": 1.2128, + "step": 9811 + }, + { + "epoch": 1.23, + "grad_norm": 2.9865870475769043, + "learning_rate": 1.1790988578839477e-05, + "loss": 0.3161, + "step": 9812 + }, + { + "epoch": 1.23, + "grad_norm": 14.160734176635742, + "learning_rate": 1.179015186378279e-05, + "loss": 2.3232, + "step": 9813 + }, + { + "epoch": 1.23, + "grad_norm": 14.625443458557129, + "learning_rate": 1.1789315148726102e-05, + "loss": 2.0368, + "step": 9814 + }, + { + "epoch": 1.23, + "grad_norm": 17.27570915222168, + "learning_rate": 1.1788478433669414e-05, + "loss": 0.7004, + "step": 9815 + }, + { + "epoch": 1.23, + "grad_norm": 5.550584316253662, + "learning_rate": 1.1787641718612728e-05, + "loss": 0.354, + "step": 9816 + }, + { + "epoch": 1.23, + "grad_norm": 6.084341049194336, + "learning_rate": 1.1786805003556038e-05, + "loss": 1.1003, + "step": 9817 + }, + { + "epoch": 1.23, + "grad_norm": 21.174999237060547, + "learning_rate": 1.1785968288499352e-05, + "loss": 1.3518, + "step": 9818 + }, + { + "epoch": 1.23, + "grad_norm": 12.584552764892578, + "learning_rate": 1.1785131573442665e-05, + "loss": 2.6013, + "step": 9819 + }, + { + "epoch": 1.23, + "grad_norm": 21.88260269165039, + "learning_rate": 1.1784294858385979e-05, + "loss": 1.9915, + "step": 9820 + }, + { + "epoch": 1.23, + "grad_norm": 11.529659271240234, + "learning_rate": 1.178345814332929e-05, + "loss": 0.7695, + "step": 9821 + }, + { + "epoch": 1.23, + "grad_norm": 7.979567527770996, + "learning_rate": 1.1782621428272603e-05, + "loss": 0.9663, + "step": 9822 + }, + { + "epoch": 1.23, + "grad_norm": 18.05103874206543, + "learning_rate": 1.1781784713215915e-05, + "loss": 3.1226, + "step": 9823 + }, + { + "epoch": 1.23, + "grad_norm": 6.140756607055664, + "learning_rate": 1.1780947998159227e-05, + "loss": 0.2484, + "step": 9824 + }, + { + "epoch": 1.23, + "grad_norm": 7.758366107940674, + "learning_rate": 1.178011128310254e-05, + "loss": 0.21, + "step": 9825 + }, + { + "epoch": 1.23, + "grad_norm": 36.641845703125, + "learning_rate": 1.1779274568045853e-05, + "loss": 2.524, + "step": 9826 + }, + { + "epoch": 1.23, + "grad_norm": 5.3544158935546875, + "learning_rate": 1.1778437852989166e-05, + "loss": 0.1326, + "step": 9827 + }, + { + "epoch": 1.23, + "grad_norm": 90.64929962158203, + "learning_rate": 1.1777601137932476e-05, + "loss": 2.0421, + "step": 9828 + }, + { + "epoch": 1.23, + "grad_norm": 14.497895240783691, + "learning_rate": 1.177676442287579e-05, + "loss": 1.7791, + "step": 9829 + }, + { + "epoch": 1.23, + "grad_norm": 36.353668212890625, + "learning_rate": 1.1775927707819104e-05, + "loss": 0.3409, + "step": 9830 + }, + { + "epoch": 1.23, + "grad_norm": 18.138263702392578, + "learning_rate": 1.1775090992762414e-05, + "loss": 0.6608, + "step": 9831 + }, + { + "epoch": 1.23, + "grad_norm": 0.4441068768501282, + "learning_rate": 1.1774254277705728e-05, + "loss": 0.0099, + "step": 9832 + }, + { + "epoch": 1.23, + "grad_norm": 16.02644157409668, + "learning_rate": 1.1773417562649041e-05, + "loss": 1.1821, + "step": 9833 + }, + { + "epoch": 1.23, + "grad_norm": 9.014534950256348, + "learning_rate": 1.1772580847592355e-05, + "loss": 1.8367, + "step": 9834 + }, + { + "epoch": 1.23, + "grad_norm": 13.180492401123047, + "learning_rate": 1.1771744132535665e-05, + "loss": 0.5815, + "step": 9835 + }, + { + "epoch": 1.23, + "grad_norm": 55.19274139404297, + "learning_rate": 1.1770907417478979e-05, + "loss": 1.3104, + "step": 9836 + }, + { + "epoch": 1.23, + "grad_norm": 9.048669815063477, + "learning_rate": 1.177007070242229e-05, + "loss": 1.456, + "step": 9837 + }, + { + "epoch": 1.23, + "grad_norm": 40.642486572265625, + "learning_rate": 1.1769233987365603e-05, + "loss": 1.2127, + "step": 9838 + }, + { + "epoch": 1.23, + "grad_norm": 17.61302375793457, + "learning_rate": 1.1768397272308916e-05, + "loss": 1.1644, + "step": 9839 + }, + { + "epoch": 1.23, + "grad_norm": 9.714911460876465, + "learning_rate": 1.1767560557252228e-05, + "loss": 1.2779, + "step": 9840 + }, + { + "epoch": 1.24, + "grad_norm": 10.274693489074707, + "learning_rate": 1.1766723842195542e-05, + "loss": 2.0046, + "step": 9841 + }, + { + "epoch": 1.24, + "grad_norm": 4.059805393218994, + "learning_rate": 1.1765887127138852e-05, + "loss": 0.1009, + "step": 9842 + }, + { + "epoch": 1.24, + "grad_norm": 20.27713394165039, + "learning_rate": 1.1765050412082166e-05, + "loss": 1.6268, + "step": 9843 + }, + { + "epoch": 1.24, + "grad_norm": 82.43860626220703, + "learning_rate": 1.176421369702548e-05, + "loss": 2.0341, + "step": 9844 + }, + { + "epoch": 1.24, + "grad_norm": 8.47548770904541, + "learning_rate": 1.176337698196879e-05, + "loss": 1.4052, + "step": 9845 + }, + { + "epoch": 1.24, + "grad_norm": 10.602141380310059, + "learning_rate": 1.1762540266912103e-05, + "loss": 0.8891, + "step": 9846 + }, + { + "epoch": 1.24, + "grad_norm": 16.42512321472168, + "learning_rate": 1.1761703551855417e-05, + "loss": 0.9752, + "step": 9847 + }, + { + "epoch": 1.24, + "grad_norm": 21.03297233581543, + "learning_rate": 1.176086683679873e-05, + "loss": 1.2994, + "step": 9848 + }, + { + "epoch": 1.24, + "grad_norm": 2.0785598754882812, + "learning_rate": 1.1760030121742041e-05, + "loss": 0.1167, + "step": 9849 + }, + { + "epoch": 1.24, + "grad_norm": 17.921354293823242, + "learning_rate": 1.1759193406685355e-05, + "loss": 1.7992, + "step": 9850 + }, + { + "epoch": 1.24, + "grad_norm": 18.078763961791992, + "learning_rate": 1.1758356691628667e-05, + "loss": 1.1432, + "step": 9851 + }, + { + "epoch": 1.24, + "grad_norm": 8.481141090393066, + "learning_rate": 1.1757519976571979e-05, + "loss": 0.5549, + "step": 9852 + }, + { + "epoch": 1.24, + "grad_norm": 21.629512786865234, + "learning_rate": 1.1756683261515292e-05, + "loss": 1.5444, + "step": 9853 + }, + { + "epoch": 1.24, + "grad_norm": 11.057011604309082, + "learning_rate": 1.1755846546458604e-05, + "loss": 0.4012, + "step": 9854 + }, + { + "epoch": 1.24, + "grad_norm": 16.92287826538086, + "learning_rate": 1.1755009831401918e-05, + "loss": 1.2856, + "step": 9855 + }, + { + "epoch": 1.24, + "grad_norm": 18.003637313842773, + "learning_rate": 1.1754173116345228e-05, + "loss": 0.7065, + "step": 9856 + }, + { + "epoch": 1.24, + "grad_norm": 13.426071166992188, + "learning_rate": 1.1753336401288542e-05, + "loss": 2.3196, + "step": 9857 + }, + { + "epoch": 1.24, + "grad_norm": 14.647819519042969, + "learning_rate": 1.1752499686231855e-05, + "loss": 0.7164, + "step": 9858 + }, + { + "epoch": 1.24, + "grad_norm": 13.148072242736816, + "learning_rate": 1.1751662971175166e-05, + "loss": 0.8424, + "step": 9859 + }, + { + "epoch": 1.24, + "grad_norm": 11.636898040771484, + "learning_rate": 1.175082625611848e-05, + "loss": 0.9816, + "step": 9860 + }, + { + "epoch": 1.24, + "grad_norm": 9.009572982788086, + "learning_rate": 1.1749989541061793e-05, + "loss": 1.0581, + "step": 9861 + }, + { + "epoch": 1.24, + "grad_norm": 14.94993782043457, + "learning_rate": 1.1749152826005105e-05, + "loss": 1.0659, + "step": 9862 + }, + { + "epoch": 1.24, + "grad_norm": 14.876943588256836, + "learning_rate": 1.1748316110948417e-05, + "loss": 1.4571, + "step": 9863 + }, + { + "epoch": 1.24, + "grad_norm": 28.33251953125, + "learning_rate": 1.174747939589173e-05, + "loss": 1.6219, + "step": 9864 + }, + { + "epoch": 1.24, + "grad_norm": 10.158002853393555, + "learning_rate": 1.1746642680835042e-05, + "loss": 0.3761, + "step": 9865 + }, + { + "epoch": 1.24, + "grad_norm": 16.329578399658203, + "learning_rate": 1.1745805965778354e-05, + "loss": 0.9413, + "step": 9866 + }, + { + "epoch": 1.24, + "grad_norm": 10.986481666564941, + "learning_rate": 1.1744969250721668e-05, + "loss": 0.9236, + "step": 9867 + }, + { + "epoch": 1.24, + "grad_norm": 3.439972400665283, + "learning_rate": 1.174413253566498e-05, + "loss": 0.0313, + "step": 9868 + }, + { + "epoch": 1.24, + "grad_norm": 9.496933937072754, + "learning_rate": 1.1743295820608294e-05, + "loss": 0.9859, + "step": 9869 + }, + { + "epoch": 1.24, + "grad_norm": 13.342422485351562, + "learning_rate": 1.1742459105551604e-05, + "loss": 0.8337, + "step": 9870 + }, + { + "epoch": 1.24, + "grad_norm": 28.25615119934082, + "learning_rate": 1.1741622390494918e-05, + "loss": 2.9762, + "step": 9871 + }, + { + "epoch": 1.24, + "grad_norm": 10.167847633361816, + "learning_rate": 1.1740785675438231e-05, + "loss": 0.8082, + "step": 9872 + }, + { + "epoch": 1.24, + "grad_norm": 17.874998092651367, + "learning_rate": 1.1739948960381542e-05, + "loss": 2.2444, + "step": 9873 + }, + { + "epoch": 1.24, + "grad_norm": 13.657346725463867, + "learning_rate": 1.1739112245324855e-05, + "loss": 1.3361, + "step": 9874 + }, + { + "epoch": 1.24, + "grad_norm": 26.667905807495117, + "learning_rate": 1.1738275530268169e-05, + "loss": 1.6523, + "step": 9875 + }, + { + "epoch": 1.24, + "grad_norm": 23.977022171020508, + "learning_rate": 1.173743881521148e-05, + "loss": 2.1226, + "step": 9876 + }, + { + "epoch": 1.24, + "grad_norm": 8.629140853881836, + "learning_rate": 1.1736602100154793e-05, + "loss": 1.351, + "step": 9877 + }, + { + "epoch": 1.24, + "grad_norm": 8.725996971130371, + "learning_rate": 1.1735765385098106e-05, + "loss": 0.6322, + "step": 9878 + }, + { + "epoch": 1.24, + "grad_norm": 10.691288948059082, + "learning_rate": 1.1734928670041418e-05, + "loss": 0.3133, + "step": 9879 + }, + { + "epoch": 1.24, + "grad_norm": 3.4821033477783203, + "learning_rate": 1.173409195498473e-05, + "loss": 0.0902, + "step": 9880 + }, + { + "epoch": 1.24, + "grad_norm": 13.584442138671875, + "learning_rate": 1.1733255239928042e-05, + "loss": 2.056, + "step": 9881 + }, + { + "epoch": 1.24, + "grad_norm": 22.304651260375977, + "learning_rate": 1.1732418524871356e-05, + "loss": 1.782, + "step": 9882 + }, + { + "epoch": 1.24, + "grad_norm": 15.612174987792969, + "learning_rate": 1.173158180981467e-05, + "loss": 1.2614, + "step": 9883 + }, + { + "epoch": 1.24, + "grad_norm": 16.01333999633789, + "learning_rate": 1.173074509475798e-05, + "loss": 1.5578, + "step": 9884 + }, + { + "epoch": 1.24, + "grad_norm": 9.732401847839355, + "learning_rate": 1.1729908379701293e-05, + "loss": 1.0354, + "step": 9885 + }, + { + "epoch": 1.24, + "grad_norm": 30.70211410522461, + "learning_rate": 1.1729071664644607e-05, + "loss": 1.0566, + "step": 9886 + }, + { + "epoch": 1.24, + "grad_norm": 9.202712059020996, + "learning_rate": 1.1728234949587917e-05, + "loss": 0.4007, + "step": 9887 + }, + { + "epoch": 1.24, + "grad_norm": 68.00823211669922, + "learning_rate": 1.1727398234531231e-05, + "loss": 1.4828, + "step": 9888 + }, + { + "epoch": 1.24, + "grad_norm": 11.506952285766602, + "learning_rate": 1.1726561519474545e-05, + "loss": 1.145, + "step": 9889 + }, + { + "epoch": 1.24, + "grad_norm": 21.97418975830078, + "learning_rate": 1.1725724804417857e-05, + "loss": 1.2828, + "step": 9890 + }, + { + "epoch": 1.24, + "grad_norm": 13.470939636230469, + "learning_rate": 1.1724888089361169e-05, + "loss": 1.4388, + "step": 9891 + }, + { + "epoch": 1.24, + "grad_norm": 15.934314727783203, + "learning_rate": 1.1724051374304482e-05, + "loss": 1.5466, + "step": 9892 + }, + { + "epoch": 1.24, + "grad_norm": 16.9025936126709, + "learning_rate": 1.1723214659247794e-05, + "loss": 1.1422, + "step": 9893 + }, + { + "epoch": 1.24, + "grad_norm": 11.071962356567383, + "learning_rate": 1.1722377944191106e-05, + "loss": 2.2181, + "step": 9894 + }, + { + "epoch": 1.24, + "grad_norm": 8.60844612121582, + "learning_rate": 1.1721541229134418e-05, + "loss": 1.6378, + "step": 9895 + }, + { + "epoch": 1.24, + "grad_norm": 19.60494041442871, + "learning_rate": 1.1720704514077732e-05, + "loss": 0.4927, + "step": 9896 + }, + { + "epoch": 1.24, + "grad_norm": 6.539211273193359, + "learning_rate": 1.1719867799021045e-05, + "loss": 0.5612, + "step": 9897 + }, + { + "epoch": 1.24, + "grad_norm": 83.48982238769531, + "learning_rate": 1.1719031083964356e-05, + "loss": 1.712, + "step": 9898 + }, + { + "epoch": 1.24, + "grad_norm": 11.819092750549316, + "learning_rate": 1.171819436890767e-05, + "loss": 1.1643, + "step": 9899 + }, + { + "epoch": 1.24, + "grad_norm": 40.66287612915039, + "learning_rate": 1.1717357653850983e-05, + "loss": 1.8852, + "step": 9900 + }, + { + "epoch": 1.24, + "grad_norm": 4.276702880859375, + "learning_rate": 1.1716520938794293e-05, + "loss": 0.2434, + "step": 9901 + }, + { + "epoch": 1.24, + "grad_norm": 12.493239402770996, + "learning_rate": 1.1715684223737607e-05, + "loss": 0.4886, + "step": 9902 + }, + { + "epoch": 1.24, + "grad_norm": 10.208495140075684, + "learning_rate": 1.171484750868092e-05, + "loss": 2.4325, + "step": 9903 + }, + { + "epoch": 1.24, + "grad_norm": 12.189680099487305, + "learning_rate": 1.1714010793624232e-05, + "loss": 1.207, + "step": 9904 + }, + { + "epoch": 1.24, + "grad_norm": 10.993623733520508, + "learning_rate": 1.1713174078567544e-05, + "loss": 1.8157, + "step": 9905 + }, + { + "epoch": 1.24, + "grad_norm": 7.953149318695068, + "learning_rate": 1.1712337363510858e-05, + "loss": 0.8497, + "step": 9906 + }, + { + "epoch": 1.24, + "grad_norm": 14.442801475524902, + "learning_rate": 1.171150064845417e-05, + "loss": 0.9703, + "step": 9907 + }, + { + "epoch": 1.24, + "grad_norm": 27.209766387939453, + "learning_rate": 1.1710663933397482e-05, + "loss": 2.2167, + "step": 9908 + }, + { + "epoch": 1.24, + "grad_norm": 34.28467559814453, + "learning_rate": 1.1709827218340794e-05, + "loss": 1.4908, + "step": 9909 + }, + { + "epoch": 1.24, + "grad_norm": 101.06977844238281, + "learning_rate": 1.1708990503284108e-05, + "loss": 2.5449, + "step": 9910 + }, + { + "epoch": 1.24, + "grad_norm": 16.86500358581543, + "learning_rate": 1.1708153788227421e-05, + "loss": 1.1078, + "step": 9911 + }, + { + "epoch": 1.24, + "grad_norm": 8.253978729248047, + "learning_rate": 1.1707317073170731e-05, + "loss": 1.1973, + "step": 9912 + }, + { + "epoch": 1.24, + "grad_norm": 4.450164794921875, + "learning_rate": 1.1706480358114045e-05, + "loss": 0.4521, + "step": 9913 + }, + { + "epoch": 1.24, + "grad_norm": 19.197463989257812, + "learning_rate": 1.1705643643057359e-05, + "loss": 2.4202, + "step": 9914 + }, + { + "epoch": 1.24, + "grad_norm": 12.841304779052734, + "learning_rate": 1.1704806928000669e-05, + "loss": 1.0224, + "step": 9915 + }, + { + "epoch": 1.24, + "grad_norm": 24.760807037353516, + "learning_rate": 1.1703970212943983e-05, + "loss": 1.556, + "step": 9916 + }, + { + "epoch": 1.24, + "grad_norm": 8.061955451965332, + "learning_rate": 1.1703133497887296e-05, + "loss": 0.5102, + "step": 9917 + }, + { + "epoch": 1.24, + "grad_norm": 13.719026565551758, + "learning_rate": 1.1702296782830608e-05, + "loss": 0.7795, + "step": 9918 + }, + { + "epoch": 1.24, + "grad_norm": 8.608823776245117, + "learning_rate": 1.170146006777392e-05, + "loss": 2.1716, + "step": 9919 + }, + { + "epoch": 1.24, + "grad_norm": 65.23345184326172, + "learning_rate": 1.1700623352717234e-05, + "loss": 1.4773, + "step": 9920 + }, + { + "epoch": 1.25, + "grad_norm": 8.659074783325195, + "learning_rate": 1.1699786637660546e-05, + "loss": 0.7318, + "step": 9921 + }, + { + "epoch": 1.25, + "grad_norm": 10.6577730178833, + "learning_rate": 1.1698949922603858e-05, + "loss": 0.9231, + "step": 9922 + }, + { + "epoch": 1.25, + "grad_norm": 5.324812889099121, + "learning_rate": 1.169811320754717e-05, + "loss": 0.6034, + "step": 9923 + }, + { + "epoch": 1.25, + "grad_norm": 14.057004928588867, + "learning_rate": 1.1697276492490483e-05, + "loss": 0.9089, + "step": 9924 + }, + { + "epoch": 1.25, + "grad_norm": 43.46983337402344, + "learning_rate": 1.1696439777433797e-05, + "loss": 1.6469, + "step": 9925 + }, + { + "epoch": 1.25, + "grad_norm": 18.37807273864746, + "learning_rate": 1.1695603062377107e-05, + "loss": 0.5716, + "step": 9926 + }, + { + "epoch": 1.25, + "grad_norm": 10.710125923156738, + "learning_rate": 1.1694766347320421e-05, + "loss": 2.0483, + "step": 9927 + }, + { + "epoch": 1.25, + "grad_norm": 6.957853317260742, + "learning_rate": 1.1693929632263735e-05, + "loss": 0.4949, + "step": 9928 + }, + { + "epoch": 1.25, + "grad_norm": 122.19803619384766, + "learning_rate": 1.1693092917207045e-05, + "loss": 2.0142, + "step": 9929 + }, + { + "epoch": 1.25, + "grad_norm": 4.989986419677734, + "learning_rate": 1.1692256202150359e-05, + "loss": 0.5258, + "step": 9930 + }, + { + "epoch": 1.25, + "grad_norm": 17.40357208251953, + "learning_rate": 1.1691419487093672e-05, + "loss": 0.9121, + "step": 9931 + }, + { + "epoch": 1.25, + "grad_norm": 14.683014869689941, + "learning_rate": 1.1690582772036984e-05, + "loss": 1.2121, + "step": 9932 + }, + { + "epoch": 1.25, + "grad_norm": 42.671024322509766, + "learning_rate": 1.1689746056980296e-05, + "loss": 2.0181, + "step": 9933 + }, + { + "epoch": 1.25, + "grad_norm": 13.469338417053223, + "learning_rate": 1.1688909341923608e-05, + "loss": 4.0186, + "step": 9934 + }, + { + "epoch": 1.25, + "grad_norm": 28.573440551757812, + "learning_rate": 1.1688072626866922e-05, + "loss": 1.1826, + "step": 9935 + }, + { + "epoch": 1.25, + "grad_norm": 70.24644470214844, + "learning_rate": 1.1687235911810234e-05, + "loss": 2.1922, + "step": 9936 + }, + { + "epoch": 1.25, + "grad_norm": 21.14872169494629, + "learning_rate": 1.1686399196753546e-05, + "loss": 1.451, + "step": 9937 + }, + { + "epoch": 1.25, + "grad_norm": 6.562117576599121, + "learning_rate": 1.168556248169686e-05, + "loss": 0.2988, + "step": 9938 + }, + { + "epoch": 1.25, + "grad_norm": 31.07131004333496, + "learning_rate": 1.1684725766640173e-05, + "loss": 1.3988, + "step": 9939 + }, + { + "epoch": 1.25, + "grad_norm": 12.469810485839844, + "learning_rate": 1.1683889051583483e-05, + "loss": 1.2987, + "step": 9940 + }, + { + "epoch": 1.25, + "grad_norm": 16.9166202545166, + "learning_rate": 1.1683052336526797e-05, + "loss": 3.1104, + "step": 9941 + }, + { + "epoch": 1.25, + "grad_norm": 12.717508316040039, + "learning_rate": 1.168221562147011e-05, + "loss": 2.7321, + "step": 9942 + }, + { + "epoch": 1.25, + "grad_norm": 17.788097381591797, + "learning_rate": 1.168137890641342e-05, + "loss": 1.7442, + "step": 9943 + }, + { + "epoch": 1.25, + "grad_norm": 9.754151344299316, + "learning_rate": 1.1680542191356734e-05, + "loss": 0.6006, + "step": 9944 + }, + { + "epoch": 1.25, + "grad_norm": 18.577787399291992, + "learning_rate": 1.1679705476300048e-05, + "loss": 2.614, + "step": 9945 + }, + { + "epoch": 1.25, + "grad_norm": 48.640342712402344, + "learning_rate": 1.167886876124336e-05, + "loss": 1.2527, + "step": 9946 + }, + { + "epoch": 1.25, + "grad_norm": 59.63102340698242, + "learning_rate": 1.1678032046186672e-05, + "loss": 1.4984, + "step": 9947 + }, + { + "epoch": 1.25, + "grad_norm": 22.24765396118164, + "learning_rate": 1.1677195331129984e-05, + "loss": 1.0165, + "step": 9948 + }, + { + "epoch": 1.25, + "grad_norm": 20.635051727294922, + "learning_rate": 1.1676358616073298e-05, + "loss": 1.4636, + "step": 9949 + }, + { + "epoch": 1.25, + "grad_norm": 14.577654838562012, + "learning_rate": 1.167552190101661e-05, + "loss": 1.8911, + "step": 9950 + }, + { + "epoch": 1.25, + "grad_norm": 15.311452865600586, + "learning_rate": 1.1674685185959921e-05, + "loss": 0.9604, + "step": 9951 + }, + { + "epoch": 1.25, + "grad_norm": 23.743816375732422, + "learning_rate": 1.1673848470903235e-05, + "loss": 2.5244, + "step": 9952 + }, + { + "epoch": 1.25, + "grad_norm": 22.989370346069336, + "learning_rate": 1.1673011755846549e-05, + "loss": 2.0244, + "step": 9953 + }, + { + "epoch": 1.25, + "grad_norm": 14.441717147827148, + "learning_rate": 1.1672175040789859e-05, + "loss": 0.8623, + "step": 9954 + }, + { + "epoch": 1.25, + "grad_norm": 9.20020580291748, + "learning_rate": 1.1671338325733173e-05, + "loss": 1.2268, + "step": 9955 + }, + { + "epoch": 1.25, + "grad_norm": 16.0340576171875, + "learning_rate": 1.1670501610676486e-05, + "loss": 1.6768, + "step": 9956 + }, + { + "epoch": 1.25, + "grad_norm": 5.675755977630615, + "learning_rate": 1.1669664895619797e-05, + "loss": 1.0931, + "step": 9957 + }, + { + "epoch": 1.25, + "grad_norm": 5.870267391204834, + "learning_rate": 1.166882818056311e-05, + "loss": 0.6885, + "step": 9958 + }, + { + "epoch": 1.25, + "grad_norm": 15.182581901550293, + "learning_rate": 1.1667991465506424e-05, + "loss": 1.7034, + "step": 9959 + }, + { + "epoch": 1.25, + "grad_norm": 41.7207145690918, + "learning_rate": 1.1667154750449736e-05, + "loss": 1.8537, + "step": 9960 + }, + { + "epoch": 1.25, + "grad_norm": 9.536687850952148, + "learning_rate": 1.1666318035393048e-05, + "loss": 0.9629, + "step": 9961 + }, + { + "epoch": 1.25, + "grad_norm": 5.596810817718506, + "learning_rate": 1.166548132033636e-05, + "loss": 0.3131, + "step": 9962 + }, + { + "epoch": 1.25, + "grad_norm": 24.205638885498047, + "learning_rate": 1.1664644605279673e-05, + "loss": 1.0994, + "step": 9963 + }, + { + "epoch": 1.25, + "grad_norm": 80.2908706665039, + "learning_rate": 1.1663807890222985e-05, + "loss": 0.9488, + "step": 9964 + }, + { + "epoch": 1.25, + "grad_norm": 11.021303176879883, + "learning_rate": 1.1662971175166297e-05, + "loss": 1.2746, + "step": 9965 + }, + { + "epoch": 1.25, + "grad_norm": 10.356732368469238, + "learning_rate": 1.1662134460109611e-05, + "loss": 1.9948, + "step": 9966 + }, + { + "epoch": 1.25, + "grad_norm": 2.0912623405456543, + "learning_rate": 1.1661297745052925e-05, + "loss": 0.0345, + "step": 9967 + }, + { + "epoch": 1.25, + "grad_norm": 31.7636661529541, + "learning_rate": 1.1660461029996235e-05, + "loss": 1.817, + "step": 9968 + }, + { + "epoch": 1.25, + "grad_norm": 6.6061553955078125, + "learning_rate": 1.1659624314939548e-05, + "loss": 1.1319, + "step": 9969 + }, + { + "epoch": 1.25, + "grad_norm": 13.415298461914062, + "learning_rate": 1.1658787599882862e-05, + "loss": 0.6593, + "step": 9970 + }, + { + "epoch": 1.25, + "grad_norm": 14.584321022033691, + "learning_rate": 1.1657950884826172e-05, + "loss": 1.9673, + "step": 9971 + }, + { + "epoch": 1.25, + "grad_norm": 14.611739158630371, + "learning_rate": 1.1657114169769486e-05, + "loss": 0.9902, + "step": 9972 + }, + { + "epoch": 1.25, + "grad_norm": 11.319719314575195, + "learning_rate": 1.1656277454712798e-05, + "loss": 1.1964, + "step": 9973 + }, + { + "epoch": 1.25, + "grad_norm": 20.697874069213867, + "learning_rate": 1.1655440739656112e-05, + "loss": 1.9791, + "step": 9974 + }, + { + "epoch": 1.25, + "grad_norm": 7.637447834014893, + "learning_rate": 1.1654604024599424e-05, + "loss": 1.1018, + "step": 9975 + }, + { + "epoch": 1.25, + "grad_norm": 8.925323486328125, + "learning_rate": 1.1653767309542736e-05, + "loss": 0.5566, + "step": 9976 + }, + { + "epoch": 1.25, + "grad_norm": 80.9381332397461, + "learning_rate": 1.165293059448605e-05, + "loss": 1.4074, + "step": 9977 + }, + { + "epoch": 1.25, + "grad_norm": 14.44371223449707, + "learning_rate": 1.1652093879429361e-05, + "loss": 1.5644, + "step": 9978 + }, + { + "epoch": 1.25, + "grad_norm": 10.81545639038086, + "learning_rate": 1.1651257164372673e-05, + "loss": 1.211, + "step": 9979 + }, + { + "epoch": 1.25, + "grad_norm": 23.71071434020996, + "learning_rate": 1.1650420449315987e-05, + "loss": 1.4741, + "step": 9980 + }, + { + "epoch": 1.25, + "grad_norm": 9.054380416870117, + "learning_rate": 1.16495837342593e-05, + "loss": 2.1966, + "step": 9981 + }, + { + "epoch": 1.25, + "grad_norm": 13.376519203186035, + "learning_rate": 1.164874701920261e-05, + "loss": 3.4557, + "step": 9982 + }, + { + "epoch": 1.25, + "grad_norm": 22.065784454345703, + "learning_rate": 1.1647910304145924e-05, + "loss": 2.3747, + "step": 9983 + }, + { + "epoch": 1.25, + "grad_norm": 11.869088172912598, + "learning_rate": 1.1647073589089238e-05, + "loss": 1.893, + "step": 9984 + }, + { + "epoch": 1.25, + "grad_norm": 12.72069263458252, + "learning_rate": 1.1646236874032548e-05, + "loss": 1.6795, + "step": 9985 + }, + { + "epoch": 1.25, + "grad_norm": 10.286107063293457, + "learning_rate": 1.1645400158975862e-05, + "loss": 1.1739, + "step": 9986 + }, + { + "epoch": 1.25, + "grad_norm": 28.606319427490234, + "learning_rate": 1.1644563443919174e-05, + "loss": 1.7524, + "step": 9987 + }, + { + "epoch": 1.25, + "grad_norm": 2.5166430473327637, + "learning_rate": 1.1643726728862487e-05, + "loss": 0.0647, + "step": 9988 + }, + { + "epoch": 1.25, + "grad_norm": 21.91144561767578, + "learning_rate": 1.16428900138058e-05, + "loss": 1.3016, + "step": 9989 + }, + { + "epoch": 1.25, + "grad_norm": 23.434961318969727, + "learning_rate": 1.1642053298749111e-05, + "loss": 0.9999, + "step": 9990 + }, + { + "epoch": 1.25, + "grad_norm": 3.6171557903289795, + "learning_rate": 1.1641216583692425e-05, + "loss": 0.3816, + "step": 9991 + }, + { + "epoch": 1.25, + "grad_norm": 25.213613510131836, + "learning_rate": 1.1640379868635735e-05, + "loss": 2.231, + "step": 9992 + }, + { + "epoch": 1.25, + "grad_norm": 3.4147605895996094, + "learning_rate": 1.1639543153579049e-05, + "loss": 0.1599, + "step": 9993 + }, + { + "epoch": 1.25, + "grad_norm": 11.12276840209961, + "learning_rate": 1.1638706438522363e-05, + "loss": 0.9217, + "step": 9994 + }, + { + "epoch": 1.25, + "grad_norm": 7.428321838378906, + "learning_rate": 1.1637869723465676e-05, + "loss": 1.4817, + "step": 9995 + }, + { + "epoch": 1.25, + "grad_norm": 3.9801888465881348, + "learning_rate": 1.1637033008408986e-05, + "loss": 0.0822, + "step": 9996 + }, + { + "epoch": 1.25, + "grad_norm": 15.356182098388672, + "learning_rate": 1.16361962933523e-05, + "loss": 1.5217, + "step": 9997 + }, + { + "epoch": 1.25, + "grad_norm": 10.371597290039062, + "learning_rate": 1.1635359578295614e-05, + "loss": 2.9021, + "step": 9998 + }, + { + "epoch": 1.25, + "grad_norm": 18.860326766967773, + "learning_rate": 1.1634522863238924e-05, + "loss": 1.3427, + "step": 9999 + }, + { + "epoch": 1.25, + "grad_norm": 8.820758819580078, + "learning_rate": 1.1633686148182238e-05, + "loss": 2.0757, + "step": 10000 + }, + { + "epoch": 1.25, + "eval_loss": 0.08529716730117798, + "eval_runtime": 95.5005, + "eval_samples_per_second": 37.089, + "eval_steps_per_second": 37.089, + "step": 10000 + }, + { + "epoch": 1.26, + "grad_norm": 18.028676986694336, + "learning_rate": 1.163284943312555e-05, + "loss": 2.2488, + "step": 10001 + }, + { + "epoch": 1.26, + "grad_norm": 23.154457092285156, + "learning_rate": 1.1632012718068863e-05, + "loss": 0.8694, + "step": 10002 + }, + { + "epoch": 1.26, + "grad_norm": 10.192426681518555, + "learning_rate": 1.1631176003012175e-05, + "loss": 0.8179, + "step": 10003 + }, + { + "epoch": 1.26, + "grad_norm": 37.10844802856445, + "learning_rate": 1.1630339287955487e-05, + "loss": 1.7208, + "step": 10004 + }, + { + "epoch": 1.26, + "grad_norm": 44.653175354003906, + "learning_rate": 1.1629502572898801e-05, + "loss": 0.8287, + "step": 10005 + }, + { + "epoch": 1.26, + "grad_norm": 4.824666976928711, + "learning_rate": 1.1628665857842111e-05, + "loss": 0.2451, + "step": 10006 + }, + { + "epoch": 1.26, + "grad_norm": 10.745501518249512, + "learning_rate": 1.1627829142785425e-05, + "loss": 1.5447, + "step": 10007 + }, + { + "epoch": 1.26, + "grad_norm": 17.71950912475586, + "learning_rate": 1.1626992427728738e-05, + "loss": 2.7385, + "step": 10008 + }, + { + "epoch": 1.26, + "grad_norm": 4.815904140472412, + "learning_rate": 1.1626155712672052e-05, + "loss": 0.3241, + "step": 10009 + }, + { + "epoch": 1.26, + "grad_norm": 9.331512451171875, + "learning_rate": 1.1625318997615362e-05, + "loss": 0.8059, + "step": 10010 + }, + { + "epoch": 1.26, + "grad_norm": 11.508204460144043, + "learning_rate": 1.1624482282558676e-05, + "loss": 1.0577, + "step": 10011 + }, + { + "epoch": 1.26, + "grad_norm": 38.958702087402344, + "learning_rate": 1.162364556750199e-05, + "loss": 1.4877, + "step": 10012 + }, + { + "epoch": 1.26, + "grad_norm": 12.809269905090332, + "learning_rate": 1.16228088524453e-05, + "loss": 0.8781, + "step": 10013 + }, + { + "epoch": 1.26, + "grad_norm": 21.30847930908203, + "learning_rate": 1.1621972137388614e-05, + "loss": 0.6281, + "step": 10014 + }, + { + "epoch": 1.26, + "grad_norm": 12.957433700561523, + "learning_rate": 1.1621135422331925e-05, + "loss": 0.8955, + "step": 10015 + }, + { + "epoch": 1.26, + "grad_norm": 13.342180252075195, + "learning_rate": 1.1620298707275239e-05, + "loss": 1.9799, + "step": 10016 + }, + { + "epoch": 1.26, + "grad_norm": 27.924131393432617, + "learning_rate": 1.1619461992218551e-05, + "loss": 3.0026, + "step": 10017 + }, + { + "epoch": 1.26, + "grad_norm": 20.1268253326416, + "learning_rate": 1.1618625277161863e-05, + "loss": 0.5763, + "step": 10018 + }, + { + "epoch": 1.26, + "grad_norm": 6.630133152008057, + "learning_rate": 1.1617788562105177e-05, + "loss": 0.4575, + "step": 10019 + }, + { + "epoch": 1.26, + "grad_norm": 19.63210678100586, + "learning_rate": 1.1616951847048487e-05, + "loss": 1.0933, + "step": 10020 + }, + { + "epoch": 1.26, + "grad_norm": 14.773037910461426, + "learning_rate": 1.16161151319918e-05, + "loss": 1.8225, + "step": 10021 + }, + { + "epoch": 1.26, + "grad_norm": 30.617250442504883, + "learning_rate": 1.1615278416935114e-05, + "loss": 1.1239, + "step": 10022 + }, + { + "epoch": 1.26, + "grad_norm": 10.700183868408203, + "learning_rate": 1.1614441701878428e-05, + "loss": 0.7783, + "step": 10023 + }, + { + "epoch": 1.26, + "grad_norm": 14.855048179626465, + "learning_rate": 1.1613604986821738e-05, + "loss": 2.5965, + "step": 10024 + }, + { + "epoch": 1.26, + "grad_norm": 15.018572807312012, + "learning_rate": 1.1612768271765052e-05, + "loss": 1.3255, + "step": 10025 + }, + { + "epoch": 1.26, + "grad_norm": 6.425628662109375, + "learning_rate": 1.1611931556708364e-05, + "loss": 0.6021, + "step": 10026 + }, + { + "epoch": 1.26, + "grad_norm": 104.21813201904297, + "learning_rate": 1.1611094841651676e-05, + "loss": 1.1573, + "step": 10027 + }, + { + "epoch": 1.26, + "grad_norm": 8.305505752563477, + "learning_rate": 1.161025812659499e-05, + "loss": 1.0939, + "step": 10028 + }, + { + "epoch": 1.26, + "grad_norm": 11.391011238098145, + "learning_rate": 1.1609421411538301e-05, + "loss": 1.2346, + "step": 10029 + }, + { + "epoch": 1.26, + "grad_norm": 39.1953125, + "learning_rate": 1.1608584696481615e-05, + "loss": 3.9511, + "step": 10030 + }, + { + "epoch": 1.26, + "grad_norm": 28.51256561279297, + "learning_rate": 1.1607747981424927e-05, + "loss": 1.0599, + "step": 10031 + }, + { + "epoch": 1.26, + "grad_norm": 9.189704895019531, + "learning_rate": 1.1606911266368239e-05, + "loss": 1.0351, + "step": 10032 + }, + { + "epoch": 1.26, + "grad_norm": 12.981938362121582, + "learning_rate": 1.1606074551311553e-05, + "loss": 1.1728, + "step": 10033 + }, + { + "epoch": 1.26, + "grad_norm": 56.41666030883789, + "learning_rate": 1.1605237836254863e-05, + "loss": 0.7182, + "step": 10034 + }, + { + "epoch": 1.26, + "grad_norm": 14.686779975891113, + "learning_rate": 1.1604401121198176e-05, + "loss": 2.1056, + "step": 10035 + }, + { + "epoch": 1.26, + "grad_norm": 18.131771087646484, + "learning_rate": 1.160356440614149e-05, + "loss": 2.0395, + "step": 10036 + }, + { + "epoch": 1.26, + "grad_norm": 8.616911888122559, + "learning_rate": 1.1602727691084804e-05, + "loss": 1.0235, + "step": 10037 + }, + { + "epoch": 1.26, + "grad_norm": 23.04092025756836, + "learning_rate": 1.1601890976028114e-05, + "loss": 1.743, + "step": 10038 + }, + { + "epoch": 1.26, + "grad_norm": 48.13896560668945, + "learning_rate": 1.1601054260971428e-05, + "loss": 2.179, + "step": 10039 + }, + { + "epoch": 1.26, + "grad_norm": 19.76043701171875, + "learning_rate": 1.160021754591474e-05, + "loss": 1.943, + "step": 10040 + }, + { + "epoch": 1.26, + "grad_norm": 10.525476455688477, + "learning_rate": 1.1599380830858052e-05, + "loss": 3.165, + "step": 10041 + }, + { + "epoch": 1.26, + "grad_norm": 16.542749404907227, + "learning_rate": 1.1598544115801365e-05, + "loss": 2.4273, + "step": 10042 + }, + { + "epoch": 1.26, + "grad_norm": 2.2783255577087402, + "learning_rate": 1.1597707400744677e-05, + "loss": 0.0453, + "step": 10043 + }, + { + "epoch": 1.26, + "grad_norm": 7.882564544677734, + "learning_rate": 1.159687068568799e-05, + "loss": 0.4292, + "step": 10044 + }, + { + "epoch": 1.26, + "grad_norm": 30.36086082458496, + "learning_rate": 1.1596033970631301e-05, + "loss": 0.7863, + "step": 10045 + }, + { + "epoch": 1.26, + "grad_norm": 16.81833839416504, + "learning_rate": 1.1595197255574615e-05, + "loss": 2.1633, + "step": 10046 + }, + { + "epoch": 1.26, + "grad_norm": 16.21168327331543, + "learning_rate": 1.1594360540517928e-05, + "loss": 0.6202, + "step": 10047 + }, + { + "epoch": 1.26, + "grad_norm": 12.192089080810547, + "learning_rate": 1.1593523825461239e-05, + "loss": 0.6212, + "step": 10048 + }, + { + "epoch": 1.26, + "grad_norm": 15.387441635131836, + "learning_rate": 1.1592687110404552e-05, + "loss": 0.7201, + "step": 10049 + }, + { + "epoch": 1.26, + "grad_norm": 15.565264701843262, + "learning_rate": 1.1591850395347866e-05, + "loss": 0.8358, + "step": 10050 + }, + { + "epoch": 1.26, + "grad_norm": 12.644769668579102, + "learning_rate": 1.159101368029118e-05, + "loss": 2.3882, + "step": 10051 + }, + { + "epoch": 1.26, + "grad_norm": 10.12132453918457, + "learning_rate": 1.159017696523449e-05, + "loss": 1.5997, + "step": 10052 + }, + { + "epoch": 1.26, + "grad_norm": 133.52984619140625, + "learning_rate": 1.1589340250177803e-05, + "loss": 2.4285, + "step": 10053 + }, + { + "epoch": 1.26, + "grad_norm": 13.751230239868164, + "learning_rate": 1.1588503535121115e-05, + "loss": 0.8718, + "step": 10054 + }, + { + "epoch": 1.26, + "grad_norm": 10.26984977722168, + "learning_rate": 1.1587666820064427e-05, + "loss": 1.512, + "step": 10055 + }, + { + "epoch": 1.26, + "grad_norm": 6.855957984924316, + "learning_rate": 1.1586830105007741e-05, + "loss": 0.2865, + "step": 10056 + }, + { + "epoch": 1.26, + "grad_norm": 14.386962890625, + "learning_rate": 1.1585993389951053e-05, + "loss": 0.7904, + "step": 10057 + }, + { + "epoch": 1.26, + "grad_norm": 11.048480033874512, + "learning_rate": 1.1585156674894367e-05, + "loss": 1.1862, + "step": 10058 + }, + { + "epoch": 1.26, + "grad_norm": 3.805039882659912, + "learning_rate": 1.1584319959837677e-05, + "loss": 1.1205, + "step": 10059 + }, + { + "epoch": 1.26, + "grad_norm": 52.64353561401367, + "learning_rate": 1.158348324478099e-05, + "loss": 1.2169, + "step": 10060 + }, + { + "epoch": 1.26, + "grad_norm": 8.508051872253418, + "learning_rate": 1.1582646529724304e-05, + "loss": 0.441, + "step": 10061 + }, + { + "epoch": 1.26, + "grad_norm": 10.83204460144043, + "learning_rate": 1.1581809814667614e-05, + "loss": 0.7029, + "step": 10062 + }, + { + "epoch": 1.26, + "grad_norm": 8.536264419555664, + "learning_rate": 1.1580973099610928e-05, + "loss": 0.5355, + "step": 10063 + }, + { + "epoch": 1.26, + "grad_norm": 18.436201095581055, + "learning_rate": 1.1580136384554242e-05, + "loss": 2.426, + "step": 10064 + }, + { + "epoch": 1.26, + "grad_norm": 4.101004123687744, + "learning_rate": 1.1579299669497555e-05, + "loss": 0.3596, + "step": 10065 + }, + { + "epoch": 1.26, + "grad_norm": 61.37518310546875, + "learning_rate": 1.1578462954440866e-05, + "loss": 1.6832, + "step": 10066 + }, + { + "epoch": 1.26, + "grad_norm": 9.92491340637207, + "learning_rate": 1.157762623938418e-05, + "loss": 2.079, + "step": 10067 + }, + { + "epoch": 1.26, + "grad_norm": 35.844871520996094, + "learning_rate": 1.1576789524327491e-05, + "loss": 2.0752, + "step": 10068 + }, + { + "epoch": 1.26, + "grad_norm": 4.12769079208374, + "learning_rate": 1.1575952809270803e-05, + "loss": 0.2739, + "step": 10069 + }, + { + "epoch": 1.26, + "grad_norm": 11.006604194641113, + "learning_rate": 1.1575116094214117e-05, + "loss": 1.53, + "step": 10070 + }, + { + "epoch": 1.26, + "grad_norm": 13.370402336120605, + "learning_rate": 1.1574279379157429e-05, + "loss": 1.1828, + "step": 10071 + }, + { + "epoch": 1.26, + "grad_norm": 20.020004272460938, + "learning_rate": 1.1573442664100742e-05, + "loss": 1.3519, + "step": 10072 + }, + { + "epoch": 1.26, + "grad_norm": 20.790355682373047, + "learning_rate": 1.1572605949044053e-05, + "loss": 1.2607, + "step": 10073 + }, + { + "epoch": 1.26, + "grad_norm": 13.527242660522461, + "learning_rate": 1.1571769233987366e-05, + "loss": 0.8212, + "step": 10074 + }, + { + "epoch": 1.26, + "grad_norm": 11.991751670837402, + "learning_rate": 1.157093251893068e-05, + "loss": 0.7136, + "step": 10075 + }, + { + "epoch": 1.26, + "grad_norm": 2.771205186843872, + "learning_rate": 1.157009580387399e-05, + "loss": 0.0826, + "step": 10076 + }, + { + "epoch": 1.26, + "grad_norm": 7.323941707611084, + "learning_rate": 1.1569259088817304e-05, + "loss": 0.6736, + "step": 10077 + }, + { + "epoch": 1.26, + "grad_norm": 25.943374633789062, + "learning_rate": 1.1568422373760618e-05, + "loss": 3.555, + "step": 10078 + }, + { + "epoch": 1.26, + "grad_norm": 17.632131576538086, + "learning_rate": 1.156758565870393e-05, + "loss": 2.1017, + "step": 10079 + }, + { + "epoch": 1.27, + "grad_norm": 27.554595947265625, + "learning_rate": 1.1566748943647242e-05, + "loss": 2.1187, + "step": 10080 + }, + { + "epoch": 1.27, + "grad_norm": 11.076910018920898, + "learning_rate": 1.1565912228590555e-05, + "loss": 2.0688, + "step": 10081 + }, + { + "epoch": 1.27, + "grad_norm": 27.039587020874023, + "learning_rate": 1.1565075513533867e-05, + "loss": 1.2906, + "step": 10082 + }, + { + "epoch": 1.27, + "grad_norm": 57.26781463623047, + "learning_rate": 1.1564238798477179e-05, + "loss": 0.9118, + "step": 10083 + }, + { + "epoch": 1.27, + "grad_norm": 8.590510368347168, + "learning_rate": 1.1563402083420491e-05, + "loss": 0.9175, + "step": 10084 + }, + { + "epoch": 1.27, + "grad_norm": 7.459293842315674, + "learning_rate": 1.1562565368363805e-05, + "loss": 0.862, + "step": 10085 + }, + { + "epoch": 1.27, + "grad_norm": 60.84637451171875, + "learning_rate": 1.1561728653307118e-05, + "loss": 2.0098, + "step": 10086 + }, + { + "epoch": 1.27, + "grad_norm": 5.666141986846924, + "learning_rate": 1.1560891938250429e-05, + "loss": 0.5453, + "step": 10087 + }, + { + "epoch": 1.27, + "grad_norm": 10.006087303161621, + "learning_rate": 1.1560055223193742e-05, + "loss": 2.3826, + "step": 10088 + }, + { + "epoch": 1.27, + "grad_norm": 13.376801490783691, + "learning_rate": 1.1559218508137056e-05, + "loss": 2.2379, + "step": 10089 + }, + { + "epoch": 1.27, + "grad_norm": 18.88728141784668, + "learning_rate": 1.1558381793080366e-05, + "loss": 3.1723, + "step": 10090 + }, + { + "epoch": 1.27, + "grad_norm": 15.521059036254883, + "learning_rate": 1.155754507802368e-05, + "loss": 2.7883, + "step": 10091 + }, + { + "epoch": 1.27, + "grad_norm": 24.361106872558594, + "learning_rate": 1.1556708362966993e-05, + "loss": 1.4142, + "step": 10092 + }, + { + "epoch": 1.27, + "grad_norm": 5.3813347816467285, + "learning_rate": 1.1555871647910305e-05, + "loss": 0.3879, + "step": 10093 + }, + { + "epoch": 1.27, + "grad_norm": 3.4180572032928467, + "learning_rate": 1.1555034932853617e-05, + "loss": 0.1363, + "step": 10094 + }, + { + "epoch": 1.27, + "grad_norm": 18.977209091186523, + "learning_rate": 1.1554198217796931e-05, + "loss": 1.1672, + "step": 10095 + }, + { + "epoch": 1.27, + "grad_norm": 3.979006767272949, + "learning_rate": 1.1553361502740243e-05, + "loss": 0.4125, + "step": 10096 + }, + { + "epoch": 1.27, + "grad_norm": 7.230010986328125, + "learning_rate": 1.1552524787683555e-05, + "loss": 0.8564, + "step": 10097 + }, + { + "epoch": 1.27, + "grad_norm": 31.010128021240234, + "learning_rate": 1.1551688072626867e-05, + "loss": 1.5528, + "step": 10098 + }, + { + "epoch": 1.27, + "grad_norm": 13.065135955810547, + "learning_rate": 1.155085135757018e-05, + "loss": 0.4326, + "step": 10099 + }, + { + "epoch": 1.27, + "grad_norm": 37.013404846191406, + "learning_rate": 1.1550014642513494e-05, + "loss": 1.5276, + "step": 10100 + }, + { + "epoch": 1.27, + "grad_norm": 77.3665542602539, + "learning_rate": 1.1549177927456804e-05, + "loss": 2.7431, + "step": 10101 + }, + { + "epoch": 1.27, + "grad_norm": 18.20163345336914, + "learning_rate": 1.1548341212400118e-05, + "loss": 1.9462, + "step": 10102 + }, + { + "epoch": 1.27, + "grad_norm": 11.497478485107422, + "learning_rate": 1.1547504497343432e-05, + "loss": 1.6377, + "step": 10103 + }, + { + "epoch": 1.27, + "grad_norm": 5.81686544418335, + "learning_rate": 1.1546667782286742e-05, + "loss": 0.2629, + "step": 10104 + }, + { + "epoch": 1.27, + "grad_norm": 8.53093147277832, + "learning_rate": 1.1545831067230056e-05, + "loss": 0.8507, + "step": 10105 + }, + { + "epoch": 1.27, + "grad_norm": 25.47751235961914, + "learning_rate": 1.154499435217337e-05, + "loss": 1.11, + "step": 10106 + }, + { + "epoch": 1.27, + "grad_norm": 23.080636978149414, + "learning_rate": 1.1544157637116681e-05, + "loss": 1.3284, + "step": 10107 + }, + { + "epoch": 1.27, + "grad_norm": 10.221174240112305, + "learning_rate": 1.1543320922059993e-05, + "loss": 0.8157, + "step": 10108 + }, + { + "epoch": 1.27, + "grad_norm": 26.856416702270508, + "learning_rate": 1.1542484207003307e-05, + "loss": 2.0644, + "step": 10109 + }, + { + "epoch": 1.27, + "grad_norm": 10.893000602722168, + "learning_rate": 1.1541647491946619e-05, + "loss": 1.6379, + "step": 10110 + }, + { + "epoch": 1.27, + "grad_norm": 9.401641845703125, + "learning_rate": 1.154081077688993e-05, + "loss": 2.956, + "step": 10111 + }, + { + "epoch": 1.27, + "grad_norm": 14.263781547546387, + "learning_rate": 1.1539974061833243e-05, + "loss": 1.6514, + "step": 10112 + }, + { + "epoch": 1.27, + "grad_norm": 19.887052536010742, + "learning_rate": 1.1539137346776556e-05, + "loss": 0.8531, + "step": 10113 + }, + { + "epoch": 1.27, + "grad_norm": 10.784709930419922, + "learning_rate": 1.153830063171987e-05, + "loss": 1.4221, + "step": 10114 + }, + { + "epoch": 1.27, + "grad_norm": 6.730569839477539, + "learning_rate": 1.153746391666318e-05, + "loss": 0.5405, + "step": 10115 + }, + { + "epoch": 1.27, + "grad_norm": 12.676787376403809, + "learning_rate": 1.1536627201606494e-05, + "loss": 0.8175, + "step": 10116 + }, + { + "epoch": 1.27, + "grad_norm": 8.611346244812012, + "learning_rate": 1.1535790486549808e-05, + "loss": 0.5565, + "step": 10117 + }, + { + "epoch": 1.27, + "grad_norm": 12.043750762939453, + "learning_rate": 1.1534953771493118e-05, + "loss": 0.3946, + "step": 10118 + }, + { + "epoch": 1.27, + "grad_norm": 13.019969940185547, + "learning_rate": 1.1534117056436431e-05, + "loss": 1.1617, + "step": 10119 + }, + { + "epoch": 1.27, + "grad_norm": 13.252829551696777, + "learning_rate": 1.1533280341379745e-05, + "loss": 1.5584, + "step": 10120 + }, + { + "epoch": 1.27, + "grad_norm": 16.159860610961914, + "learning_rate": 1.1532443626323057e-05, + "loss": 1.3501, + "step": 10121 + }, + { + "epoch": 1.27, + "grad_norm": 21.49095344543457, + "learning_rate": 1.1531606911266369e-05, + "loss": 0.746, + "step": 10122 + }, + { + "epoch": 1.27, + "grad_norm": 12.386560440063477, + "learning_rate": 1.1530770196209683e-05, + "loss": 0.5928, + "step": 10123 + }, + { + "epoch": 1.27, + "grad_norm": 19.16834831237793, + "learning_rate": 1.1529933481152995e-05, + "loss": 1.1127, + "step": 10124 + }, + { + "epoch": 1.27, + "grad_norm": 7.827854633331299, + "learning_rate": 1.1529096766096307e-05, + "loss": 1.0985, + "step": 10125 + }, + { + "epoch": 1.27, + "grad_norm": 7.0746541023254395, + "learning_rate": 1.1528260051039619e-05, + "loss": 0.8428, + "step": 10126 + }, + { + "epoch": 1.27, + "grad_norm": 74.46416473388672, + "learning_rate": 1.1527423335982932e-05, + "loss": 3.9228, + "step": 10127 + }, + { + "epoch": 1.27, + "grad_norm": 44.8482666015625, + "learning_rate": 1.1526586620926246e-05, + "loss": 1.3062, + "step": 10128 + }, + { + "epoch": 1.27, + "grad_norm": 24.330764770507812, + "learning_rate": 1.1525749905869556e-05, + "loss": 1.4034, + "step": 10129 + }, + { + "epoch": 1.27, + "grad_norm": 17.377822875976562, + "learning_rate": 1.152491319081287e-05, + "loss": 2.4979, + "step": 10130 + }, + { + "epoch": 1.27, + "grad_norm": 6.2692646980285645, + "learning_rate": 1.1524076475756183e-05, + "loss": 1.3149, + "step": 10131 + }, + { + "epoch": 1.27, + "grad_norm": 15.297842979431152, + "learning_rate": 1.1523239760699494e-05, + "loss": 1.3075, + "step": 10132 + }, + { + "epoch": 1.27, + "grad_norm": 12.967483520507812, + "learning_rate": 1.1522403045642807e-05, + "loss": 0.7149, + "step": 10133 + }, + { + "epoch": 1.27, + "grad_norm": 20.41457176208496, + "learning_rate": 1.1521566330586121e-05, + "loss": 1.4163, + "step": 10134 + }, + { + "epoch": 1.27, + "grad_norm": 8.600605964660645, + "learning_rate": 1.1520729615529433e-05, + "loss": 1.5241, + "step": 10135 + }, + { + "epoch": 1.27, + "grad_norm": 10.931024551391602, + "learning_rate": 1.1519892900472745e-05, + "loss": 0.6916, + "step": 10136 + }, + { + "epoch": 1.27, + "grad_norm": 18.53843116760254, + "learning_rate": 1.1519056185416057e-05, + "loss": 0.78, + "step": 10137 + }, + { + "epoch": 1.27, + "grad_norm": 9.421236991882324, + "learning_rate": 1.151821947035937e-05, + "loss": 0.7191, + "step": 10138 + }, + { + "epoch": 1.27, + "grad_norm": 6.311631679534912, + "learning_rate": 1.1517382755302682e-05, + "loss": 0.4033, + "step": 10139 + }, + { + "epoch": 1.27, + "grad_norm": 9.313081741333008, + "learning_rate": 1.1516546040245994e-05, + "loss": 0.8096, + "step": 10140 + }, + { + "epoch": 1.27, + "grad_norm": 22.80303955078125, + "learning_rate": 1.1515709325189308e-05, + "loss": 1.9656, + "step": 10141 + }, + { + "epoch": 1.27, + "grad_norm": 14.31672191619873, + "learning_rate": 1.1514872610132622e-05, + "loss": 0.536, + "step": 10142 + }, + { + "epoch": 1.27, + "grad_norm": 9.868819236755371, + "learning_rate": 1.1514035895075932e-05, + "loss": 0.5144, + "step": 10143 + }, + { + "epoch": 1.27, + "grad_norm": 14.120258331298828, + "learning_rate": 1.1513199180019246e-05, + "loss": 1.4992, + "step": 10144 + }, + { + "epoch": 1.27, + "grad_norm": 9.662248611450195, + "learning_rate": 1.151236246496256e-05, + "loss": 0.5324, + "step": 10145 + }, + { + "epoch": 1.27, + "grad_norm": 13.44205093383789, + "learning_rate": 1.151152574990587e-05, + "loss": 0.8438, + "step": 10146 + }, + { + "epoch": 1.27, + "grad_norm": 6.933963775634766, + "learning_rate": 1.1510689034849183e-05, + "loss": 1.4326, + "step": 10147 + }, + { + "epoch": 1.27, + "grad_norm": 16.528087615966797, + "learning_rate": 1.1509852319792497e-05, + "loss": 3.5467, + "step": 10148 + }, + { + "epoch": 1.27, + "grad_norm": 12.353951454162598, + "learning_rate": 1.1509015604735809e-05, + "loss": 0.9497, + "step": 10149 + }, + { + "epoch": 1.27, + "grad_norm": 19.795032501220703, + "learning_rate": 1.150817888967912e-05, + "loss": 2.5206, + "step": 10150 + }, + { + "epoch": 1.27, + "grad_norm": 7.626039505004883, + "learning_rate": 1.1507342174622433e-05, + "loss": 0.5983, + "step": 10151 + }, + { + "epoch": 1.27, + "grad_norm": 16.426559448242188, + "learning_rate": 1.1506505459565746e-05, + "loss": 1.5464, + "step": 10152 + }, + { + "epoch": 1.27, + "grad_norm": 8.337430000305176, + "learning_rate": 1.1505668744509058e-05, + "loss": 0.5066, + "step": 10153 + }, + { + "epoch": 1.27, + "grad_norm": 7.870647430419922, + "learning_rate": 1.150483202945237e-05, + "loss": 0.7625, + "step": 10154 + }, + { + "epoch": 1.27, + "grad_norm": 13.128697395324707, + "learning_rate": 1.1503995314395684e-05, + "loss": 2.1313, + "step": 10155 + }, + { + "epoch": 1.27, + "grad_norm": 29.010080337524414, + "learning_rate": 1.1503158599338998e-05, + "loss": 0.5929, + "step": 10156 + }, + { + "epoch": 1.27, + "grad_norm": 26.012073516845703, + "learning_rate": 1.1502321884282308e-05, + "loss": 1.9542, + "step": 10157 + }, + { + "epoch": 1.27, + "grad_norm": 11.357194900512695, + "learning_rate": 1.1501485169225621e-05, + "loss": 0.6633, + "step": 10158 + }, + { + "epoch": 1.27, + "grad_norm": 9.94775390625, + "learning_rate": 1.1500648454168935e-05, + "loss": 1.4055, + "step": 10159 + }, + { + "epoch": 1.28, + "grad_norm": 15.528973579406738, + "learning_rate": 1.1499811739112245e-05, + "loss": 2.0949, + "step": 10160 + }, + { + "epoch": 1.28, + "grad_norm": 19.442302703857422, + "learning_rate": 1.1498975024055559e-05, + "loss": 1.303, + "step": 10161 + }, + { + "epoch": 1.28, + "grad_norm": 3.6374619007110596, + "learning_rate": 1.1498138308998873e-05, + "loss": 0.3394, + "step": 10162 + }, + { + "epoch": 1.28, + "grad_norm": 39.10694885253906, + "learning_rate": 1.1497301593942185e-05, + "loss": 3.5757, + "step": 10163 + }, + { + "epoch": 1.28, + "grad_norm": 5.301551342010498, + "learning_rate": 1.1496464878885497e-05, + "loss": 0.4299, + "step": 10164 + }, + { + "epoch": 1.28, + "grad_norm": 14.305914878845215, + "learning_rate": 1.1495628163828808e-05, + "loss": 2.2254, + "step": 10165 + }, + { + "epoch": 1.28, + "grad_norm": 10.280922889709473, + "learning_rate": 1.1494791448772122e-05, + "loss": 0.5056, + "step": 10166 + }, + { + "epoch": 1.28, + "grad_norm": 19.907215118408203, + "learning_rate": 1.1493954733715434e-05, + "loss": 0.862, + "step": 10167 + }, + { + "epoch": 1.28, + "grad_norm": 13.114398956298828, + "learning_rate": 1.1493118018658746e-05, + "loss": 1.274, + "step": 10168 + }, + { + "epoch": 1.28, + "grad_norm": 8.666918754577637, + "learning_rate": 1.149228130360206e-05, + "loss": 1.2034, + "step": 10169 + }, + { + "epoch": 1.28, + "grad_norm": 16.747922897338867, + "learning_rate": 1.149144458854537e-05, + "loss": 1.3977, + "step": 10170 + }, + { + "epoch": 1.28, + "grad_norm": 9.491082191467285, + "learning_rate": 1.1490607873488684e-05, + "loss": 1.5995, + "step": 10171 + }, + { + "epoch": 1.28, + "grad_norm": 109.70430755615234, + "learning_rate": 1.1489771158431997e-05, + "loss": 2.0431, + "step": 10172 + }, + { + "epoch": 1.28, + "grad_norm": 19.952125549316406, + "learning_rate": 1.1488934443375311e-05, + "loss": 2.6242, + "step": 10173 + }, + { + "epoch": 1.28, + "grad_norm": 14.38827896118164, + "learning_rate": 1.1488097728318621e-05, + "loss": 1.0839, + "step": 10174 + }, + { + "epoch": 1.28, + "grad_norm": 8.935357093811035, + "learning_rate": 1.1487261013261935e-05, + "loss": 0.4499, + "step": 10175 + }, + { + "epoch": 1.28, + "grad_norm": 12.330789566040039, + "learning_rate": 1.1486424298205248e-05, + "loss": 1.8868, + "step": 10176 + }, + { + "epoch": 1.28, + "grad_norm": 5.454981327056885, + "learning_rate": 1.1485587583148559e-05, + "loss": 0.5025, + "step": 10177 + }, + { + "epoch": 1.28, + "grad_norm": 9.598280906677246, + "learning_rate": 1.1484750868091872e-05, + "loss": 0.4013, + "step": 10178 + }, + { + "epoch": 1.28, + "grad_norm": 9.526079177856445, + "learning_rate": 1.1483914153035184e-05, + "loss": 0.933, + "step": 10179 + }, + { + "epoch": 1.28, + "grad_norm": 15.199457168579102, + "learning_rate": 1.1483077437978498e-05, + "loss": 1.4231, + "step": 10180 + }, + { + "epoch": 1.28, + "grad_norm": 78.98918151855469, + "learning_rate": 1.148224072292181e-05, + "loss": 2.0349, + "step": 10181 + }, + { + "epoch": 1.28, + "grad_norm": 32.13898849487305, + "learning_rate": 1.1481404007865122e-05, + "loss": 1.6432, + "step": 10182 + }, + { + "epoch": 1.28, + "grad_norm": 38.61654281616211, + "learning_rate": 1.1480567292808436e-05, + "loss": 2.6025, + "step": 10183 + }, + { + "epoch": 1.28, + "grad_norm": 3.5785717964172363, + "learning_rate": 1.1479730577751746e-05, + "loss": 0.1609, + "step": 10184 + }, + { + "epoch": 1.28, + "grad_norm": 4.924627304077148, + "learning_rate": 1.147889386269506e-05, + "loss": 0.2792, + "step": 10185 + }, + { + "epoch": 1.28, + "grad_norm": 61.50282287597656, + "learning_rate": 1.1478057147638373e-05, + "loss": 2.2775, + "step": 10186 + }, + { + "epoch": 1.28, + "grad_norm": 17.36076545715332, + "learning_rate": 1.1477220432581687e-05, + "loss": 0.7657, + "step": 10187 + }, + { + "epoch": 1.28, + "grad_norm": 12.714561462402344, + "learning_rate": 1.1476383717524997e-05, + "loss": 1.351, + "step": 10188 + }, + { + "epoch": 1.28, + "grad_norm": 16.25249481201172, + "learning_rate": 1.147554700246831e-05, + "loss": 1.2711, + "step": 10189 + }, + { + "epoch": 1.28, + "grad_norm": 22.99129867553711, + "learning_rate": 1.1474710287411623e-05, + "loss": 1.2257, + "step": 10190 + }, + { + "epoch": 1.28, + "grad_norm": 50.55567169189453, + "learning_rate": 1.1473873572354935e-05, + "loss": 3.1723, + "step": 10191 + }, + { + "epoch": 1.28, + "grad_norm": 29.830339431762695, + "learning_rate": 1.1473036857298248e-05, + "loss": 2.6605, + "step": 10192 + }, + { + "epoch": 1.28, + "grad_norm": 11.058066368103027, + "learning_rate": 1.147220014224156e-05, + "loss": 0.4693, + "step": 10193 + }, + { + "epoch": 1.28, + "grad_norm": 27.12210464477539, + "learning_rate": 1.1471363427184874e-05, + "loss": 1.9006, + "step": 10194 + }, + { + "epoch": 1.28, + "grad_norm": 15.129981994628906, + "learning_rate": 1.1470526712128184e-05, + "loss": 0.7715, + "step": 10195 + }, + { + "epoch": 1.28, + "grad_norm": 11.411121368408203, + "learning_rate": 1.1469689997071498e-05, + "loss": 1.2326, + "step": 10196 + }, + { + "epoch": 1.28, + "grad_norm": 14.986672401428223, + "learning_rate": 1.1468853282014811e-05, + "loss": 1.0021, + "step": 10197 + }, + { + "epoch": 1.28, + "grad_norm": 7.28682279586792, + "learning_rate": 1.1468016566958122e-05, + "loss": 1.6545, + "step": 10198 + }, + { + "epoch": 1.28, + "grad_norm": 12.118386268615723, + "learning_rate": 1.1467179851901435e-05, + "loss": 0.605, + "step": 10199 + }, + { + "epoch": 1.28, + "grad_norm": 11.546998977661133, + "learning_rate": 1.1466343136844749e-05, + "loss": 0.5888, + "step": 10200 + }, + { + "epoch": 1.28, + "grad_norm": 9.022869110107422, + "learning_rate": 1.1465506421788063e-05, + "loss": 0.7267, + "step": 10201 + }, + { + "epoch": 1.28, + "grad_norm": 11.095171928405762, + "learning_rate": 1.1464669706731373e-05, + "loss": 2.0642, + "step": 10202 + }, + { + "epoch": 1.28, + "grad_norm": 10.303417205810547, + "learning_rate": 1.1463832991674686e-05, + "loss": 0.5233, + "step": 10203 + }, + { + "epoch": 1.28, + "grad_norm": 11.605952262878418, + "learning_rate": 1.1462996276617998e-05, + "loss": 2.1517, + "step": 10204 + }, + { + "epoch": 1.28, + "grad_norm": 15.570901870727539, + "learning_rate": 1.146215956156131e-05, + "loss": 0.9086, + "step": 10205 + }, + { + "epoch": 1.28, + "grad_norm": 81.48786926269531, + "learning_rate": 1.1461322846504624e-05, + "loss": 2.5043, + "step": 10206 + }, + { + "epoch": 1.28, + "grad_norm": 38.58003616333008, + "learning_rate": 1.1460486131447936e-05, + "loss": 0.7259, + "step": 10207 + }, + { + "epoch": 1.28, + "grad_norm": 18.89350128173828, + "learning_rate": 1.145964941639125e-05, + "loss": 2.7088, + "step": 10208 + }, + { + "epoch": 1.28, + "grad_norm": 11.42757511138916, + "learning_rate": 1.145881270133456e-05, + "loss": 1.7884, + "step": 10209 + }, + { + "epoch": 1.28, + "grad_norm": 14.500580787658691, + "learning_rate": 1.1457975986277874e-05, + "loss": 1.8369, + "step": 10210 + }, + { + "epoch": 1.28, + "grad_norm": 10.310991287231445, + "learning_rate": 1.1457139271221187e-05, + "loss": 0.2204, + "step": 10211 + }, + { + "epoch": 1.28, + "grad_norm": 7.678720951080322, + "learning_rate": 1.1456302556164497e-05, + "loss": 0.7853, + "step": 10212 + }, + { + "epoch": 1.28, + "grad_norm": 23.72821617126465, + "learning_rate": 1.1455465841107811e-05, + "loss": 1.7404, + "step": 10213 + }, + { + "epoch": 1.28, + "grad_norm": 11.855911254882812, + "learning_rate": 1.1454629126051125e-05, + "loss": 1.0215, + "step": 10214 + }, + { + "epoch": 1.28, + "grad_norm": 5.976694583892822, + "learning_rate": 1.1453792410994438e-05, + "loss": 0.2622, + "step": 10215 + }, + { + "epoch": 1.28, + "grad_norm": 23.440793991088867, + "learning_rate": 1.1452955695937749e-05, + "loss": 1.4206, + "step": 10216 + }, + { + "epoch": 1.28, + "grad_norm": 13.946830749511719, + "learning_rate": 1.1452118980881062e-05, + "loss": 2.2023, + "step": 10217 + }, + { + "epoch": 1.28, + "grad_norm": 88.03887939453125, + "learning_rate": 1.1451282265824374e-05, + "loss": 3.502, + "step": 10218 + }, + { + "epoch": 1.28, + "grad_norm": 19.027698516845703, + "learning_rate": 1.1450445550767686e-05, + "loss": 0.7178, + "step": 10219 + }, + { + "epoch": 1.28, + "grad_norm": 56.99128723144531, + "learning_rate": 1.1449608835711e-05, + "loss": 0.9991, + "step": 10220 + }, + { + "epoch": 1.28, + "grad_norm": 13.91539192199707, + "learning_rate": 1.1448772120654312e-05, + "loss": 1.4028, + "step": 10221 + }, + { + "epoch": 1.28, + "grad_norm": 34.86525344848633, + "learning_rate": 1.1447935405597625e-05, + "loss": 1.5506, + "step": 10222 + }, + { + "epoch": 1.28, + "grad_norm": 18.596843719482422, + "learning_rate": 1.1447098690540936e-05, + "loss": 1.7121, + "step": 10223 + }, + { + "epoch": 1.28, + "grad_norm": 3.9318630695343018, + "learning_rate": 1.144626197548425e-05, + "loss": 0.2067, + "step": 10224 + }, + { + "epoch": 1.28, + "grad_norm": 22.74734878540039, + "learning_rate": 1.1445425260427563e-05, + "loss": 1.2098, + "step": 10225 + }, + { + "epoch": 1.28, + "grad_norm": 13.152312278747559, + "learning_rate": 1.1444588545370873e-05, + "loss": 0.6433, + "step": 10226 + }, + { + "epoch": 1.28, + "grad_norm": 13.77457046508789, + "learning_rate": 1.1443751830314187e-05, + "loss": 1.9025, + "step": 10227 + }, + { + "epoch": 1.28, + "grad_norm": 8.574159622192383, + "learning_rate": 1.14429151152575e-05, + "loss": 0.8724, + "step": 10228 + }, + { + "epoch": 1.28, + "grad_norm": 21.462326049804688, + "learning_rate": 1.1442078400200813e-05, + "loss": 1.4886, + "step": 10229 + }, + { + "epoch": 1.28, + "grad_norm": 19.787872314453125, + "learning_rate": 1.1441241685144125e-05, + "loss": 2.7115, + "step": 10230 + }, + { + "epoch": 1.28, + "grad_norm": 14.572142601013184, + "learning_rate": 1.1440404970087438e-05, + "loss": 1.9277, + "step": 10231 + }, + { + "epoch": 1.28, + "grad_norm": 67.16240692138672, + "learning_rate": 1.143956825503075e-05, + "loss": 1.009, + "step": 10232 + }, + { + "epoch": 1.28, + "grad_norm": 15.596375465393066, + "learning_rate": 1.1438731539974062e-05, + "loss": 1.0441, + "step": 10233 + }, + { + "epoch": 1.28, + "grad_norm": 5.920174598693848, + "learning_rate": 1.1437894824917376e-05, + "loss": 0.291, + "step": 10234 + }, + { + "epoch": 1.28, + "grad_norm": 14.65095329284668, + "learning_rate": 1.1437058109860688e-05, + "loss": 0.7002, + "step": 10235 + }, + { + "epoch": 1.28, + "grad_norm": 16.111230850219727, + "learning_rate": 1.1436221394804001e-05, + "loss": 1.2901, + "step": 10236 + }, + { + "epoch": 1.28, + "grad_norm": 52.37575149536133, + "learning_rate": 1.1435384679747312e-05, + "loss": 0.932, + "step": 10237 + }, + { + "epoch": 1.28, + "grad_norm": 16.191383361816406, + "learning_rate": 1.1434547964690625e-05, + "loss": 0.8914, + "step": 10238 + }, + { + "epoch": 1.28, + "grad_norm": 39.67690658569336, + "learning_rate": 1.1433711249633939e-05, + "loss": 1.0724, + "step": 10239 + }, + { + "epoch": 1.29, + "grad_norm": 17.87415885925293, + "learning_rate": 1.1432874534577249e-05, + "loss": 2.0775, + "step": 10240 + }, + { + "epoch": 1.29, + "grad_norm": 17.937177658081055, + "learning_rate": 1.1432037819520563e-05, + "loss": 1.4651, + "step": 10241 + }, + { + "epoch": 1.29, + "grad_norm": 7.352197647094727, + "learning_rate": 1.1431201104463876e-05, + "loss": 0.971, + "step": 10242 + }, + { + "epoch": 1.29, + "grad_norm": 18.424087524414062, + "learning_rate": 1.1430364389407188e-05, + "loss": 0.9773, + "step": 10243 + }, + { + "epoch": 1.29, + "grad_norm": 38.48517990112305, + "learning_rate": 1.14295276743505e-05, + "loss": 1.7147, + "step": 10244 + }, + { + "epoch": 1.29, + "grad_norm": 43.285911560058594, + "learning_rate": 1.1428690959293814e-05, + "loss": 1.2354, + "step": 10245 + }, + { + "epoch": 1.29, + "grad_norm": 20.401620864868164, + "learning_rate": 1.1427854244237126e-05, + "loss": 1.2132, + "step": 10246 + }, + { + "epoch": 1.29, + "grad_norm": 7.969087600708008, + "learning_rate": 1.1427017529180438e-05, + "loss": 0.4129, + "step": 10247 + }, + { + "epoch": 1.29, + "grad_norm": 10.487325668334961, + "learning_rate": 1.142618081412375e-05, + "loss": 0.5936, + "step": 10248 + }, + { + "epoch": 1.29, + "grad_norm": 15.239189147949219, + "learning_rate": 1.1425344099067064e-05, + "loss": 1.5616, + "step": 10249 + }, + { + "epoch": 1.29, + "grad_norm": 6.267992973327637, + "learning_rate": 1.1424507384010377e-05, + "loss": 0.5374, + "step": 10250 + }, + { + "epoch": 1.29, + "grad_norm": 6.666161060333252, + "learning_rate": 1.1423670668953687e-05, + "loss": 0.2554, + "step": 10251 + }, + { + "epoch": 1.29, + "grad_norm": 7.305466175079346, + "learning_rate": 1.1422833953897001e-05, + "loss": 0.8353, + "step": 10252 + }, + { + "epoch": 1.29, + "grad_norm": 4.593288421630859, + "learning_rate": 1.1421997238840315e-05, + "loss": 0.2813, + "step": 10253 + }, + { + "epoch": 1.29, + "grad_norm": 6.5083909034729, + "learning_rate": 1.1421160523783625e-05, + "loss": 0.8337, + "step": 10254 + }, + { + "epoch": 1.29, + "grad_norm": 8.180171012878418, + "learning_rate": 1.1420323808726939e-05, + "loss": 0.6955, + "step": 10255 + }, + { + "epoch": 1.29, + "grad_norm": 16.141521453857422, + "learning_rate": 1.1419487093670252e-05, + "loss": 2.3699, + "step": 10256 + }, + { + "epoch": 1.29, + "grad_norm": 16.75731086730957, + "learning_rate": 1.1418650378613564e-05, + "loss": 1.3129, + "step": 10257 + }, + { + "epoch": 1.29, + "grad_norm": 16.07633399963379, + "learning_rate": 1.1417813663556876e-05, + "loss": 1.6392, + "step": 10258 + }, + { + "epoch": 1.29, + "grad_norm": 18.466657638549805, + "learning_rate": 1.141697694850019e-05, + "loss": 1.7056, + "step": 10259 + }, + { + "epoch": 1.29, + "grad_norm": 10.80053997039795, + "learning_rate": 1.1416140233443502e-05, + "loss": 1.4135, + "step": 10260 + }, + { + "epoch": 1.29, + "grad_norm": 28.12500762939453, + "learning_rate": 1.1415303518386814e-05, + "loss": 1.8611, + "step": 10261 + }, + { + "epoch": 1.29, + "grad_norm": 12.086811065673828, + "learning_rate": 1.1414466803330126e-05, + "loss": 1.3992, + "step": 10262 + }, + { + "epoch": 1.29, + "grad_norm": 30.865243911743164, + "learning_rate": 1.141363008827344e-05, + "loss": 1.8881, + "step": 10263 + }, + { + "epoch": 1.29, + "grad_norm": 11.483710289001465, + "learning_rate": 1.1412793373216753e-05, + "loss": 1.1132, + "step": 10264 + }, + { + "epoch": 1.29, + "grad_norm": 26.61813735961914, + "learning_rate": 1.1411956658160063e-05, + "loss": 1.5824, + "step": 10265 + }, + { + "epoch": 1.29, + "grad_norm": 28.661653518676758, + "learning_rate": 1.1411119943103377e-05, + "loss": 2.1407, + "step": 10266 + }, + { + "epoch": 1.29, + "grad_norm": 9.828034400939941, + "learning_rate": 1.141028322804669e-05, + "loss": 1.0855, + "step": 10267 + }, + { + "epoch": 1.29, + "grad_norm": 15.128324508666992, + "learning_rate": 1.140944651299e-05, + "loss": 2.3863, + "step": 10268 + }, + { + "epoch": 1.29, + "grad_norm": 8.595596313476562, + "learning_rate": 1.1408609797933314e-05, + "loss": 1.7249, + "step": 10269 + }, + { + "epoch": 1.29, + "grad_norm": 16.92155647277832, + "learning_rate": 1.1407773082876628e-05, + "loss": 1.4368, + "step": 10270 + }, + { + "epoch": 1.29, + "grad_norm": 6.1554646492004395, + "learning_rate": 1.140693636781994e-05, + "loss": 1.1061, + "step": 10271 + }, + { + "epoch": 1.29, + "grad_norm": 7.05190372467041, + "learning_rate": 1.1406099652763252e-05, + "loss": 0.6833, + "step": 10272 + }, + { + "epoch": 1.29, + "grad_norm": 32.9219856262207, + "learning_rate": 1.1405262937706566e-05, + "loss": 1.4377, + "step": 10273 + }, + { + "epoch": 1.29, + "grad_norm": 16.694856643676758, + "learning_rate": 1.1404426222649878e-05, + "loss": 0.8126, + "step": 10274 + }, + { + "epoch": 1.29, + "grad_norm": 34.169456481933594, + "learning_rate": 1.140358950759319e-05, + "loss": 1.5287, + "step": 10275 + }, + { + "epoch": 1.29, + "grad_norm": 4.406486511230469, + "learning_rate": 1.1402752792536502e-05, + "loss": 0.2657, + "step": 10276 + }, + { + "epoch": 1.29, + "grad_norm": 256.61065673828125, + "learning_rate": 1.1401916077479815e-05, + "loss": 0.7833, + "step": 10277 + }, + { + "epoch": 1.29, + "grad_norm": 23.040742874145508, + "learning_rate": 1.1401079362423129e-05, + "loss": 1.6952, + "step": 10278 + }, + { + "epoch": 1.29, + "grad_norm": 45.938045501708984, + "learning_rate": 1.1400242647366439e-05, + "loss": 1.9528, + "step": 10279 + }, + { + "epoch": 1.29, + "grad_norm": 19.950336456298828, + "learning_rate": 1.1399405932309753e-05, + "loss": 1.8527, + "step": 10280 + }, + { + "epoch": 1.29, + "grad_norm": 31.180498123168945, + "learning_rate": 1.1398569217253066e-05, + "loss": 3.2534, + "step": 10281 + }, + { + "epoch": 1.29, + "grad_norm": 16.345396041870117, + "learning_rate": 1.1397732502196377e-05, + "loss": 1.582, + "step": 10282 + }, + { + "epoch": 1.29, + "grad_norm": 11.217645645141602, + "learning_rate": 1.139689578713969e-05, + "loss": 0.863, + "step": 10283 + }, + { + "epoch": 1.29, + "grad_norm": 22.57853126525879, + "learning_rate": 1.1396059072083004e-05, + "loss": 1.0994, + "step": 10284 + }, + { + "epoch": 1.29, + "grad_norm": 16.58045768737793, + "learning_rate": 1.1395222357026316e-05, + "loss": 1.5042, + "step": 10285 + }, + { + "epoch": 1.29, + "grad_norm": 13.019248962402344, + "learning_rate": 1.1394385641969628e-05, + "loss": 1.068, + "step": 10286 + }, + { + "epoch": 1.29, + "grad_norm": 66.55805969238281, + "learning_rate": 1.1393548926912942e-05, + "loss": 2.7568, + "step": 10287 + }, + { + "epoch": 1.29, + "grad_norm": 3.6205310821533203, + "learning_rate": 1.1392712211856253e-05, + "loss": 0.3069, + "step": 10288 + }, + { + "epoch": 1.29, + "grad_norm": 7.0787482261657715, + "learning_rate": 1.1391875496799565e-05, + "loss": 0.6211, + "step": 10289 + }, + { + "epoch": 1.29, + "grad_norm": 22.246379852294922, + "learning_rate": 1.1391038781742877e-05, + "loss": 1.4295, + "step": 10290 + }, + { + "epoch": 1.29, + "grad_norm": 36.28958511352539, + "learning_rate": 1.1390202066686191e-05, + "loss": 2.3918, + "step": 10291 + }, + { + "epoch": 1.29, + "grad_norm": 7.485498905181885, + "learning_rate": 1.1389365351629505e-05, + "loss": 0.6775, + "step": 10292 + }, + { + "epoch": 1.29, + "grad_norm": 15.13989543914795, + "learning_rate": 1.1388528636572815e-05, + "loss": 0.5463, + "step": 10293 + }, + { + "epoch": 1.29, + "grad_norm": 7.350567817687988, + "learning_rate": 1.1387691921516129e-05, + "loss": 1.4391, + "step": 10294 + }, + { + "epoch": 1.29, + "grad_norm": 5.292941570281982, + "learning_rate": 1.1386855206459442e-05, + "loss": 0.5071, + "step": 10295 + }, + { + "epoch": 1.29, + "grad_norm": 14.215531349182129, + "learning_rate": 1.1386018491402752e-05, + "loss": 1.0031, + "step": 10296 + }, + { + "epoch": 1.29, + "grad_norm": 24.693675994873047, + "learning_rate": 1.1385181776346066e-05, + "loss": 2.4584, + "step": 10297 + }, + { + "epoch": 1.29, + "grad_norm": 5.77023983001709, + "learning_rate": 1.138434506128938e-05, + "loss": 0.4245, + "step": 10298 + }, + { + "epoch": 1.29, + "grad_norm": 18.520322799682617, + "learning_rate": 1.1383508346232692e-05, + "loss": 1.4537, + "step": 10299 + }, + { + "epoch": 1.29, + "grad_norm": 35.155845642089844, + "learning_rate": 1.1382671631176004e-05, + "loss": 1.9235, + "step": 10300 + }, + { + "epoch": 1.29, + "grad_norm": 6.172835350036621, + "learning_rate": 1.1381834916119316e-05, + "loss": 0.4536, + "step": 10301 + }, + { + "epoch": 1.29, + "grad_norm": 8.883145332336426, + "learning_rate": 1.138099820106263e-05, + "loss": 1.2697, + "step": 10302 + }, + { + "epoch": 1.29, + "grad_norm": 11.846651077270508, + "learning_rate": 1.1380161486005941e-05, + "loss": 0.9475, + "step": 10303 + }, + { + "epoch": 1.29, + "grad_norm": 13.66583251953125, + "learning_rate": 1.1379324770949253e-05, + "loss": 2.1352, + "step": 10304 + }, + { + "epoch": 1.29, + "grad_norm": 88.45223999023438, + "learning_rate": 1.1378488055892567e-05, + "loss": 3.0673, + "step": 10305 + }, + { + "epoch": 1.29, + "grad_norm": 9.919356346130371, + "learning_rate": 1.137765134083588e-05, + "loss": 0.8511, + "step": 10306 + }, + { + "epoch": 1.29, + "grad_norm": 34.995784759521484, + "learning_rate": 1.137681462577919e-05, + "loss": 1.4538, + "step": 10307 + }, + { + "epoch": 1.29, + "grad_norm": 58.23386001586914, + "learning_rate": 1.1375977910722504e-05, + "loss": 2.2188, + "step": 10308 + }, + { + "epoch": 1.29, + "grad_norm": 5.6721272468566895, + "learning_rate": 1.1375141195665818e-05, + "loss": 0.5302, + "step": 10309 + }, + { + "epoch": 1.29, + "grad_norm": 6.369467258453369, + "learning_rate": 1.1374304480609128e-05, + "loss": 1.5864, + "step": 10310 + }, + { + "epoch": 1.29, + "grad_norm": 10.525195121765137, + "learning_rate": 1.1373467765552442e-05, + "loss": 0.5, + "step": 10311 + }, + { + "epoch": 1.29, + "grad_norm": 9.663617134094238, + "learning_rate": 1.1372631050495756e-05, + "loss": 1.0208, + "step": 10312 + }, + { + "epoch": 1.29, + "grad_norm": 10.464286804199219, + "learning_rate": 1.1371794335439068e-05, + "loss": 1.6635, + "step": 10313 + }, + { + "epoch": 1.29, + "grad_norm": 29.7788143157959, + "learning_rate": 1.137095762038238e-05, + "loss": 0.7489, + "step": 10314 + }, + { + "epoch": 1.29, + "grad_norm": 27.088542938232422, + "learning_rate": 1.1370120905325691e-05, + "loss": 1.816, + "step": 10315 + }, + { + "epoch": 1.29, + "grad_norm": 13.230093002319336, + "learning_rate": 1.1369284190269005e-05, + "loss": 1.5894, + "step": 10316 + }, + { + "epoch": 1.29, + "grad_norm": 11.129487991333008, + "learning_rate": 1.1368447475212317e-05, + "loss": 0.9775, + "step": 10317 + }, + { + "epoch": 1.29, + "grad_norm": 10.884594917297363, + "learning_rate": 1.1367610760155629e-05, + "loss": 1.1296, + "step": 10318 + }, + { + "epoch": 1.3, + "grad_norm": 32.00385665893555, + "learning_rate": 1.1366774045098943e-05, + "loss": 2.2619, + "step": 10319 + }, + { + "epoch": 1.3, + "grad_norm": 79.75541687011719, + "learning_rate": 1.1365937330042256e-05, + "loss": 1.1296, + "step": 10320 + }, + { + "epoch": 1.3, + "grad_norm": 25.60192108154297, + "learning_rate": 1.1365100614985567e-05, + "loss": 2.0423, + "step": 10321 + }, + { + "epoch": 1.3, + "grad_norm": 9.8787202835083, + "learning_rate": 1.136426389992888e-05, + "loss": 1.4763, + "step": 10322 + }, + { + "epoch": 1.3, + "grad_norm": 13.253055572509766, + "learning_rate": 1.1363427184872194e-05, + "loss": 1.5555, + "step": 10323 + }, + { + "epoch": 1.3, + "grad_norm": 16.878116607666016, + "learning_rate": 1.1362590469815504e-05, + "loss": 1.4943, + "step": 10324 + }, + { + "epoch": 1.3, + "grad_norm": 19.132755279541016, + "learning_rate": 1.1361753754758818e-05, + "loss": 1.8916, + "step": 10325 + }, + { + "epoch": 1.3, + "grad_norm": 7.025834083557129, + "learning_rate": 1.1360917039702131e-05, + "loss": 0.3252, + "step": 10326 + }, + { + "epoch": 1.3, + "grad_norm": 27.166078567504883, + "learning_rate": 1.1360080324645443e-05, + "loss": 2.4286, + "step": 10327 + }, + { + "epoch": 1.3, + "grad_norm": 11.396801948547363, + "learning_rate": 1.1359243609588755e-05, + "loss": 1.6724, + "step": 10328 + }, + { + "epoch": 1.3, + "grad_norm": 7.133232116699219, + "learning_rate": 1.1358406894532067e-05, + "loss": 0.3781, + "step": 10329 + }, + { + "epoch": 1.3, + "grad_norm": 18.406600952148438, + "learning_rate": 1.1357570179475381e-05, + "loss": 2.059, + "step": 10330 + }, + { + "epoch": 1.3, + "grad_norm": 37.831607818603516, + "learning_rate": 1.1356733464418693e-05, + "loss": 1.8232, + "step": 10331 + }, + { + "epoch": 1.3, + "grad_norm": 19.915050506591797, + "learning_rate": 1.1355896749362005e-05, + "loss": 1.4515, + "step": 10332 + }, + { + "epoch": 1.3, + "grad_norm": 21.95366668701172, + "learning_rate": 1.1355060034305319e-05, + "loss": 1.7563, + "step": 10333 + }, + { + "epoch": 1.3, + "grad_norm": 12.465929985046387, + "learning_rate": 1.1354223319248632e-05, + "loss": 0.2971, + "step": 10334 + }, + { + "epoch": 1.3, + "grad_norm": 5.080347537994385, + "learning_rate": 1.1353386604191942e-05, + "loss": 0.3639, + "step": 10335 + }, + { + "epoch": 1.3, + "grad_norm": 3.8196351528167725, + "learning_rate": 1.1352549889135256e-05, + "loss": 0.4015, + "step": 10336 + }, + { + "epoch": 1.3, + "grad_norm": 7.730131149291992, + "learning_rate": 1.135171317407857e-05, + "loss": 0.5276, + "step": 10337 + }, + { + "epoch": 1.3, + "grad_norm": 30.008684158325195, + "learning_rate": 1.135087645902188e-05, + "loss": 2.2609, + "step": 10338 + }, + { + "epoch": 1.3, + "grad_norm": 9.635030746459961, + "learning_rate": 1.1350039743965194e-05, + "loss": 0.962, + "step": 10339 + }, + { + "epoch": 1.3, + "grad_norm": 5.797532558441162, + "learning_rate": 1.1349203028908506e-05, + "loss": 0.7564, + "step": 10340 + }, + { + "epoch": 1.3, + "grad_norm": 8.949183464050293, + "learning_rate": 1.134836631385182e-05, + "loss": 0.9836, + "step": 10341 + }, + { + "epoch": 1.3, + "grad_norm": 4.199320316314697, + "learning_rate": 1.1347529598795131e-05, + "loss": 1.0451, + "step": 10342 + }, + { + "epoch": 1.3, + "grad_norm": 38.582027435302734, + "learning_rate": 1.1346692883738443e-05, + "loss": 2.0152, + "step": 10343 + }, + { + "epoch": 1.3, + "grad_norm": 19.874881744384766, + "learning_rate": 1.1345856168681757e-05, + "loss": 1.53, + "step": 10344 + }, + { + "epoch": 1.3, + "grad_norm": 11.34642219543457, + "learning_rate": 1.1345019453625069e-05, + "loss": 1.1512, + "step": 10345 + }, + { + "epoch": 1.3, + "grad_norm": 22.101593017578125, + "learning_rate": 1.134418273856838e-05, + "loss": 0.8094, + "step": 10346 + }, + { + "epoch": 1.3, + "grad_norm": 13.938207626342773, + "learning_rate": 1.1343346023511694e-05, + "loss": 0.6246, + "step": 10347 + }, + { + "epoch": 1.3, + "grad_norm": 10.098628044128418, + "learning_rate": 1.1342509308455008e-05, + "loss": 0.4555, + "step": 10348 + }, + { + "epoch": 1.3, + "grad_norm": 16.11585807800293, + "learning_rate": 1.1341672593398318e-05, + "loss": 2.9272, + "step": 10349 + }, + { + "epoch": 1.3, + "grad_norm": 15.630293846130371, + "learning_rate": 1.1340835878341632e-05, + "loss": 1.1305, + "step": 10350 + }, + { + "epoch": 1.3, + "grad_norm": 17.844301223754883, + "learning_rate": 1.1339999163284946e-05, + "loss": 1.4772, + "step": 10351 + }, + { + "epoch": 1.3, + "grad_norm": 8.894214630126953, + "learning_rate": 1.1339162448228256e-05, + "loss": 0.5381, + "step": 10352 + }, + { + "epoch": 1.3, + "grad_norm": 12.17094898223877, + "learning_rate": 1.133832573317157e-05, + "loss": 0.7044, + "step": 10353 + }, + { + "epoch": 1.3, + "grad_norm": 50.2935905456543, + "learning_rate": 1.1337489018114881e-05, + "loss": 3.1994, + "step": 10354 + }, + { + "epoch": 1.3, + "grad_norm": 106.10897064208984, + "learning_rate": 1.1336652303058195e-05, + "loss": 1.2581, + "step": 10355 + }, + { + "epoch": 1.3, + "grad_norm": 29.674652099609375, + "learning_rate": 1.1335815588001507e-05, + "loss": 2.473, + "step": 10356 + }, + { + "epoch": 1.3, + "grad_norm": 16.675195693969727, + "learning_rate": 1.1334978872944819e-05, + "loss": 3.5494, + "step": 10357 + }, + { + "epoch": 1.3, + "grad_norm": 14.187880516052246, + "learning_rate": 1.1334142157888133e-05, + "loss": 1.0302, + "step": 10358 + }, + { + "epoch": 1.3, + "grad_norm": 16.538631439208984, + "learning_rate": 1.1333305442831443e-05, + "loss": 3.3621, + "step": 10359 + }, + { + "epoch": 1.3, + "grad_norm": 10.103822708129883, + "learning_rate": 1.1332468727774757e-05, + "loss": 2.0432, + "step": 10360 + }, + { + "epoch": 1.3, + "grad_norm": 74.78076171875, + "learning_rate": 1.133163201271807e-05, + "loss": 1.4483, + "step": 10361 + }, + { + "epoch": 1.3, + "grad_norm": 19.359525680541992, + "learning_rate": 1.1330795297661384e-05, + "loss": 2.5388, + "step": 10362 + }, + { + "epoch": 1.3, + "grad_norm": 12.176506996154785, + "learning_rate": 1.1329958582604694e-05, + "loss": 1.2399, + "step": 10363 + }, + { + "epoch": 1.3, + "grad_norm": 16.648141860961914, + "learning_rate": 1.1329121867548008e-05, + "loss": 0.4769, + "step": 10364 + }, + { + "epoch": 1.3, + "grad_norm": 4.545150279998779, + "learning_rate": 1.1328285152491321e-05, + "loss": 0.4344, + "step": 10365 + }, + { + "epoch": 1.3, + "grad_norm": 7.920763969421387, + "learning_rate": 1.1327448437434632e-05, + "loss": 1.3722, + "step": 10366 + }, + { + "epoch": 1.3, + "grad_norm": 8.198457717895508, + "learning_rate": 1.1326611722377945e-05, + "loss": 0.9668, + "step": 10367 + }, + { + "epoch": 1.3, + "grad_norm": 5.3731913566589355, + "learning_rate": 1.1325775007321257e-05, + "loss": 0.2896, + "step": 10368 + }, + { + "epoch": 1.3, + "grad_norm": 11.775003433227539, + "learning_rate": 1.1324938292264571e-05, + "loss": 2.7216, + "step": 10369 + }, + { + "epoch": 1.3, + "grad_norm": 11.60084342956543, + "learning_rate": 1.1324101577207883e-05, + "loss": 1.4263, + "step": 10370 + }, + { + "epoch": 1.3, + "grad_norm": 6.094202041625977, + "learning_rate": 1.1323264862151195e-05, + "loss": 0.6605, + "step": 10371 + }, + { + "epoch": 1.3, + "grad_norm": 6.6341471672058105, + "learning_rate": 1.1322428147094508e-05, + "loss": 0.8613, + "step": 10372 + }, + { + "epoch": 1.3, + "grad_norm": 11.536539077758789, + "learning_rate": 1.1321591432037819e-05, + "loss": 2.1967, + "step": 10373 + }, + { + "epoch": 1.3, + "grad_norm": 17.777511596679688, + "learning_rate": 1.1320754716981132e-05, + "loss": 1.1906, + "step": 10374 + }, + { + "epoch": 1.3, + "grad_norm": 18.944923400878906, + "learning_rate": 1.1319918001924446e-05, + "loss": 1.1539, + "step": 10375 + }, + { + "epoch": 1.3, + "grad_norm": 10.789403915405273, + "learning_rate": 1.131908128686776e-05, + "loss": 2.2528, + "step": 10376 + }, + { + "epoch": 1.3, + "grad_norm": 11.346477508544922, + "learning_rate": 1.131824457181107e-05, + "loss": 2.3022, + "step": 10377 + }, + { + "epoch": 1.3, + "grad_norm": 9.01475715637207, + "learning_rate": 1.1317407856754384e-05, + "loss": 1.7589, + "step": 10378 + }, + { + "epoch": 1.3, + "grad_norm": 12.307167053222656, + "learning_rate": 1.1316571141697697e-05, + "loss": 2.5049, + "step": 10379 + }, + { + "epoch": 1.3, + "grad_norm": 6.579891681671143, + "learning_rate": 1.1315734426641008e-05, + "loss": 0.5182, + "step": 10380 + }, + { + "epoch": 1.3, + "grad_norm": 16.720767974853516, + "learning_rate": 1.1314897711584321e-05, + "loss": 1.0419, + "step": 10381 + }, + { + "epoch": 1.3, + "grad_norm": 28.597412109375, + "learning_rate": 1.1314060996527633e-05, + "loss": 1.2098, + "step": 10382 + }, + { + "epoch": 1.3, + "grad_norm": 16.08039665222168, + "learning_rate": 1.1313224281470947e-05, + "loss": 2.1011, + "step": 10383 + }, + { + "epoch": 1.3, + "grad_norm": 9.145201683044434, + "learning_rate": 1.1312387566414259e-05, + "loss": 0.54, + "step": 10384 + }, + { + "epoch": 1.3, + "grad_norm": 10.521228790283203, + "learning_rate": 1.131155085135757e-05, + "loss": 0.9657, + "step": 10385 + }, + { + "epoch": 1.3, + "grad_norm": 12.017659187316895, + "learning_rate": 1.1310714136300884e-05, + "loss": 0.163, + "step": 10386 + }, + { + "epoch": 1.3, + "grad_norm": 16.294795989990234, + "learning_rate": 1.1309877421244195e-05, + "loss": 0.9599, + "step": 10387 + }, + { + "epoch": 1.3, + "grad_norm": 9.123615264892578, + "learning_rate": 1.1309040706187508e-05, + "loss": 0.214, + "step": 10388 + }, + { + "epoch": 1.3, + "grad_norm": 21.298059463500977, + "learning_rate": 1.1308203991130822e-05, + "loss": 1.9316, + "step": 10389 + }, + { + "epoch": 1.3, + "grad_norm": 15.311811447143555, + "learning_rate": 1.1307367276074136e-05, + "loss": 1.2914, + "step": 10390 + }, + { + "epoch": 1.3, + "grad_norm": 19.213844299316406, + "learning_rate": 1.1306530561017446e-05, + "loss": 1.6291, + "step": 10391 + }, + { + "epoch": 1.3, + "grad_norm": 12.955446243286133, + "learning_rate": 1.130569384596076e-05, + "loss": 1.1187, + "step": 10392 + }, + { + "epoch": 1.3, + "grad_norm": 12.431927680969238, + "learning_rate": 1.1304857130904071e-05, + "loss": 0.8056, + "step": 10393 + }, + { + "epoch": 1.3, + "grad_norm": 18.048500061035156, + "learning_rate": 1.1304020415847383e-05, + "loss": 1.8015, + "step": 10394 + }, + { + "epoch": 1.3, + "grad_norm": 52.673152923583984, + "learning_rate": 1.1303183700790697e-05, + "loss": 2.2139, + "step": 10395 + }, + { + "epoch": 1.3, + "grad_norm": 28.4801082611084, + "learning_rate": 1.1302346985734009e-05, + "loss": 2.603, + "step": 10396 + }, + { + "epoch": 1.3, + "grad_norm": 13.845949172973633, + "learning_rate": 1.1301510270677323e-05, + "loss": 0.7731, + "step": 10397 + }, + { + "epoch": 1.3, + "grad_norm": 19.7300968170166, + "learning_rate": 1.1300673555620635e-05, + "loss": 2.4962, + "step": 10398 + }, + { + "epoch": 1.31, + "grad_norm": 18.611753463745117, + "learning_rate": 1.1299836840563947e-05, + "loss": 2.0268, + "step": 10399 + }, + { + "epoch": 1.31, + "grad_norm": 13.834794998168945, + "learning_rate": 1.129900012550726e-05, + "loss": 0.6129, + "step": 10400 + }, + { + "epoch": 1.31, + "eval_loss": 0.08567207306623459, + "eval_runtime": 97.5514, + "eval_samples_per_second": 36.309, + "eval_steps_per_second": 36.309, + "step": 10400 + }, + { + "epoch": 1.31, + "grad_norm": 10.377630233764648, + "learning_rate": 1.129816341045057e-05, + "loss": 2.2945, + "step": 10401 + }, + { + "epoch": 1.31, + "grad_norm": 14.499702453613281, + "learning_rate": 1.1297326695393884e-05, + "loss": 1.7374, + "step": 10402 + }, + { + "epoch": 1.31, + "grad_norm": 6.5097575187683105, + "learning_rate": 1.1296489980337198e-05, + "loss": 0.7856, + "step": 10403 + }, + { + "epoch": 1.31, + "grad_norm": 7.979915142059326, + "learning_rate": 1.1295653265280511e-05, + "loss": 1.0574, + "step": 10404 + }, + { + "epoch": 1.31, + "grad_norm": 10.252052307128906, + "learning_rate": 1.1294816550223822e-05, + "loss": 1.1272, + "step": 10405 + }, + { + "epoch": 1.31, + "grad_norm": 8.146780967712402, + "learning_rate": 1.1293979835167135e-05, + "loss": 1.1183, + "step": 10406 + }, + { + "epoch": 1.31, + "grad_norm": 38.974308013916016, + "learning_rate": 1.1293143120110447e-05, + "loss": 3.3828, + "step": 10407 + }, + { + "epoch": 1.31, + "grad_norm": 20.843727111816406, + "learning_rate": 1.129230640505376e-05, + "loss": 2.2591, + "step": 10408 + }, + { + "epoch": 1.31, + "grad_norm": 36.327911376953125, + "learning_rate": 1.1291469689997073e-05, + "loss": 2.9167, + "step": 10409 + }, + { + "epoch": 1.31, + "grad_norm": 5.336467742919922, + "learning_rate": 1.1290632974940385e-05, + "loss": 0.3331, + "step": 10410 + }, + { + "epoch": 1.31, + "grad_norm": 17.777746200561523, + "learning_rate": 1.1289796259883698e-05, + "loss": 1.5454, + "step": 10411 + }, + { + "epoch": 1.31, + "grad_norm": 6.859896659851074, + "learning_rate": 1.1288959544827009e-05, + "loss": 0.3444, + "step": 10412 + }, + { + "epoch": 1.31, + "grad_norm": 8.759364128112793, + "learning_rate": 1.1288122829770322e-05, + "loss": 0.8933, + "step": 10413 + }, + { + "epoch": 1.31, + "grad_norm": 7.504409313201904, + "learning_rate": 1.1287286114713636e-05, + "loss": 0.4092, + "step": 10414 + }, + { + "epoch": 1.31, + "grad_norm": 20.213590621948242, + "learning_rate": 1.1286449399656946e-05, + "loss": 1.075, + "step": 10415 + }, + { + "epoch": 1.31, + "grad_norm": 15.260299682617188, + "learning_rate": 1.128561268460026e-05, + "loss": 1.079, + "step": 10416 + }, + { + "epoch": 1.31, + "grad_norm": 29.809040069580078, + "learning_rate": 1.1284775969543574e-05, + "loss": 1.168, + "step": 10417 + }, + { + "epoch": 1.31, + "grad_norm": 15.28903865814209, + "learning_rate": 1.1283939254486887e-05, + "loss": 1.3027, + "step": 10418 + }, + { + "epoch": 1.31, + "grad_norm": 6.6696858406066895, + "learning_rate": 1.1283102539430197e-05, + "loss": 1.1304, + "step": 10419 + }, + { + "epoch": 1.31, + "grad_norm": 8.441073417663574, + "learning_rate": 1.1282265824373511e-05, + "loss": 1.3255, + "step": 10420 + }, + { + "epoch": 1.31, + "grad_norm": 3.9966790676116943, + "learning_rate": 1.1281429109316823e-05, + "loss": 0.7552, + "step": 10421 + }, + { + "epoch": 1.31, + "grad_norm": 8.323063850402832, + "learning_rate": 1.1280592394260135e-05, + "loss": 1.3449, + "step": 10422 + }, + { + "epoch": 1.31, + "grad_norm": 5.556552886962891, + "learning_rate": 1.1279755679203449e-05, + "loss": 0.4441, + "step": 10423 + }, + { + "epoch": 1.31, + "grad_norm": 7.900179386138916, + "learning_rate": 1.127891896414676e-05, + "loss": 0.2652, + "step": 10424 + }, + { + "epoch": 1.31, + "grad_norm": 21.98341941833496, + "learning_rate": 1.1278082249090074e-05, + "loss": 2.6199, + "step": 10425 + }, + { + "epoch": 1.31, + "grad_norm": 38.724266052246094, + "learning_rate": 1.1277245534033385e-05, + "loss": 1.2014, + "step": 10426 + }, + { + "epoch": 1.31, + "grad_norm": 4.7703938484191895, + "learning_rate": 1.1276408818976698e-05, + "loss": 0.4701, + "step": 10427 + }, + { + "epoch": 1.31, + "grad_norm": 10.077507972717285, + "learning_rate": 1.1275572103920012e-05, + "loss": 1.7238, + "step": 10428 + }, + { + "epoch": 1.31, + "grad_norm": 15.9498929977417, + "learning_rate": 1.1274735388863322e-05, + "loss": 2.0018, + "step": 10429 + }, + { + "epoch": 1.31, + "grad_norm": 6.577724933624268, + "learning_rate": 1.1273898673806636e-05, + "loss": 0.5287, + "step": 10430 + }, + { + "epoch": 1.31, + "grad_norm": 14.119277000427246, + "learning_rate": 1.127306195874995e-05, + "loss": 1.7477, + "step": 10431 + }, + { + "epoch": 1.31, + "grad_norm": 78.08936309814453, + "learning_rate": 1.1272225243693263e-05, + "loss": 1.2009, + "step": 10432 + }, + { + "epoch": 1.31, + "grad_norm": 5.330587387084961, + "learning_rate": 1.1271388528636573e-05, + "loss": 0.5086, + "step": 10433 + }, + { + "epoch": 1.31, + "grad_norm": 11.60766887664795, + "learning_rate": 1.1270551813579887e-05, + "loss": 0.2803, + "step": 10434 + }, + { + "epoch": 1.31, + "grad_norm": 22.309850692749023, + "learning_rate": 1.1269715098523199e-05, + "loss": 1.0715, + "step": 10435 + }, + { + "epoch": 1.31, + "grad_norm": 11.775199890136719, + "learning_rate": 1.1268878383466511e-05, + "loss": 0.8933, + "step": 10436 + }, + { + "epoch": 1.31, + "grad_norm": 22.4728946685791, + "learning_rate": 1.1268041668409825e-05, + "loss": 2.7902, + "step": 10437 + }, + { + "epoch": 1.31, + "grad_norm": 11.569796562194824, + "learning_rate": 1.1267204953353136e-05, + "loss": 1.4144, + "step": 10438 + }, + { + "epoch": 1.31, + "grad_norm": 20.00957489013672, + "learning_rate": 1.126636823829645e-05, + "loss": 1.4074, + "step": 10439 + }, + { + "epoch": 1.31, + "grad_norm": 8.141829490661621, + "learning_rate": 1.126553152323976e-05, + "loss": 0.4434, + "step": 10440 + }, + { + "epoch": 1.31, + "grad_norm": 18.492023468017578, + "learning_rate": 1.1264694808183074e-05, + "loss": 1.4251, + "step": 10441 + }, + { + "epoch": 1.31, + "grad_norm": 44.78361511230469, + "learning_rate": 1.1263858093126388e-05, + "loss": 3.159, + "step": 10442 + }, + { + "epoch": 1.31, + "grad_norm": 4.946948051452637, + "learning_rate": 1.1263021378069698e-05, + "loss": 0.1349, + "step": 10443 + }, + { + "epoch": 1.31, + "grad_norm": 24.42609977722168, + "learning_rate": 1.1262184663013012e-05, + "loss": 2.367, + "step": 10444 + }, + { + "epoch": 1.31, + "grad_norm": 12.066911697387695, + "learning_rate": 1.1261347947956325e-05, + "loss": 1.1732, + "step": 10445 + }, + { + "epoch": 1.31, + "grad_norm": 21.528846740722656, + "learning_rate": 1.1260511232899637e-05, + "loss": 0.433, + "step": 10446 + }, + { + "epoch": 1.31, + "grad_norm": 20.1120662689209, + "learning_rate": 1.1259674517842949e-05, + "loss": 0.6791, + "step": 10447 + }, + { + "epoch": 1.31, + "grad_norm": 12.262086868286133, + "learning_rate": 1.1258837802786263e-05, + "loss": 1.2916, + "step": 10448 + }, + { + "epoch": 1.31, + "grad_norm": 22.04985809326172, + "learning_rate": 1.1258001087729575e-05, + "loss": 1.5374, + "step": 10449 + }, + { + "epoch": 1.31, + "grad_norm": 7.273778915405273, + "learning_rate": 1.1257164372672887e-05, + "loss": 2.0661, + "step": 10450 + }, + { + "epoch": 1.31, + "grad_norm": 26.157926559448242, + "learning_rate": 1.12563276576162e-05, + "loss": 0.8122, + "step": 10451 + }, + { + "epoch": 1.31, + "grad_norm": 3.5167133808135986, + "learning_rate": 1.1255490942559512e-05, + "loss": 0.2834, + "step": 10452 + }, + { + "epoch": 1.31, + "grad_norm": 26.909486770629883, + "learning_rate": 1.1254654227502826e-05, + "loss": 2.0141, + "step": 10453 + }, + { + "epoch": 1.31, + "grad_norm": 15.352017402648926, + "learning_rate": 1.1253817512446136e-05, + "loss": 2.5512, + "step": 10454 + }, + { + "epoch": 1.31, + "grad_norm": 13.897164344787598, + "learning_rate": 1.125298079738945e-05, + "loss": 1.4139, + "step": 10455 + }, + { + "epoch": 1.31, + "grad_norm": 49.78348159790039, + "learning_rate": 1.1252144082332764e-05, + "loss": 2.1975, + "step": 10456 + }, + { + "epoch": 1.31, + "grad_norm": 38.473941802978516, + "learning_rate": 1.1251307367276074e-05, + "loss": 3.2748, + "step": 10457 + }, + { + "epoch": 1.31, + "grad_norm": 11.407572746276855, + "learning_rate": 1.1250470652219387e-05, + "loss": 0.8564, + "step": 10458 + }, + { + "epoch": 1.31, + "grad_norm": 35.14699172973633, + "learning_rate": 1.1249633937162701e-05, + "loss": 1.3608, + "step": 10459 + }, + { + "epoch": 1.31, + "grad_norm": 24.109121322631836, + "learning_rate": 1.1248797222106013e-05, + "loss": 1.8741, + "step": 10460 + }, + { + "epoch": 1.31, + "grad_norm": 47.487579345703125, + "learning_rate": 1.1247960507049325e-05, + "loss": 1.6083, + "step": 10461 + }, + { + "epoch": 1.31, + "grad_norm": 7.928770542144775, + "learning_rate": 1.1247123791992639e-05, + "loss": 0.6158, + "step": 10462 + }, + { + "epoch": 1.31, + "grad_norm": 21.42852783203125, + "learning_rate": 1.124628707693595e-05, + "loss": 0.7657, + "step": 10463 + }, + { + "epoch": 1.31, + "grad_norm": 15.748943328857422, + "learning_rate": 1.1245450361879263e-05, + "loss": 1.8107, + "step": 10464 + }, + { + "epoch": 1.31, + "grad_norm": 100.10545349121094, + "learning_rate": 1.1244613646822574e-05, + "loss": 2.828, + "step": 10465 + }, + { + "epoch": 1.31, + "grad_norm": 17.006912231445312, + "learning_rate": 1.1243776931765888e-05, + "loss": 1.855, + "step": 10466 + }, + { + "epoch": 1.31, + "grad_norm": 18.44497299194336, + "learning_rate": 1.1242940216709202e-05, + "loss": 2.5444, + "step": 10467 + }, + { + "epoch": 1.31, + "grad_norm": 12.248494148254395, + "learning_rate": 1.1242103501652512e-05, + "loss": 1.3102, + "step": 10468 + }, + { + "epoch": 1.31, + "grad_norm": 9.847447395324707, + "learning_rate": 1.1241266786595826e-05, + "loss": 0.9991, + "step": 10469 + }, + { + "epoch": 1.31, + "grad_norm": 16.546358108520508, + "learning_rate": 1.124043007153914e-05, + "loss": 1.3043, + "step": 10470 + }, + { + "epoch": 1.31, + "grad_norm": 15.240071296691895, + "learning_rate": 1.123959335648245e-05, + "loss": 2.4128, + "step": 10471 + }, + { + "epoch": 1.31, + "grad_norm": 8.652076721191406, + "learning_rate": 1.1238756641425763e-05, + "loss": 0.7773, + "step": 10472 + }, + { + "epoch": 1.31, + "grad_norm": 16.13754653930664, + "learning_rate": 1.1237919926369077e-05, + "loss": 1.7918, + "step": 10473 + }, + { + "epoch": 1.31, + "grad_norm": 12.626401901245117, + "learning_rate": 1.1237083211312389e-05, + "loss": 1.8125, + "step": 10474 + }, + { + "epoch": 1.31, + "grad_norm": 14.035736083984375, + "learning_rate": 1.12362464962557e-05, + "loss": 0.7317, + "step": 10475 + }, + { + "epoch": 1.31, + "grad_norm": 21.7222957611084, + "learning_rate": 1.1235409781199014e-05, + "loss": 2.3465, + "step": 10476 + }, + { + "epoch": 1.31, + "grad_norm": 8.77615737915039, + "learning_rate": 1.1234573066142326e-05, + "loss": 0.7189, + "step": 10477 + }, + { + "epoch": 1.31, + "grad_norm": 24.431882858276367, + "learning_rate": 1.1233736351085638e-05, + "loss": 1.5463, + "step": 10478 + }, + { + "epoch": 1.32, + "grad_norm": 19.643756866455078, + "learning_rate": 1.123289963602895e-05, + "loss": 1.9038, + "step": 10479 + }, + { + "epoch": 1.32, + "grad_norm": 10.335158348083496, + "learning_rate": 1.1232062920972264e-05, + "loss": 0.5165, + "step": 10480 + }, + { + "epoch": 1.32, + "grad_norm": 31.67599105834961, + "learning_rate": 1.1231226205915578e-05, + "loss": 1.2921, + "step": 10481 + }, + { + "epoch": 1.32, + "grad_norm": 20.917774200439453, + "learning_rate": 1.1230389490858888e-05, + "loss": 2.4767, + "step": 10482 + }, + { + "epoch": 1.32, + "grad_norm": 8.009300231933594, + "learning_rate": 1.1229552775802202e-05, + "loss": 0.771, + "step": 10483 + }, + { + "epoch": 1.32, + "grad_norm": 6.347224235534668, + "learning_rate": 1.1228716060745515e-05, + "loss": 1.6127, + "step": 10484 + }, + { + "epoch": 1.32, + "grad_norm": 31.478605270385742, + "learning_rate": 1.1227879345688825e-05, + "loss": 1.0239, + "step": 10485 + }, + { + "epoch": 1.32, + "grad_norm": 11.87148380279541, + "learning_rate": 1.1227042630632139e-05, + "loss": 0.6186, + "step": 10486 + }, + { + "epoch": 1.32, + "grad_norm": 135.9146270751953, + "learning_rate": 1.1226205915575453e-05, + "loss": 0.802, + "step": 10487 + }, + { + "epoch": 1.32, + "grad_norm": 20.080284118652344, + "learning_rate": 1.1225369200518765e-05, + "loss": 1.8482, + "step": 10488 + }, + { + "epoch": 1.32, + "grad_norm": 47.902992248535156, + "learning_rate": 1.1224532485462077e-05, + "loss": 3.7796, + "step": 10489 + }, + { + "epoch": 1.32, + "grad_norm": 24.935144424438477, + "learning_rate": 1.122369577040539e-05, + "loss": 2.8729, + "step": 10490 + }, + { + "epoch": 1.32, + "grad_norm": 19.10072898864746, + "learning_rate": 1.1222859055348702e-05, + "loss": 1.1724, + "step": 10491 + }, + { + "epoch": 1.32, + "grad_norm": 20.33785057067871, + "learning_rate": 1.1222022340292014e-05, + "loss": 1.2364, + "step": 10492 + }, + { + "epoch": 1.32, + "grad_norm": 7.977625370025635, + "learning_rate": 1.1221185625235326e-05, + "loss": 1.8418, + "step": 10493 + }, + { + "epoch": 1.32, + "grad_norm": 9.695846557617188, + "learning_rate": 1.122034891017864e-05, + "loss": 1.9317, + "step": 10494 + }, + { + "epoch": 1.32, + "grad_norm": 9.307378768920898, + "learning_rate": 1.1219512195121953e-05, + "loss": 1.586, + "step": 10495 + }, + { + "epoch": 1.32, + "grad_norm": 17.317626953125, + "learning_rate": 1.1218675480065264e-05, + "loss": 1.4528, + "step": 10496 + }, + { + "epoch": 1.32, + "grad_norm": 18.528837203979492, + "learning_rate": 1.1217838765008577e-05, + "loss": 1.5133, + "step": 10497 + }, + { + "epoch": 1.32, + "grad_norm": 19.242237091064453, + "learning_rate": 1.1217002049951891e-05, + "loss": 1.4202, + "step": 10498 + }, + { + "epoch": 1.32, + "grad_norm": 7.817399501800537, + "learning_rate": 1.1216165334895201e-05, + "loss": 0.3527, + "step": 10499 + }, + { + "epoch": 1.32, + "grad_norm": 8.068084716796875, + "learning_rate": 1.1215328619838515e-05, + "loss": 0.566, + "step": 10500 + }, + { + "epoch": 1.32, + "grad_norm": 21.281558990478516, + "learning_rate": 1.1214491904781829e-05, + "loss": 1.6802, + "step": 10501 + }, + { + "epoch": 1.32, + "grad_norm": 12.254424095153809, + "learning_rate": 1.121365518972514e-05, + "loss": 0.5506, + "step": 10502 + }, + { + "epoch": 1.32, + "grad_norm": 16.627653121948242, + "learning_rate": 1.1212818474668452e-05, + "loss": 1.5935, + "step": 10503 + }, + { + "epoch": 1.32, + "grad_norm": 5.4081315994262695, + "learning_rate": 1.1211981759611764e-05, + "loss": 0.3894, + "step": 10504 + }, + { + "epoch": 1.32, + "grad_norm": 14.867437362670898, + "learning_rate": 1.1211145044555078e-05, + "loss": 1.5122, + "step": 10505 + }, + { + "epoch": 1.32, + "grad_norm": 46.8331184387207, + "learning_rate": 1.121030832949839e-05, + "loss": 1.1616, + "step": 10506 + }, + { + "epoch": 1.32, + "grad_norm": 8.698062896728516, + "learning_rate": 1.1209471614441702e-05, + "loss": 0.7611, + "step": 10507 + }, + { + "epoch": 1.32, + "grad_norm": 8.20580005645752, + "learning_rate": 1.1208634899385016e-05, + "loss": 0.2818, + "step": 10508 + }, + { + "epoch": 1.32, + "grad_norm": 8.287290573120117, + "learning_rate": 1.120779818432833e-05, + "loss": 0.8224, + "step": 10509 + }, + { + "epoch": 1.32, + "grad_norm": 16.682458877563477, + "learning_rate": 1.120696146927164e-05, + "loss": 1.1507, + "step": 10510 + }, + { + "epoch": 1.32, + "grad_norm": 15.22974967956543, + "learning_rate": 1.1206124754214953e-05, + "loss": 0.662, + "step": 10511 + }, + { + "epoch": 1.32, + "grad_norm": 10.623566627502441, + "learning_rate": 1.1205288039158267e-05, + "loss": 0.5222, + "step": 10512 + }, + { + "epoch": 1.32, + "grad_norm": 9.88694953918457, + "learning_rate": 1.1204451324101577e-05, + "loss": 1.2391, + "step": 10513 + }, + { + "epoch": 1.32, + "grad_norm": 27.594900131225586, + "learning_rate": 1.120361460904489e-05, + "loss": 1.774, + "step": 10514 + }, + { + "epoch": 1.32, + "grad_norm": 15.125885009765625, + "learning_rate": 1.1202777893988204e-05, + "loss": 1.3167, + "step": 10515 + }, + { + "epoch": 1.32, + "grad_norm": 27.385690689086914, + "learning_rate": 1.1201941178931516e-05, + "loss": 2.3734, + "step": 10516 + }, + { + "epoch": 1.32, + "grad_norm": 15.248473167419434, + "learning_rate": 1.1201104463874828e-05, + "loss": 1.4631, + "step": 10517 + }, + { + "epoch": 1.32, + "grad_norm": 13.878178596496582, + "learning_rate": 1.120026774881814e-05, + "loss": 1.1966, + "step": 10518 + }, + { + "epoch": 1.32, + "grad_norm": 13.134208679199219, + "learning_rate": 1.1199431033761454e-05, + "loss": 2.1645, + "step": 10519 + }, + { + "epoch": 1.32, + "grad_norm": 17.41299057006836, + "learning_rate": 1.1198594318704766e-05, + "loss": 1.3558, + "step": 10520 + }, + { + "epoch": 1.32, + "grad_norm": 7.347055912017822, + "learning_rate": 1.1197757603648078e-05, + "loss": 0.6305, + "step": 10521 + }, + { + "epoch": 1.32, + "grad_norm": 4.793478488922119, + "learning_rate": 1.1196920888591391e-05, + "loss": 0.4807, + "step": 10522 + }, + { + "epoch": 1.32, + "grad_norm": 15.79783821105957, + "learning_rate": 1.1196084173534705e-05, + "loss": 1.2236, + "step": 10523 + }, + { + "epoch": 1.32, + "grad_norm": 1.9226435422897339, + "learning_rate": 1.1195247458478015e-05, + "loss": 0.0487, + "step": 10524 + }, + { + "epoch": 1.32, + "grad_norm": 58.216766357421875, + "learning_rate": 1.1194410743421329e-05, + "loss": 2.5017, + "step": 10525 + }, + { + "epoch": 1.32, + "grad_norm": 12.894635200500488, + "learning_rate": 1.1193574028364643e-05, + "loss": 2.6699, + "step": 10526 + }, + { + "epoch": 1.32, + "grad_norm": 60.26153564453125, + "learning_rate": 1.1192737313307953e-05, + "loss": 1.6459, + "step": 10527 + }, + { + "epoch": 1.32, + "grad_norm": 8.879268646240234, + "learning_rate": 1.1191900598251267e-05, + "loss": 1.5969, + "step": 10528 + }, + { + "epoch": 1.32, + "grad_norm": 9.770275115966797, + "learning_rate": 1.119106388319458e-05, + "loss": 1.3921, + "step": 10529 + }, + { + "epoch": 1.32, + "grad_norm": 22.978065490722656, + "learning_rate": 1.119022716813789e-05, + "loss": 1.5411, + "step": 10530 + }, + { + "epoch": 1.32, + "grad_norm": 11.687366485595703, + "learning_rate": 1.1189390453081204e-05, + "loss": 0.6153, + "step": 10531 + }, + { + "epoch": 1.32, + "grad_norm": 6.889460563659668, + "learning_rate": 1.1188553738024516e-05, + "loss": 0.4407, + "step": 10532 + }, + { + "epoch": 1.32, + "grad_norm": 7.504435062408447, + "learning_rate": 1.118771702296783e-05, + "loss": 0.3158, + "step": 10533 + }, + { + "epoch": 1.32, + "grad_norm": 11.856164932250977, + "learning_rate": 1.1186880307911142e-05, + "loss": 0.6373, + "step": 10534 + }, + { + "epoch": 1.32, + "grad_norm": 10.153422355651855, + "learning_rate": 1.1186043592854454e-05, + "loss": 0.352, + "step": 10535 + }, + { + "epoch": 1.32, + "grad_norm": 14.861977577209473, + "learning_rate": 1.1185206877797767e-05, + "loss": 2.5537, + "step": 10536 + }, + { + "epoch": 1.32, + "grad_norm": 11.03333854675293, + "learning_rate": 1.1184370162741078e-05, + "loss": 0.7913, + "step": 10537 + }, + { + "epoch": 1.32, + "grad_norm": 25.498279571533203, + "learning_rate": 1.1183533447684391e-05, + "loss": 1.5244, + "step": 10538 + }, + { + "epoch": 1.32, + "grad_norm": 13.05024528503418, + "learning_rate": 1.1182696732627705e-05, + "loss": 1.8159, + "step": 10539 + }, + { + "epoch": 1.32, + "grad_norm": 17.579965591430664, + "learning_rate": 1.1181860017571019e-05, + "loss": 1.3664, + "step": 10540 + }, + { + "epoch": 1.32, + "grad_norm": 7.962263584136963, + "learning_rate": 1.1181023302514329e-05, + "loss": 1.0635, + "step": 10541 + }, + { + "epoch": 1.32, + "grad_norm": 7.066946506500244, + "learning_rate": 1.1180186587457642e-05, + "loss": 1.743, + "step": 10542 + }, + { + "epoch": 1.32, + "grad_norm": 11.174063682556152, + "learning_rate": 1.1179349872400956e-05, + "loss": 1.1828, + "step": 10543 + }, + { + "epoch": 1.32, + "grad_norm": 33.12421417236328, + "learning_rate": 1.1178513157344266e-05, + "loss": 2.0509, + "step": 10544 + }, + { + "epoch": 1.32, + "grad_norm": 9.112988471984863, + "learning_rate": 1.117767644228758e-05, + "loss": 0.6407, + "step": 10545 + }, + { + "epoch": 1.32, + "grad_norm": 74.02664184570312, + "learning_rate": 1.1176839727230892e-05, + "loss": 1.7778, + "step": 10546 + }, + { + "epoch": 1.32, + "grad_norm": 13.02247428894043, + "learning_rate": 1.1176003012174206e-05, + "loss": 1.609, + "step": 10547 + }, + { + "epoch": 1.32, + "grad_norm": 27.995586395263672, + "learning_rate": 1.1175166297117518e-05, + "loss": 3.5742, + "step": 10548 + }, + { + "epoch": 1.32, + "grad_norm": 5.910315990447998, + "learning_rate": 1.117432958206083e-05, + "loss": 0.4359, + "step": 10549 + }, + { + "epoch": 1.32, + "grad_norm": 22.431060791015625, + "learning_rate": 1.1173492867004143e-05, + "loss": 2.211, + "step": 10550 + }, + { + "epoch": 1.32, + "grad_norm": 13.337568283081055, + "learning_rate": 1.1172656151947453e-05, + "loss": 1.2511, + "step": 10551 + }, + { + "epoch": 1.32, + "grad_norm": 59.052276611328125, + "learning_rate": 1.1171819436890767e-05, + "loss": 1.3315, + "step": 10552 + }, + { + "epoch": 1.32, + "grad_norm": 6.686931133270264, + "learning_rate": 1.117098272183408e-05, + "loss": 0.3952, + "step": 10553 + }, + { + "epoch": 1.32, + "grad_norm": 8.727797508239746, + "learning_rate": 1.1170146006777394e-05, + "loss": 1.8726, + "step": 10554 + }, + { + "epoch": 1.32, + "grad_norm": 16.314666748046875, + "learning_rate": 1.1169309291720705e-05, + "loss": 2.3723, + "step": 10555 + }, + { + "epoch": 1.32, + "grad_norm": 24.2736873626709, + "learning_rate": 1.1168472576664018e-05, + "loss": 1.8258, + "step": 10556 + }, + { + "epoch": 1.32, + "grad_norm": 17.27292251586914, + "learning_rate": 1.116763586160733e-05, + "loss": 2.3978, + "step": 10557 + }, + { + "epoch": 1.33, + "grad_norm": 11.159758567810059, + "learning_rate": 1.1166799146550642e-05, + "loss": 1.688, + "step": 10558 + }, + { + "epoch": 1.33, + "grad_norm": 95.5091781616211, + "learning_rate": 1.1165962431493956e-05, + "loss": 2.191, + "step": 10559 + }, + { + "epoch": 1.33, + "grad_norm": 18.80675506591797, + "learning_rate": 1.1165125716437268e-05, + "loss": 1.5938, + "step": 10560 + }, + { + "epoch": 1.33, + "grad_norm": 6.572784423828125, + "learning_rate": 1.1164289001380581e-05, + "loss": 0.4247, + "step": 10561 + }, + { + "epoch": 1.33, + "grad_norm": 15.58597183227539, + "learning_rate": 1.1163452286323893e-05, + "loss": 1.5509, + "step": 10562 + }, + { + "epoch": 1.33, + "grad_norm": 34.90790557861328, + "learning_rate": 1.1162615571267205e-05, + "loss": 2.4227, + "step": 10563 + }, + { + "epoch": 1.33, + "grad_norm": 13.600082397460938, + "learning_rate": 1.1161778856210519e-05, + "loss": 1.3753, + "step": 10564 + }, + { + "epoch": 1.33, + "grad_norm": 18.139291763305664, + "learning_rate": 1.116094214115383e-05, + "loss": 0.3906, + "step": 10565 + }, + { + "epoch": 1.33, + "grad_norm": 10.340546607971191, + "learning_rate": 1.1160105426097143e-05, + "loss": 1.2416, + "step": 10566 + }, + { + "epoch": 1.33, + "grad_norm": 11.842409133911133, + "learning_rate": 1.1159268711040457e-05, + "loss": 0.9771, + "step": 10567 + }, + { + "epoch": 1.33, + "grad_norm": 11.24379825592041, + "learning_rate": 1.115843199598377e-05, + "loss": 0.9641, + "step": 10568 + }, + { + "epoch": 1.33, + "grad_norm": 13.406096458435059, + "learning_rate": 1.115759528092708e-05, + "loss": 1.8182, + "step": 10569 + }, + { + "epoch": 1.33, + "grad_norm": 25.022401809692383, + "learning_rate": 1.1156758565870394e-05, + "loss": 2.7239, + "step": 10570 + }, + { + "epoch": 1.33, + "grad_norm": 9.70371150970459, + "learning_rate": 1.1155921850813706e-05, + "loss": 1.6387, + "step": 10571 + }, + { + "epoch": 1.33, + "grad_norm": 8.708685874938965, + "learning_rate": 1.1155085135757018e-05, + "loss": 1.5522, + "step": 10572 + }, + { + "epoch": 1.33, + "grad_norm": 16.037275314331055, + "learning_rate": 1.1154248420700332e-05, + "loss": 1.4569, + "step": 10573 + }, + { + "epoch": 1.33, + "grad_norm": 13.24316692352295, + "learning_rate": 1.1153411705643644e-05, + "loss": 1.8347, + "step": 10574 + }, + { + "epoch": 1.33, + "grad_norm": 24.94005012512207, + "learning_rate": 1.1152574990586957e-05, + "loss": 1.1633, + "step": 10575 + }, + { + "epoch": 1.33, + "grad_norm": 40.46428680419922, + "learning_rate": 1.1151738275530268e-05, + "loss": 1.9529, + "step": 10576 + }, + { + "epoch": 1.33, + "grad_norm": 8.654131889343262, + "learning_rate": 1.1150901560473581e-05, + "loss": 0.9671, + "step": 10577 + }, + { + "epoch": 1.33, + "grad_norm": 13.445412635803223, + "learning_rate": 1.1150064845416895e-05, + "loss": 1.0693, + "step": 10578 + }, + { + "epoch": 1.33, + "grad_norm": 6.598649501800537, + "learning_rate": 1.1149228130360205e-05, + "loss": 0.256, + "step": 10579 + }, + { + "epoch": 1.33, + "grad_norm": 7.3994269371032715, + "learning_rate": 1.1148391415303519e-05, + "loss": 0.4632, + "step": 10580 + }, + { + "epoch": 1.33, + "grad_norm": 6.5095343589782715, + "learning_rate": 1.1147554700246832e-05, + "loss": 1.4937, + "step": 10581 + }, + { + "epoch": 1.33, + "grad_norm": 6.868313789367676, + "learning_rate": 1.1146717985190146e-05, + "loss": 1.0561, + "step": 10582 + }, + { + "epoch": 1.33, + "grad_norm": 14.947495460510254, + "learning_rate": 1.1145881270133456e-05, + "loss": 1.4978, + "step": 10583 + }, + { + "epoch": 1.33, + "grad_norm": 11.344189643859863, + "learning_rate": 1.114504455507677e-05, + "loss": 0.6595, + "step": 10584 + }, + { + "epoch": 1.33, + "grad_norm": 20.680248260498047, + "learning_rate": 1.1144207840020082e-05, + "loss": 1.0697, + "step": 10585 + }, + { + "epoch": 1.33, + "grad_norm": 6.246676445007324, + "learning_rate": 1.1143371124963394e-05, + "loss": 1.0448, + "step": 10586 + }, + { + "epoch": 1.33, + "grad_norm": 8.066667556762695, + "learning_rate": 1.1142534409906708e-05, + "loss": 1.8635, + "step": 10587 + }, + { + "epoch": 1.33, + "grad_norm": 7.610118389129639, + "learning_rate": 1.114169769485002e-05, + "loss": 1.1762, + "step": 10588 + }, + { + "epoch": 1.33, + "grad_norm": 38.041683197021484, + "learning_rate": 1.1140860979793333e-05, + "loss": 1.7216, + "step": 10589 + }, + { + "epoch": 1.33, + "grad_norm": 19.77991485595703, + "learning_rate": 1.1140024264736643e-05, + "loss": 0.9676, + "step": 10590 + }, + { + "epoch": 1.33, + "grad_norm": 14.37332534790039, + "learning_rate": 1.1139187549679957e-05, + "loss": 1.7085, + "step": 10591 + }, + { + "epoch": 1.33, + "grad_norm": 9.033374786376953, + "learning_rate": 1.113835083462327e-05, + "loss": 1.5882, + "step": 10592 + }, + { + "epoch": 1.33, + "grad_norm": 7.050271511077881, + "learning_rate": 1.1137514119566581e-05, + "loss": 0.2143, + "step": 10593 + }, + { + "epoch": 1.33, + "grad_norm": 7.486701488494873, + "learning_rate": 1.1136677404509895e-05, + "loss": 0.5395, + "step": 10594 + }, + { + "epoch": 1.33, + "grad_norm": 7.2949371337890625, + "learning_rate": 1.1135840689453208e-05, + "loss": 0.5285, + "step": 10595 + }, + { + "epoch": 1.33, + "grad_norm": 8.485579490661621, + "learning_rate": 1.1135003974396522e-05, + "loss": 1.1012, + "step": 10596 + }, + { + "epoch": 1.33, + "grad_norm": 5.428603649139404, + "learning_rate": 1.1134167259339832e-05, + "loss": 1.3909, + "step": 10597 + }, + { + "epoch": 1.33, + "grad_norm": 7.4245710372924805, + "learning_rate": 1.1133330544283146e-05, + "loss": 0.6164, + "step": 10598 + }, + { + "epoch": 1.33, + "grad_norm": 44.259098052978516, + "learning_rate": 1.1132493829226458e-05, + "loss": 1.6817, + "step": 10599 + }, + { + "epoch": 1.33, + "grad_norm": 12.517207145690918, + "learning_rate": 1.113165711416977e-05, + "loss": 0.3257, + "step": 10600 + }, + { + "epoch": 1.33, + "grad_norm": 12.179635047912598, + "learning_rate": 1.1130820399113083e-05, + "loss": 2.1171, + "step": 10601 + }, + { + "epoch": 1.33, + "grad_norm": 8.754598617553711, + "learning_rate": 1.1129983684056395e-05, + "loss": 0.7985, + "step": 10602 + }, + { + "epoch": 1.33, + "grad_norm": 9.699295997619629, + "learning_rate": 1.1129146968999709e-05, + "loss": 0.3123, + "step": 10603 + }, + { + "epoch": 1.33, + "grad_norm": 14.108746528625488, + "learning_rate": 1.112831025394302e-05, + "loss": 2.128, + "step": 10604 + }, + { + "epoch": 1.33, + "grad_norm": 9.265678405761719, + "learning_rate": 1.1127473538886333e-05, + "loss": 2.2785, + "step": 10605 + }, + { + "epoch": 1.33, + "grad_norm": 37.177345275878906, + "learning_rate": 1.1126636823829647e-05, + "loss": 1.7067, + "step": 10606 + }, + { + "epoch": 1.33, + "grad_norm": 10.379622459411621, + "learning_rate": 1.1125800108772957e-05, + "loss": 0.82, + "step": 10607 + }, + { + "epoch": 1.33, + "grad_norm": 32.17717742919922, + "learning_rate": 1.112496339371627e-05, + "loss": 1.7148, + "step": 10608 + }, + { + "epoch": 1.33, + "grad_norm": 17.302162170410156, + "learning_rate": 1.1124126678659584e-05, + "loss": 1.7778, + "step": 10609 + }, + { + "epoch": 1.33, + "grad_norm": 10.162320137023926, + "learning_rate": 1.1123289963602896e-05, + "loss": 2.5884, + "step": 10610 + }, + { + "epoch": 1.33, + "grad_norm": 31.90337371826172, + "learning_rate": 1.1122453248546208e-05, + "loss": 1.3121, + "step": 10611 + }, + { + "epoch": 1.33, + "grad_norm": 20.651403427124023, + "learning_rate": 1.1121616533489522e-05, + "loss": 1.8125, + "step": 10612 + }, + { + "epoch": 1.33, + "grad_norm": 14.351293563842773, + "learning_rate": 1.1120779818432834e-05, + "loss": 0.8593, + "step": 10613 + }, + { + "epoch": 1.33, + "grad_norm": 7.308723449707031, + "learning_rate": 1.1119943103376146e-05, + "loss": 0.3539, + "step": 10614 + }, + { + "epoch": 1.33, + "grad_norm": 7.801800727844238, + "learning_rate": 1.1119106388319457e-05, + "loss": 0.2271, + "step": 10615 + }, + { + "epoch": 1.33, + "grad_norm": 12.822261810302734, + "learning_rate": 1.1118269673262771e-05, + "loss": 1.3261, + "step": 10616 + }, + { + "epoch": 1.33, + "grad_norm": 30.570140838623047, + "learning_rate": 1.1117432958206085e-05, + "loss": 2.3794, + "step": 10617 + }, + { + "epoch": 1.33, + "grad_norm": 12.661914825439453, + "learning_rate": 1.1116596243149395e-05, + "loss": 0.9678, + "step": 10618 + }, + { + "epoch": 1.33, + "grad_norm": 27.10087013244629, + "learning_rate": 1.1115759528092709e-05, + "loss": 0.6331, + "step": 10619 + }, + { + "epoch": 1.33, + "grad_norm": 24.226333618164062, + "learning_rate": 1.1114922813036022e-05, + "loss": 0.1215, + "step": 10620 + }, + { + "epoch": 1.33, + "grad_norm": 8.87596321105957, + "learning_rate": 1.1114086097979333e-05, + "loss": 1.5549, + "step": 10621 + }, + { + "epoch": 1.33, + "grad_norm": 14.273589134216309, + "learning_rate": 1.1113249382922646e-05, + "loss": 1.8746, + "step": 10622 + }, + { + "epoch": 1.33, + "grad_norm": 7.367445945739746, + "learning_rate": 1.111241266786596e-05, + "loss": 0.1504, + "step": 10623 + }, + { + "epoch": 1.33, + "grad_norm": 46.793949127197266, + "learning_rate": 1.1111575952809272e-05, + "loss": 0.7179, + "step": 10624 + }, + { + "epoch": 1.33, + "grad_norm": 86.86392974853516, + "learning_rate": 1.1110739237752584e-05, + "loss": 1.8453, + "step": 10625 + }, + { + "epoch": 1.33, + "grad_norm": 18.575767517089844, + "learning_rate": 1.1109902522695897e-05, + "loss": 2.5457, + "step": 10626 + }, + { + "epoch": 1.33, + "grad_norm": 10.418265342712402, + "learning_rate": 1.110906580763921e-05, + "loss": 0.9466, + "step": 10627 + }, + { + "epoch": 1.33, + "grad_norm": 7.965066432952881, + "learning_rate": 1.1108229092582521e-05, + "loss": 1.1486, + "step": 10628 + }, + { + "epoch": 1.33, + "grad_norm": 25.836055755615234, + "learning_rate": 1.1107392377525833e-05, + "loss": 1.188, + "step": 10629 + }, + { + "epoch": 1.33, + "grad_norm": 32.03029251098633, + "learning_rate": 1.1106555662469147e-05, + "loss": 3.0007, + "step": 10630 + }, + { + "epoch": 1.33, + "grad_norm": 11.605908393859863, + "learning_rate": 1.110571894741246e-05, + "loss": 0.6728, + "step": 10631 + }, + { + "epoch": 1.33, + "grad_norm": 9.322670936584473, + "learning_rate": 1.1104882232355771e-05, + "loss": 1.0013, + "step": 10632 + }, + { + "epoch": 1.33, + "grad_norm": 12.611401557922363, + "learning_rate": 1.1104045517299085e-05, + "loss": 0.9169, + "step": 10633 + }, + { + "epoch": 1.33, + "grad_norm": 11.119584083557129, + "learning_rate": 1.1103208802242398e-05, + "loss": 1.8297, + "step": 10634 + }, + { + "epoch": 1.33, + "grad_norm": 10.377337455749512, + "learning_rate": 1.1102372087185708e-05, + "loss": 0.898, + "step": 10635 + }, + { + "epoch": 1.33, + "grad_norm": 17.205080032348633, + "learning_rate": 1.1101535372129022e-05, + "loss": 0.8162, + "step": 10636 + }, + { + "epoch": 1.33, + "grad_norm": 10.054287910461426, + "learning_rate": 1.1100698657072336e-05, + "loss": 0.9296, + "step": 10637 + }, + { + "epoch": 1.34, + "grad_norm": 9.22707462310791, + "learning_rate": 1.1099861942015648e-05, + "loss": 0.8789, + "step": 10638 + }, + { + "epoch": 1.34, + "grad_norm": 5.999391555786133, + "learning_rate": 1.109902522695896e-05, + "loss": 0.5489, + "step": 10639 + }, + { + "epoch": 1.34, + "grad_norm": 19.555545806884766, + "learning_rate": 1.1098188511902273e-05, + "loss": 1.0702, + "step": 10640 + }, + { + "epoch": 1.34, + "grad_norm": 29.52223014831543, + "learning_rate": 1.1097351796845585e-05, + "loss": 2.0596, + "step": 10641 + }, + { + "epoch": 1.34, + "grad_norm": 6.383723258972168, + "learning_rate": 1.1096515081788897e-05, + "loss": 0.7636, + "step": 10642 + }, + { + "epoch": 1.34, + "grad_norm": 30.461763381958008, + "learning_rate": 1.109567836673221e-05, + "loss": 2.3186, + "step": 10643 + }, + { + "epoch": 1.34, + "grad_norm": 15.468742370605469, + "learning_rate": 1.1094841651675523e-05, + "loss": 1.3401, + "step": 10644 + }, + { + "epoch": 1.34, + "grad_norm": 24.03126335144043, + "learning_rate": 1.1094004936618836e-05, + "loss": 0.7212, + "step": 10645 + }, + { + "epoch": 1.34, + "grad_norm": 64.02912902832031, + "learning_rate": 1.1093168221562147e-05, + "loss": 1.2535, + "step": 10646 + }, + { + "epoch": 1.34, + "grad_norm": 13.855770111083984, + "learning_rate": 1.109233150650546e-05, + "loss": 0.7123, + "step": 10647 + }, + { + "epoch": 1.34, + "grad_norm": 10.665716171264648, + "learning_rate": 1.1091494791448774e-05, + "loss": 1.0164, + "step": 10648 + }, + { + "epoch": 1.34, + "grad_norm": 13.271306991577148, + "learning_rate": 1.1090658076392084e-05, + "loss": 1.5193, + "step": 10649 + }, + { + "epoch": 1.34, + "grad_norm": 5.731505870819092, + "learning_rate": 1.1089821361335398e-05, + "loss": 0.5393, + "step": 10650 + }, + { + "epoch": 1.34, + "grad_norm": 97.72625732421875, + "learning_rate": 1.1088984646278712e-05, + "loss": 0.8654, + "step": 10651 + }, + { + "epoch": 1.34, + "grad_norm": 24.18966293334961, + "learning_rate": 1.1088147931222024e-05, + "loss": 1.3493, + "step": 10652 + }, + { + "epoch": 1.34, + "grad_norm": 52.74606704711914, + "learning_rate": 1.1087311216165335e-05, + "loss": 1.535, + "step": 10653 + }, + { + "epoch": 1.34, + "grad_norm": 18.01695442199707, + "learning_rate": 1.1086474501108649e-05, + "loss": 0.5694, + "step": 10654 + }, + { + "epoch": 1.34, + "grad_norm": 5.7204365730285645, + "learning_rate": 1.1085637786051961e-05, + "loss": 0.2329, + "step": 10655 + }, + { + "epoch": 1.34, + "grad_norm": 5.812489032745361, + "learning_rate": 1.1084801070995273e-05, + "loss": 0.6321, + "step": 10656 + }, + { + "epoch": 1.34, + "grad_norm": 13.29145622253418, + "learning_rate": 1.1083964355938585e-05, + "loss": 1.7102, + "step": 10657 + }, + { + "epoch": 1.34, + "grad_norm": 8.088118553161621, + "learning_rate": 1.1083127640881899e-05, + "loss": 0.3344, + "step": 10658 + }, + { + "epoch": 1.34, + "grad_norm": 15.177617073059082, + "learning_rate": 1.1082290925825212e-05, + "loss": 1.1892, + "step": 10659 + }, + { + "epoch": 1.34, + "grad_norm": 17.269393920898438, + "learning_rate": 1.1081454210768523e-05, + "loss": 2.7234, + "step": 10660 + }, + { + "epoch": 1.34, + "grad_norm": 13.55111026763916, + "learning_rate": 1.1080617495711836e-05, + "loss": 0.8772, + "step": 10661 + }, + { + "epoch": 1.34, + "grad_norm": 10.325575828552246, + "learning_rate": 1.107978078065515e-05, + "loss": 0.6516, + "step": 10662 + }, + { + "epoch": 1.34, + "grad_norm": 11.700462341308594, + "learning_rate": 1.107894406559846e-05, + "loss": 0.605, + "step": 10663 + }, + { + "epoch": 1.34, + "grad_norm": 163.7694091796875, + "learning_rate": 1.1078107350541774e-05, + "loss": 2.3894, + "step": 10664 + }, + { + "epoch": 1.34, + "grad_norm": 6.185863018035889, + "learning_rate": 1.1077270635485087e-05, + "loss": 0.4748, + "step": 10665 + }, + { + "epoch": 1.34, + "grad_norm": 9.343429565429688, + "learning_rate": 1.10764339204284e-05, + "loss": 0.5629, + "step": 10666 + }, + { + "epoch": 1.34, + "grad_norm": 6.737290859222412, + "learning_rate": 1.1075597205371711e-05, + "loss": 0.5619, + "step": 10667 + }, + { + "epoch": 1.34, + "grad_norm": 21.991905212402344, + "learning_rate": 1.1074760490315023e-05, + "loss": 1.938, + "step": 10668 + }, + { + "epoch": 1.34, + "grad_norm": 8.844432830810547, + "learning_rate": 1.1073923775258337e-05, + "loss": 1.7115, + "step": 10669 + }, + { + "epoch": 1.34, + "grad_norm": 19.51593780517578, + "learning_rate": 1.1073087060201649e-05, + "loss": 2.117, + "step": 10670 + }, + { + "epoch": 1.34, + "grad_norm": 25.96725082397461, + "learning_rate": 1.107225034514496e-05, + "loss": 1.8371, + "step": 10671 + }, + { + "epoch": 1.34, + "grad_norm": 8.808110237121582, + "learning_rate": 1.1071413630088274e-05, + "loss": 0.4944, + "step": 10672 + }, + { + "epoch": 1.34, + "grad_norm": 43.18079376220703, + "learning_rate": 1.1070576915031588e-05, + "loss": 0.7812, + "step": 10673 + }, + { + "epoch": 1.34, + "grad_norm": 14.915264129638672, + "learning_rate": 1.1069740199974898e-05, + "loss": 1.4578, + "step": 10674 + }, + { + "epoch": 1.34, + "grad_norm": 13.375272750854492, + "learning_rate": 1.1068903484918212e-05, + "loss": 1.5969, + "step": 10675 + }, + { + "epoch": 1.34, + "grad_norm": 21.682424545288086, + "learning_rate": 1.1068066769861526e-05, + "loss": 1.688, + "step": 10676 + }, + { + "epoch": 1.34, + "grad_norm": 21.142986297607422, + "learning_rate": 1.1067230054804836e-05, + "loss": 2.6396, + "step": 10677 + }, + { + "epoch": 1.34, + "grad_norm": 19.430593490600586, + "learning_rate": 1.106639333974815e-05, + "loss": 0.8929, + "step": 10678 + }, + { + "epoch": 1.34, + "grad_norm": 7.548630714416504, + "learning_rate": 1.1065556624691463e-05, + "loss": 0.509, + "step": 10679 + }, + { + "epoch": 1.34, + "grad_norm": 15.138720512390137, + "learning_rate": 1.1064719909634775e-05, + "loss": 0.878, + "step": 10680 + }, + { + "epoch": 1.34, + "grad_norm": 25.690645217895508, + "learning_rate": 1.1063883194578087e-05, + "loss": 3.1012, + "step": 10681 + }, + { + "epoch": 1.34, + "grad_norm": 66.32939910888672, + "learning_rate": 1.1063046479521399e-05, + "loss": 0.5862, + "step": 10682 + }, + { + "epoch": 1.34, + "grad_norm": 13.385822296142578, + "learning_rate": 1.1062209764464713e-05, + "loss": 0.6105, + "step": 10683 + }, + { + "epoch": 1.34, + "grad_norm": 40.63554763793945, + "learning_rate": 1.1061373049408025e-05, + "loss": 2.9037, + "step": 10684 + }, + { + "epoch": 1.34, + "grad_norm": 8.389958381652832, + "learning_rate": 1.1060536334351337e-05, + "loss": 0.7247, + "step": 10685 + }, + { + "epoch": 1.34, + "grad_norm": 7.9861979484558105, + "learning_rate": 1.105969961929465e-05, + "loss": 0.5741, + "step": 10686 + }, + { + "epoch": 1.34, + "grad_norm": 20.282155990600586, + "learning_rate": 1.1058862904237964e-05, + "loss": 2.1818, + "step": 10687 + }, + { + "epoch": 1.34, + "grad_norm": 9.458776473999023, + "learning_rate": 1.1058026189181274e-05, + "loss": 0.3538, + "step": 10688 + }, + { + "epoch": 1.34, + "grad_norm": 10.17504596710205, + "learning_rate": 1.1057189474124588e-05, + "loss": 0.849, + "step": 10689 + }, + { + "epoch": 1.34, + "grad_norm": 31.185483932495117, + "learning_rate": 1.1056352759067902e-05, + "loss": 2.2377, + "step": 10690 + }, + { + "epoch": 1.34, + "grad_norm": 6.85310697555542, + "learning_rate": 1.1055516044011212e-05, + "loss": 0.2518, + "step": 10691 + }, + { + "epoch": 1.34, + "grad_norm": 19.96877098083496, + "learning_rate": 1.1054679328954525e-05, + "loss": 1.3782, + "step": 10692 + }, + { + "epoch": 1.34, + "grad_norm": 9.055338859558105, + "learning_rate": 1.1053842613897839e-05, + "loss": 0.6675, + "step": 10693 + }, + { + "epoch": 1.34, + "grad_norm": 10.827799797058105, + "learning_rate": 1.1053005898841151e-05, + "loss": 2.3363, + "step": 10694 + }, + { + "epoch": 1.34, + "grad_norm": 7.236748218536377, + "learning_rate": 1.1052169183784463e-05, + "loss": 0.9744, + "step": 10695 + }, + { + "epoch": 1.34, + "grad_norm": 17.888032913208008, + "learning_rate": 1.1051332468727775e-05, + "loss": 0.9261, + "step": 10696 + }, + { + "epoch": 1.34, + "grad_norm": 41.39914321899414, + "learning_rate": 1.1050495753671089e-05, + "loss": 2.7761, + "step": 10697 + }, + { + "epoch": 1.34, + "grad_norm": 17.63840675354004, + "learning_rate": 1.10496590386144e-05, + "loss": 0.8235, + "step": 10698 + }, + { + "epoch": 1.34, + "grad_norm": 22.87006378173828, + "learning_rate": 1.1048822323557713e-05, + "loss": 1.8333, + "step": 10699 + }, + { + "epoch": 1.34, + "grad_norm": 23.068431854248047, + "learning_rate": 1.1047985608501026e-05, + "loss": 0.9031, + "step": 10700 + }, + { + "epoch": 1.34, + "grad_norm": 10.015175819396973, + "learning_rate": 1.104714889344434e-05, + "loss": 0.6289, + "step": 10701 + }, + { + "epoch": 1.34, + "grad_norm": 4.441029071807861, + "learning_rate": 1.104631217838765e-05, + "loss": 0.2909, + "step": 10702 + }, + { + "epoch": 1.34, + "grad_norm": 17.1404972076416, + "learning_rate": 1.1045475463330964e-05, + "loss": 1.3492, + "step": 10703 + }, + { + "epoch": 1.34, + "grad_norm": 8.014181137084961, + "learning_rate": 1.1044638748274277e-05, + "loss": 1.2163, + "step": 10704 + }, + { + "epoch": 1.34, + "grad_norm": 5.611763000488281, + "learning_rate": 1.1043802033217588e-05, + "loss": 0.6437, + "step": 10705 + }, + { + "epoch": 1.34, + "grad_norm": 13.184558868408203, + "learning_rate": 1.1042965318160901e-05, + "loss": 2.2818, + "step": 10706 + }, + { + "epoch": 1.34, + "grad_norm": 12.149728775024414, + "learning_rate": 1.1042128603104215e-05, + "loss": 0.8291, + "step": 10707 + }, + { + "epoch": 1.34, + "grad_norm": 16.432268142700195, + "learning_rate": 1.1041291888047527e-05, + "loss": 0.4215, + "step": 10708 + }, + { + "epoch": 1.34, + "grad_norm": 15.283656120300293, + "learning_rate": 1.1040455172990839e-05, + "loss": 1.2255, + "step": 10709 + }, + { + "epoch": 1.34, + "grad_norm": 18.21234130859375, + "learning_rate": 1.103961845793415e-05, + "loss": 0.9063, + "step": 10710 + }, + { + "epoch": 1.34, + "grad_norm": 17.743425369262695, + "learning_rate": 1.1038781742877464e-05, + "loss": 1.6894, + "step": 10711 + }, + { + "epoch": 1.34, + "grad_norm": 13.465819358825684, + "learning_rate": 1.1037945027820776e-05, + "loss": 0.2748, + "step": 10712 + }, + { + "epoch": 1.34, + "grad_norm": 8.443482398986816, + "learning_rate": 1.1037108312764088e-05, + "loss": 0.4027, + "step": 10713 + }, + { + "epoch": 1.34, + "grad_norm": 51.657470703125, + "learning_rate": 1.1036271597707402e-05, + "loss": 1.978, + "step": 10714 + }, + { + "epoch": 1.34, + "grad_norm": 13.823935508728027, + "learning_rate": 1.1035434882650716e-05, + "loss": 0.5682, + "step": 10715 + }, + { + "epoch": 1.34, + "grad_norm": 5.75874137878418, + "learning_rate": 1.1034598167594026e-05, + "loss": 0.3957, + "step": 10716 + }, + { + "epoch": 1.34, + "grad_norm": 15.59653091430664, + "learning_rate": 1.103376145253734e-05, + "loss": 0.864, + "step": 10717 + }, + { + "epoch": 1.35, + "grad_norm": 17.857391357421875, + "learning_rate": 1.1032924737480653e-05, + "loss": 1.1767, + "step": 10718 + }, + { + "epoch": 1.35, + "grad_norm": 14.209550857543945, + "learning_rate": 1.1032088022423963e-05, + "loss": 0.9556, + "step": 10719 + }, + { + "epoch": 1.35, + "grad_norm": 11.322964668273926, + "learning_rate": 1.1031251307367277e-05, + "loss": 1.8327, + "step": 10720 + }, + { + "epoch": 1.35, + "grad_norm": 13.394015312194824, + "learning_rate": 1.1030414592310589e-05, + "loss": 0.9464, + "step": 10721 + }, + { + "epoch": 1.35, + "grad_norm": 23.246774673461914, + "learning_rate": 1.1029577877253903e-05, + "loss": 0.5882, + "step": 10722 + }, + { + "epoch": 1.35, + "grad_norm": 12.086636543273926, + "learning_rate": 1.1028741162197215e-05, + "loss": 1.8144, + "step": 10723 + }, + { + "epoch": 1.35, + "grad_norm": 15.44156551361084, + "learning_rate": 1.1027904447140527e-05, + "loss": 0.8163, + "step": 10724 + }, + { + "epoch": 1.35, + "grad_norm": 30.777740478515625, + "learning_rate": 1.102706773208384e-05, + "loss": 1.5816, + "step": 10725 + }, + { + "epoch": 1.35, + "grad_norm": 14.217241287231445, + "learning_rate": 1.102623101702715e-05, + "loss": 1.169, + "step": 10726 + }, + { + "epoch": 1.35, + "grad_norm": 9.077201843261719, + "learning_rate": 1.1025394301970464e-05, + "loss": 0.7978, + "step": 10727 + }, + { + "epoch": 1.35, + "grad_norm": 10.994258880615234, + "learning_rate": 1.1024557586913778e-05, + "loss": 1.5531, + "step": 10728 + }, + { + "epoch": 1.35, + "grad_norm": 14.76309871673584, + "learning_rate": 1.1023720871857091e-05, + "loss": 0.5932, + "step": 10729 + }, + { + "epoch": 1.35, + "grad_norm": 9.74864673614502, + "learning_rate": 1.1022884156800402e-05, + "loss": 0.3559, + "step": 10730 + }, + { + "epoch": 1.35, + "grad_norm": 11.553218841552734, + "learning_rate": 1.1022047441743715e-05, + "loss": 0.5703, + "step": 10731 + }, + { + "epoch": 1.35, + "grad_norm": 17.947389602661133, + "learning_rate": 1.1021210726687029e-05, + "loss": 1.5157, + "step": 10732 + }, + { + "epoch": 1.35, + "grad_norm": 20.41041374206543, + "learning_rate": 1.102037401163034e-05, + "loss": 1.24, + "step": 10733 + }, + { + "epoch": 1.35, + "grad_norm": 8.316654205322266, + "learning_rate": 1.1019537296573653e-05, + "loss": 0.5193, + "step": 10734 + }, + { + "epoch": 1.35, + "grad_norm": 12.348031044006348, + "learning_rate": 1.1018700581516965e-05, + "loss": 0.3684, + "step": 10735 + }, + { + "epoch": 1.35, + "grad_norm": 15.791666984558105, + "learning_rate": 1.1017863866460279e-05, + "loss": 1.1495, + "step": 10736 + }, + { + "epoch": 1.35, + "grad_norm": 10.772533416748047, + "learning_rate": 1.101702715140359e-05, + "loss": 0.6673, + "step": 10737 + }, + { + "epoch": 1.35, + "grad_norm": 5.980607032775879, + "learning_rate": 1.1016190436346902e-05, + "loss": 0.5814, + "step": 10738 + }, + { + "epoch": 1.35, + "grad_norm": 7.196502208709717, + "learning_rate": 1.1015353721290216e-05, + "loss": 0.7421, + "step": 10739 + }, + { + "epoch": 1.35, + "grad_norm": 7.806058406829834, + "learning_rate": 1.1014517006233526e-05, + "loss": 1.1668, + "step": 10740 + }, + { + "epoch": 1.35, + "grad_norm": 22.027751922607422, + "learning_rate": 1.101368029117684e-05, + "loss": 1.6726, + "step": 10741 + }, + { + "epoch": 1.35, + "grad_norm": 16.02842903137207, + "learning_rate": 1.1012843576120154e-05, + "loss": 4.1153, + "step": 10742 + }, + { + "epoch": 1.35, + "grad_norm": 10.5230131149292, + "learning_rate": 1.1012006861063467e-05, + "loss": 0.0575, + "step": 10743 + }, + { + "epoch": 1.35, + "grad_norm": 29.45198631286621, + "learning_rate": 1.1011170146006778e-05, + "loss": 2.2434, + "step": 10744 + }, + { + "epoch": 1.35, + "grad_norm": 19.144636154174805, + "learning_rate": 1.1010333430950091e-05, + "loss": 1.2749, + "step": 10745 + }, + { + "epoch": 1.35, + "grad_norm": 26.141258239746094, + "learning_rate": 1.1009496715893405e-05, + "loss": 1.6285, + "step": 10746 + }, + { + "epoch": 1.35, + "grad_norm": 13.935678482055664, + "learning_rate": 1.1008660000836715e-05, + "loss": 1.5021, + "step": 10747 + }, + { + "epoch": 1.35, + "grad_norm": 341.974365234375, + "learning_rate": 1.1007823285780029e-05, + "loss": 1.0623, + "step": 10748 + }, + { + "epoch": 1.35, + "grad_norm": 14.870683670043945, + "learning_rate": 1.100698657072334e-05, + "loss": 1.3213, + "step": 10749 + }, + { + "epoch": 1.35, + "grad_norm": 11.464217185974121, + "learning_rate": 1.1006149855666654e-05, + "loss": 0.7104, + "step": 10750 + }, + { + "epoch": 1.35, + "grad_norm": 38.540260314941406, + "learning_rate": 1.1005313140609966e-05, + "loss": 3.1378, + "step": 10751 + }, + { + "epoch": 1.35, + "grad_norm": 15.78632926940918, + "learning_rate": 1.1004476425553278e-05, + "loss": 3.0877, + "step": 10752 + }, + { + "epoch": 1.35, + "grad_norm": 5.118893623352051, + "learning_rate": 1.1003639710496592e-05, + "loss": 0.3996, + "step": 10753 + }, + { + "epoch": 1.35, + "grad_norm": 25.54676055908203, + "learning_rate": 1.1002802995439902e-05, + "loss": 1.4233, + "step": 10754 + }, + { + "epoch": 1.35, + "grad_norm": 13.176262855529785, + "learning_rate": 1.1001966280383216e-05, + "loss": 1.7879, + "step": 10755 + }, + { + "epoch": 1.35, + "grad_norm": 15.778350830078125, + "learning_rate": 1.100112956532653e-05, + "loss": 1.4188, + "step": 10756 + }, + { + "epoch": 1.35, + "grad_norm": 19.335603713989258, + "learning_rate": 1.1000292850269843e-05, + "loss": 1.1172, + "step": 10757 + }, + { + "epoch": 1.35, + "grad_norm": 81.1264419555664, + "learning_rate": 1.0999456135213153e-05, + "loss": 0.6375, + "step": 10758 + }, + { + "epoch": 1.35, + "grad_norm": 4.9337334632873535, + "learning_rate": 1.0998619420156467e-05, + "loss": 0.1351, + "step": 10759 + }, + { + "epoch": 1.35, + "grad_norm": 26.468721389770508, + "learning_rate": 1.0997782705099779e-05, + "loss": 1.7092, + "step": 10760 + }, + { + "epoch": 1.35, + "grad_norm": 12.010913848876953, + "learning_rate": 1.0996945990043091e-05, + "loss": 1.2734, + "step": 10761 + }, + { + "epoch": 1.35, + "grad_norm": 6.546537399291992, + "learning_rate": 1.0996109274986405e-05, + "loss": 0.1734, + "step": 10762 + }, + { + "epoch": 1.35, + "grad_norm": 16.221281051635742, + "learning_rate": 1.0995272559929717e-05, + "loss": 0.7127, + "step": 10763 + }, + { + "epoch": 1.35, + "grad_norm": 6.823152542114258, + "learning_rate": 1.099443584487303e-05, + "loss": 0.3035, + "step": 10764 + }, + { + "epoch": 1.35, + "grad_norm": 8.010765075683594, + "learning_rate": 1.0993599129816342e-05, + "loss": 0.6597, + "step": 10765 + }, + { + "epoch": 1.35, + "grad_norm": 13.101228713989258, + "learning_rate": 1.0992762414759654e-05, + "loss": 1.7342, + "step": 10766 + }, + { + "epoch": 1.35, + "grad_norm": 20.14594268798828, + "learning_rate": 1.0991925699702968e-05, + "loss": 1.9099, + "step": 10767 + }, + { + "epoch": 1.35, + "grad_norm": 53.84599304199219, + "learning_rate": 1.0991088984646278e-05, + "loss": 1.1974, + "step": 10768 + }, + { + "epoch": 1.35, + "grad_norm": 15.755928039550781, + "learning_rate": 1.0990252269589592e-05, + "loss": 2.1624, + "step": 10769 + }, + { + "epoch": 1.35, + "grad_norm": 6.549153804779053, + "learning_rate": 1.0989415554532905e-05, + "loss": 0.3828, + "step": 10770 + }, + { + "epoch": 1.35, + "grad_norm": 29.744705200195312, + "learning_rate": 1.0988578839476219e-05, + "loss": 1.5385, + "step": 10771 + }, + { + "epoch": 1.35, + "grad_norm": 12.353290557861328, + "learning_rate": 1.098774212441953e-05, + "loss": 0.7369, + "step": 10772 + }, + { + "epoch": 1.35, + "grad_norm": 44.18974304199219, + "learning_rate": 1.0986905409362843e-05, + "loss": 1.6053, + "step": 10773 + }, + { + "epoch": 1.35, + "grad_norm": 4.334277153015137, + "learning_rate": 1.0986068694306155e-05, + "loss": 0.348, + "step": 10774 + }, + { + "epoch": 1.35, + "grad_norm": 8.039255142211914, + "learning_rate": 1.0985231979249467e-05, + "loss": 0.4938, + "step": 10775 + }, + { + "epoch": 1.35, + "grad_norm": 30.044965744018555, + "learning_rate": 1.098439526419278e-05, + "loss": 1.6979, + "step": 10776 + }, + { + "epoch": 1.35, + "grad_norm": 9.394641876220703, + "learning_rate": 1.0983558549136092e-05, + "loss": 0.6797, + "step": 10777 + }, + { + "epoch": 1.35, + "grad_norm": 14.829742431640625, + "learning_rate": 1.0982721834079406e-05, + "loss": 1.0671, + "step": 10778 + }, + { + "epoch": 1.35, + "grad_norm": 2.1448755264282227, + "learning_rate": 1.0981885119022716e-05, + "loss": 0.0608, + "step": 10779 + }, + { + "epoch": 1.35, + "grad_norm": 43.15748596191406, + "learning_rate": 1.098104840396603e-05, + "loss": 1.4878, + "step": 10780 + }, + { + "epoch": 1.35, + "grad_norm": 11.831615447998047, + "learning_rate": 1.0980211688909344e-05, + "loss": 0.49, + "step": 10781 + }, + { + "epoch": 1.35, + "grad_norm": 5.491212368011475, + "learning_rate": 1.0979374973852654e-05, + "loss": 0.7475, + "step": 10782 + }, + { + "epoch": 1.35, + "grad_norm": 52.36024475097656, + "learning_rate": 1.0978538258795968e-05, + "loss": 1.7538, + "step": 10783 + }, + { + "epoch": 1.35, + "grad_norm": 19.612943649291992, + "learning_rate": 1.0977701543739281e-05, + "loss": 1.9167, + "step": 10784 + }, + { + "epoch": 1.35, + "grad_norm": 17.72964096069336, + "learning_rate": 1.0976864828682595e-05, + "loss": 1.9991, + "step": 10785 + }, + { + "epoch": 1.35, + "grad_norm": 11.766716957092285, + "learning_rate": 1.0976028113625905e-05, + "loss": 1.0331, + "step": 10786 + }, + { + "epoch": 1.35, + "grad_norm": 17.612449645996094, + "learning_rate": 1.0975191398569219e-05, + "loss": 0.8623, + "step": 10787 + }, + { + "epoch": 1.35, + "grad_norm": 1.3621190786361694, + "learning_rate": 1.097435468351253e-05, + "loss": 0.0174, + "step": 10788 + }, + { + "epoch": 1.35, + "grad_norm": 12.920506477355957, + "learning_rate": 1.0973517968455843e-05, + "loss": 0.463, + "step": 10789 + }, + { + "epoch": 1.35, + "grad_norm": 2.6283490657806396, + "learning_rate": 1.0972681253399156e-05, + "loss": 0.0628, + "step": 10790 + }, + { + "epoch": 1.35, + "grad_norm": 8.424253463745117, + "learning_rate": 1.0971844538342468e-05, + "loss": 1.4464, + "step": 10791 + }, + { + "epoch": 1.35, + "grad_norm": 100.44664001464844, + "learning_rate": 1.0971007823285782e-05, + "loss": 1.607, + "step": 10792 + }, + { + "epoch": 1.35, + "grad_norm": 48.81386947631836, + "learning_rate": 1.0970171108229092e-05, + "loss": 1.9988, + "step": 10793 + }, + { + "epoch": 1.35, + "grad_norm": 7.60531759262085, + "learning_rate": 1.0969334393172406e-05, + "loss": 0.278, + "step": 10794 + }, + { + "epoch": 1.35, + "grad_norm": 30.702272415161133, + "learning_rate": 1.096849767811572e-05, + "loss": 1.2332, + "step": 10795 + }, + { + "epoch": 1.35, + "grad_norm": 13.653675079345703, + "learning_rate": 1.096766096305903e-05, + "loss": 1.2394, + "step": 10796 + }, + { + "epoch": 1.36, + "grad_norm": 15.487334251403809, + "learning_rate": 1.0966824248002343e-05, + "loss": 1.075, + "step": 10797 + }, + { + "epoch": 1.36, + "grad_norm": 23.752225875854492, + "learning_rate": 1.0965987532945657e-05, + "loss": 1.3737, + "step": 10798 + }, + { + "epoch": 1.36, + "grad_norm": 8.121785163879395, + "learning_rate": 1.096515081788897e-05, + "loss": 0.4708, + "step": 10799 + }, + { + "epoch": 1.36, + "grad_norm": 6.617895603179932, + "learning_rate": 1.0964314102832281e-05, + "loss": 0.1338, + "step": 10800 + }, + { + "epoch": 1.36, + "eval_loss": 0.09359579533338547, + "eval_runtime": 95.2118, + "eval_samples_per_second": 37.201, + "eval_steps_per_second": 37.201, + "step": 10800 + }, + { + "epoch": 1.36, + "grad_norm": 14.611085891723633, + "learning_rate": 1.0963477387775595e-05, + "loss": 2.6641, + "step": 10801 + }, + { + "epoch": 1.36, + "grad_norm": 7.776219367980957, + "learning_rate": 1.0962640672718907e-05, + "loss": 0.7735, + "step": 10802 + }, + { + "epoch": 1.36, + "grad_norm": 13.162578582763672, + "learning_rate": 1.0961803957662218e-05, + "loss": 1.9287, + "step": 10803 + }, + { + "epoch": 1.36, + "grad_norm": 36.731101989746094, + "learning_rate": 1.0960967242605532e-05, + "loss": 1.6382, + "step": 10804 + }, + { + "epoch": 1.36, + "grad_norm": 14.721063613891602, + "learning_rate": 1.0960130527548844e-05, + "loss": 1.5529, + "step": 10805 + }, + { + "epoch": 1.36, + "grad_norm": 43.59991455078125, + "learning_rate": 1.0959293812492158e-05, + "loss": 1.7317, + "step": 10806 + }, + { + "epoch": 1.36, + "grad_norm": 8.519847869873047, + "learning_rate": 1.0958457097435468e-05, + "loss": 0.8793, + "step": 10807 + }, + { + "epoch": 1.36, + "grad_norm": 31.089527130126953, + "learning_rate": 1.0957620382378782e-05, + "loss": 1.1497, + "step": 10808 + }, + { + "epoch": 1.36, + "grad_norm": 5.586785316467285, + "learning_rate": 1.0956783667322095e-05, + "loss": 0.5224, + "step": 10809 + }, + { + "epoch": 1.36, + "grad_norm": 32.68950653076172, + "learning_rate": 1.0955946952265406e-05, + "loss": 3.5036, + "step": 10810 + }, + { + "epoch": 1.36, + "grad_norm": 9.807537078857422, + "learning_rate": 1.095511023720872e-05, + "loss": 1.3255, + "step": 10811 + }, + { + "epoch": 1.36, + "grad_norm": 27.183666229248047, + "learning_rate": 1.0954273522152033e-05, + "loss": 1.5964, + "step": 10812 + }, + { + "epoch": 1.36, + "grad_norm": 24.39258575439453, + "learning_rate": 1.0953436807095345e-05, + "loss": 2.093, + "step": 10813 + }, + { + "epoch": 1.36, + "grad_norm": 52.604713439941406, + "learning_rate": 1.0952600092038657e-05, + "loss": 0.9319, + "step": 10814 + }, + { + "epoch": 1.36, + "grad_norm": 5.523324012756348, + "learning_rate": 1.095176337698197e-05, + "loss": 0.9474, + "step": 10815 + }, + { + "epoch": 1.36, + "grad_norm": 7.748193264007568, + "learning_rate": 1.0950926661925282e-05, + "loss": 0.7595, + "step": 10816 + }, + { + "epoch": 1.36, + "grad_norm": 19.95014762878418, + "learning_rate": 1.0950089946868594e-05, + "loss": 0.5307, + "step": 10817 + }, + { + "epoch": 1.36, + "grad_norm": 20.722143173217773, + "learning_rate": 1.0949253231811908e-05, + "loss": 1.3851, + "step": 10818 + }, + { + "epoch": 1.36, + "grad_norm": 16.28141212463379, + "learning_rate": 1.094841651675522e-05, + "loss": 1.1545, + "step": 10819 + }, + { + "epoch": 1.36, + "grad_norm": 6.727929592132568, + "learning_rate": 1.0947579801698534e-05, + "loss": 0.4766, + "step": 10820 + }, + { + "epoch": 1.36, + "grad_norm": 15.714207649230957, + "learning_rate": 1.0946743086641844e-05, + "loss": 0.8382, + "step": 10821 + }, + { + "epoch": 1.36, + "grad_norm": 15.240822792053223, + "learning_rate": 1.0945906371585157e-05, + "loss": 1.1112, + "step": 10822 + }, + { + "epoch": 1.36, + "grad_norm": 15.163829803466797, + "learning_rate": 1.0945069656528471e-05, + "loss": 0.7106, + "step": 10823 + }, + { + "epoch": 1.36, + "grad_norm": 41.483150482177734, + "learning_rate": 1.0944232941471781e-05, + "loss": 0.9613, + "step": 10824 + }, + { + "epoch": 1.36, + "grad_norm": 44.760169982910156, + "learning_rate": 1.0943396226415095e-05, + "loss": 1.3909, + "step": 10825 + }, + { + "epoch": 1.36, + "grad_norm": 3.67187237739563, + "learning_rate": 1.0942559511358409e-05, + "loss": 0.2376, + "step": 10826 + }, + { + "epoch": 1.36, + "grad_norm": 4.086060047149658, + "learning_rate": 1.094172279630172e-05, + "loss": 0.148, + "step": 10827 + }, + { + "epoch": 1.36, + "grad_norm": 27.953208923339844, + "learning_rate": 1.0940886081245033e-05, + "loss": 1.6879, + "step": 10828 + }, + { + "epoch": 1.36, + "grad_norm": 95.8250732421875, + "learning_rate": 1.0940049366188346e-05, + "loss": 1.8391, + "step": 10829 + }, + { + "epoch": 1.36, + "grad_norm": 11.172741889953613, + "learning_rate": 1.0939212651131658e-05, + "loss": 1.8034, + "step": 10830 + }, + { + "epoch": 1.36, + "grad_norm": 28.409162521362305, + "learning_rate": 1.093837593607497e-05, + "loss": 0.6427, + "step": 10831 + }, + { + "epoch": 1.36, + "grad_norm": 5.7281694412231445, + "learning_rate": 1.0937539221018282e-05, + "loss": 0.7275, + "step": 10832 + }, + { + "epoch": 1.36, + "grad_norm": 41.91596603393555, + "learning_rate": 1.0936702505961596e-05, + "loss": 2.1048, + "step": 10833 + }, + { + "epoch": 1.36, + "grad_norm": 14.93394947052002, + "learning_rate": 1.093586579090491e-05, + "loss": 0.6343, + "step": 10834 + }, + { + "epoch": 1.36, + "grad_norm": 35.88250732421875, + "learning_rate": 1.093502907584822e-05, + "loss": 0.5998, + "step": 10835 + }, + { + "epoch": 1.36, + "grad_norm": 15.611506462097168, + "learning_rate": 1.0934192360791533e-05, + "loss": 2.218, + "step": 10836 + }, + { + "epoch": 1.36, + "grad_norm": 5.545705318450928, + "learning_rate": 1.0933355645734847e-05, + "loss": 0.5066, + "step": 10837 + }, + { + "epoch": 1.36, + "grad_norm": 14.528839111328125, + "learning_rate": 1.0932518930678157e-05, + "loss": 0.8079, + "step": 10838 + }, + { + "epoch": 1.36, + "grad_norm": 10.407254219055176, + "learning_rate": 1.0931682215621471e-05, + "loss": 0.7827, + "step": 10839 + }, + { + "epoch": 1.36, + "grad_norm": 7.218513011932373, + "learning_rate": 1.0930845500564785e-05, + "loss": 1.8934, + "step": 10840 + }, + { + "epoch": 1.36, + "grad_norm": 14.339374542236328, + "learning_rate": 1.0930008785508096e-05, + "loss": 0.8121, + "step": 10841 + }, + { + "epoch": 1.36, + "grad_norm": 14.267348289489746, + "learning_rate": 1.0929172070451408e-05, + "loss": 1.1232, + "step": 10842 + }, + { + "epoch": 1.36, + "grad_norm": 15.108508110046387, + "learning_rate": 1.0928335355394722e-05, + "loss": 0.8803, + "step": 10843 + }, + { + "epoch": 1.36, + "grad_norm": 22.21465492248535, + "learning_rate": 1.0927498640338034e-05, + "loss": 1.1716, + "step": 10844 + }, + { + "epoch": 1.36, + "grad_norm": 25.122150421142578, + "learning_rate": 1.0926661925281346e-05, + "loss": 2.0065, + "step": 10845 + }, + { + "epoch": 1.36, + "grad_norm": 8.775306701660156, + "learning_rate": 1.0925825210224658e-05, + "loss": 0.9244, + "step": 10846 + }, + { + "epoch": 1.36, + "grad_norm": 56.42924880981445, + "learning_rate": 1.0924988495167972e-05, + "loss": 1.7071, + "step": 10847 + }, + { + "epoch": 1.36, + "grad_norm": 17.765642166137695, + "learning_rate": 1.0924151780111285e-05, + "loss": 0.8892, + "step": 10848 + }, + { + "epoch": 1.36, + "grad_norm": 9.326750755310059, + "learning_rate": 1.0923315065054596e-05, + "loss": 1.3537, + "step": 10849 + }, + { + "epoch": 1.36, + "grad_norm": 34.46554946899414, + "learning_rate": 1.092247834999791e-05, + "loss": 0.8488, + "step": 10850 + }, + { + "epoch": 1.36, + "grad_norm": 114.82874298095703, + "learning_rate": 1.0921641634941223e-05, + "loss": 1.2715, + "step": 10851 + }, + { + "epoch": 1.36, + "grad_norm": 20.8919620513916, + "learning_rate": 1.0920804919884533e-05, + "loss": 1.28, + "step": 10852 + }, + { + "epoch": 1.36, + "grad_norm": 22.95903778076172, + "learning_rate": 1.0919968204827847e-05, + "loss": 0.6853, + "step": 10853 + }, + { + "epoch": 1.36, + "grad_norm": 19.324871063232422, + "learning_rate": 1.091913148977116e-05, + "loss": 0.8568, + "step": 10854 + }, + { + "epoch": 1.36, + "grad_norm": 8.778809547424316, + "learning_rate": 1.0918294774714472e-05, + "loss": 0.7449, + "step": 10855 + }, + { + "epoch": 1.36, + "grad_norm": 20.503408432006836, + "learning_rate": 1.0917458059657784e-05, + "loss": 1.145, + "step": 10856 + }, + { + "epoch": 1.36, + "grad_norm": 22.312541961669922, + "learning_rate": 1.0916621344601098e-05, + "loss": 0.7269, + "step": 10857 + }, + { + "epoch": 1.36, + "grad_norm": 7.290511131286621, + "learning_rate": 1.091578462954441e-05, + "loss": 1.0832, + "step": 10858 + }, + { + "epoch": 1.36, + "grad_norm": 16.26047706604004, + "learning_rate": 1.0914947914487722e-05, + "loss": 1.5293, + "step": 10859 + }, + { + "epoch": 1.36, + "grad_norm": 29.824007034301758, + "learning_rate": 1.0914111199431034e-05, + "loss": 1.0911, + "step": 10860 + }, + { + "epoch": 1.36, + "grad_norm": 20.717750549316406, + "learning_rate": 1.0913274484374347e-05, + "loss": 1.1627, + "step": 10861 + }, + { + "epoch": 1.36, + "grad_norm": 6.122304439544678, + "learning_rate": 1.0912437769317661e-05, + "loss": 0.4995, + "step": 10862 + }, + { + "epoch": 1.36, + "grad_norm": 20.13656997680664, + "learning_rate": 1.0911601054260971e-05, + "loss": 1.2077, + "step": 10863 + }, + { + "epoch": 1.36, + "grad_norm": 19.886560440063477, + "learning_rate": 1.0910764339204285e-05, + "loss": 2.5778, + "step": 10864 + }, + { + "epoch": 1.36, + "grad_norm": 15.786689758300781, + "learning_rate": 1.0909927624147599e-05, + "loss": 0.9969, + "step": 10865 + }, + { + "epoch": 1.36, + "grad_norm": 12.388118743896484, + "learning_rate": 1.0909090909090909e-05, + "loss": 2.4683, + "step": 10866 + }, + { + "epoch": 1.36, + "grad_norm": 9.509102821350098, + "learning_rate": 1.0908254194034223e-05, + "loss": 1.0659, + "step": 10867 + }, + { + "epoch": 1.36, + "grad_norm": 11.638148307800293, + "learning_rate": 1.0907417478977536e-05, + "loss": 0.8309, + "step": 10868 + }, + { + "epoch": 1.36, + "grad_norm": 8.701188087463379, + "learning_rate": 1.0906580763920848e-05, + "loss": 0.3006, + "step": 10869 + }, + { + "epoch": 1.36, + "grad_norm": 14.166561126708984, + "learning_rate": 1.090574404886416e-05, + "loss": 1.5359, + "step": 10870 + }, + { + "epoch": 1.36, + "grad_norm": 12.052389144897461, + "learning_rate": 1.0904907333807472e-05, + "loss": 0.7277, + "step": 10871 + }, + { + "epoch": 1.36, + "grad_norm": 23.465457916259766, + "learning_rate": 1.0904070618750786e-05, + "loss": 1.8724, + "step": 10872 + }, + { + "epoch": 1.36, + "grad_norm": 19.340892791748047, + "learning_rate": 1.0903233903694098e-05, + "loss": 1.5015, + "step": 10873 + }, + { + "epoch": 1.36, + "grad_norm": 7.132235050201416, + "learning_rate": 1.090239718863741e-05, + "loss": 0.8138, + "step": 10874 + }, + { + "epoch": 1.36, + "grad_norm": 34.80180358886719, + "learning_rate": 1.0901560473580723e-05, + "loss": 2.4774, + "step": 10875 + }, + { + "epoch": 1.36, + "grad_norm": 13.024115562438965, + "learning_rate": 1.0900723758524037e-05, + "loss": 1.8248, + "step": 10876 + }, + { + "epoch": 1.37, + "grad_norm": 8.109501838684082, + "learning_rate": 1.0899887043467347e-05, + "loss": 0.5959, + "step": 10877 + }, + { + "epoch": 1.37, + "grad_norm": 18.224985122680664, + "learning_rate": 1.089905032841066e-05, + "loss": 2.3261, + "step": 10878 + }, + { + "epoch": 1.37, + "grad_norm": 19.332311630249023, + "learning_rate": 1.0898213613353974e-05, + "loss": 1.1313, + "step": 10879 + }, + { + "epoch": 1.37, + "grad_norm": 48.47065734863281, + "learning_rate": 1.0897376898297285e-05, + "loss": 1.2985, + "step": 10880 + }, + { + "epoch": 1.37, + "grad_norm": 25.13520050048828, + "learning_rate": 1.0896540183240598e-05, + "loss": 1.4921, + "step": 10881 + }, + { + "epoch": 1.37, + "grad_norm": 12.925703048706055, + "learning_rate": 1.0895703468183912e-05, + "loss": 1.7304, + "step": 10882 + }, + { + "epoch": 1.37, + "grad_norm": 5.960899829864502, + "learning_rate": 1.0894866753127222e-05, + "loss": 0.3075, + "step": 10883 + }, + { + "epoch": 1.37, + "grad_norm": 40.891258239746094, + "learning_rate": 1.0894030038070536e-05, + "loss": 1.0237, + "step": 10884 + }, + { + "epoch": 1.37, + "grad_norm": 9.174378395080566, + "learning_rate": 1.0893193323013848e-05, + "loss": 1.8066, + "step": 10885 + }, + { + "epoch": 1.37, + "grad_norm": 18.660842895507812, + "learning_rate": 1.0892356607957162e-05, + "loss": 2.1987, + "step": 10886 + }, + { + "epoch": 1.37, + "grad_norm": 16.45781135559082, + "learning_rate": 1.0891519892900474e-05, + "loss": 0.7041, + "step": 10887 + }, + { + "epoch": 1.37, + "grad_norm": 15.377516746520996, + "learning_rate": 1.0890683177843785e-05, + "loss": 1.4118, + "step": 10888 + }, + { + "epoch": 1.37, + "grad_norm": 199.94894409179688, + "learning_rate": 1.0889846462787099e-05, + "loss": 2.0811, + "step": 10889 + }, + { + "epoch": 1.37, + "grad_norm": 25.48408317565918, + "learning_rate": 1.088900974773041e-05, + "loss": 2.1387, + "step": 10890 + }, + { + "epoch": 1.37, + "grad_norm": 24.191665649414062, + "learning_rate": 1.0888173032673723e-05, + "loss": 1.1457, + "step": 10891 + }, + { + "epoch": 1.37, + "grad_norm": 22.73685646057129, + "learning_rate": 1.0887336317617037e-05, + "loss": 2.3694, + "step": 10892 + }, + { + "epoch": 1.37, + "grad_norm": 21.452775955200195, + "learning_rate": 1.088649960256035e-05, + "loss": 1.7771, + "step": 10893 + }, + { + "epoch": 1.37, + "grad_norm": 56.104278564453125, + "learning_rate": 1.088566288750366e-05, + "loss": 2.2139, + "step": 10894 + }, + { + "epoch": 1.37, + "grad_norm": 27.917959213256836, + "learning_rate": 1.0884826172446974e-05, + "loss": 1.4838, + "step": 10895 + }, + { + "epoch": 1.37, + "grad_norm": 12.48881721496582, + "learning_rate": 1.0883989457390288e-05, + "loss": 2.6583, + "step": 10896 + }, + { + "epoch": 1.37, + "grad_norm": 10.346531867980957, + "learning_rate": 1.0883152742333598e-05, + "loss": 0.8103, + "step": 10897 + }, + { + "epoch": 1.37, + "grad_norm": 7.896469593048096, + "learning_rate": 1.0882316027276912e-05, + "loss": 0.9102, + "step": 10898 + }, + { + "epoch": 1.37, + "grad_norm": 6.133947849273682, + "learning_rate": 1.0881479312220224e-05, + "loss": 0.4329, + "step": 10899 + }, + { + "epoch": 1.37, + "grad_norm": 15.891400337219238, + "learning_rate": 1.0880642597163537e-05, + "loss": 0.7828, + "step": 10900 + }, + { + "epoch": 1.37, + "grad_norm": 7.322772026062012, + "learning_rate": 1.087980588210685e-05, + "loss": 1.7073, + "step": 10901 + }, + { + "epoch": 1.37, + "grad_norm": 26.040386199951172, + "learning_rate": 1.0878969167050161e-05, + "loss": 0.8719, + "step": 10902 + }, + { + "epoch": 1.37, + "grad_norm": 12.847599029541016, + "learning_rate": 1.0878132451993475e-05, + "loss": 0.6563, + "step": 10903 + }, + { + "epoch": 1.37, + "grad_norm": 10.403653144836426, + "learning_rate": 1.0877295736936785e-05, + "loss": 0.9705, + "step": 10904 + }, + { + "epoch": 1.37, + "grad_norm": 113.82823181152344, + "learning_rate": 1.0876459021880099e-05, + "loss": 1.1203, + "step": 10905 + }, + { + "epoch": 1.37, + "grad_norm": 6.354864597320557, + "learning_rate": 1.0875622306823413e-05, + "loss": 2.0016, + "step": 10906 + }, + { + "epoch": 1.37, + "grad_norm": 18.746736526489258, + "learning_rate": 1.0874785591766726e-05, + "loss": 0.6388, + "step": 10907 + }, + { + "epoch": 1.37, + "grad_norm": 33.26611328125, + "learning_rate": 1.0873948876710036e-05, + "loss": 2.0198, + "step": 10908 + }, + { + "epoch": 1.37, + "grad_norm": 12.522151947021484, + "learning_rate": 1.087311216165335e-05, + "loss": 2.2584, + "step": 10909 + }, + { + "epoch": 1.37, + "grad_norm": 32.2423210144043, + "learning_rate": 1.0872275446596664e-05, + "loss": 1.6917, + "step": 10910 + }, + { + "epoch": 1.37, + "grad_norm": 9.617437362670898, + "learning_rate": 1.0871438731539974e-05, + "loss": 0.489, + "step": 10911 + }, + { + "epoch": 1.37, + "grad_norm": 13.178160667419434, + "learning_rate": 1.0870602016483288e-05, + "loss": 0.6835, + "step": 10912 + }, + { + "epoch": 1.37, + "grad_norm": 21.637924194335938, + "learning_rate": 1.08697653014266e-05, + "loss": 1.0005, + "step": 10913 + }, + { + "epoch": 1.37, + "grad_norm": 12.994182586669922, + "learning_rate": 1.0868928586369913e-05, + "loss": 1.3531, + "step": 10914 + }, + { + "epoch": 1.37, + "grad_norm": 8.365532875061035, + "learning_rate": 1.0868091871313225e-05, + "loss": 0.7093, + "step": 10915 + }, + { + "epoch": 1.37, + "grad_norm": 53.14915084838867, + "learning_rate": 1.0867255156256537e-05, + "loss": 1.7739, + "step": 10916 + }, + { + "epoch": 1.37, + "grad_norm": 12.870243072509766, + "learning_rate": 1.086641844119985e-05, + "loss": 1.5167, + "step": 10917 + }, + { + "epoch": 1.37, + "grad_norm": 12.425950050354004, + "learning_rate": 1.0865581726143161e-05, + "loss": 2.6167, + "step": 10918 + }, + { + "epoch": 1.37, + "grad_norm": 18.783540725708008, + "learning_rate": 1.0864745011086475e-05, + "loss": 1.4072, + "step": 10919 + }, + { + "epoch": 1.37, + "grad_norm": 17.30661964416504, + "learning_rate": 1.0863908296029788e-05, + "loss": 0.9908, + "step": 10920 + }, + { + "epoch": 1.37, + "grad_norm": 124.06700897216797, + "learning_rate": 1.0863071580973102e-05, + "loss": 1.6201, + "step": 10921 + }, + { + "epoch": 1.37, + "grad_norm": 25.860363006591797, + "learning_rate": 1.0862234865916412e-05, + "loss": 1.925, + "step": 10922 + }, + { + "epoch": 1.37, + "grad_norm": 66.43064880371094, + "learning_rate": 1.0861398150859726e-05, + "loss": 2.0882, + "step": 10923 + }, + { + "epoch": 1.37, + "grad_norm": 32.965850830078125, + "learning_rate": 1.0860561435803038e-05, + "loss": 3.9624, + "step": 10924 + }, + { + "epoch": 1.37, + "grad_norm": 7.869022369384766, + "learning_rate": 1.085972472074635e-05, + "loss": 0.5503, + "step": 10925 + }, + { + "epoch": 1.37, + "grad_norm": 8.238611221313477, + "learning_rate": 1.0858888005689663e-05, + "loss": 1.504, + "step": 10926 + }, + { + "epoch": 1.37, + "grad_norm": 19.855043411254883, + "learning_rate": 1.0858051290632975e-05, + "loss": 2.174, + "step": 10927 + }, + { + "epoch": 1.37, + "grad_norm": 13.219988822937012, + "learning_rate": 1.0857214575576289e-05, + "loss": 0.8025, + "step": 10928 + }, + { + "epoch": 1.37, + "grad_norm": 48.4361686706543, + "learning_rate": 1.0856377860519601e-05, + "loss": 3.9421, + "step": 10929 + }, + { + "epoch": 1.37, + "grad_norm": 51.29350662231445, + "learning_rate": 1.0855541145462913e-05, + "loss": 1.864, + "step": 10930 + }, + { + "epoch": 1.37, + "grad_norm": 162.08787536621094, + "learning_rate": 1.0854704430406227e-05, + "loss": 0.8445, + "step": 10931 + }, + { + "epoch": 1.37, + "grad_norm": 5.514544486999512, + "learning_rate": 1.0853867715349537e-05, + "loss": 0.2165, + "step": 10932 + }, + { + "epoch": 1.37, + "grad_norm": 8.022409439086914, + "learning_rate": 1.085303100029285e-05, + "loss": 0.4751, + "step": 10933 + }, + { + "epoch": 1.37, + "grad_norm": 13.04383659362793, + "learning_rate": 1.0852194285236164e-05, + "loss": 1.0201, + "step": 10934 + }, + { + "epoch": 1.37, + "grad_norm": 11.314271926879883, + "learning_rate": 1.0851357570179478e-05, + "loss": 1.0127, + "step": 10935 + }, + { + "epoch": 1.37, + "grad_norm": 13.84196949005127, + "learning_rate": 1.0850520855122788e-05, + "loss": 0.8237, + "step": 10936 + }, + { + "epoch": 1.37, + "grad_norm": 18.14813804626465, + "learning_rate": 1.0849684140066102e-05, + "loss": 0.6021, + "step": 10937 + }, + { + "epoch": 1.37, + "grad_norm": 31.143245697021484, + "learning_rate": 1.0848847425009414e-05, + "loss": 2.0824, + "step": 10938 + }, + { + "epoch": 1.37, + "grad_norm": 14.831005096435547, + "learning_rate": 1.0848010709952726e-05, + "loss": 0.5582, + "step": 10939 + }, + { + "epoch": 1.37, + "grad_norm": 15.000322341918945, + "learning_rate": 1.084717399489604e-05, + "loss": 1.863, + "step": 10940 + }, + { + "epoch": 1.37, + "grad_norm": 9.42466926574707, + "learning_rate": 1.0846337279839351e-05, + "loss": 1.4959, + "step": 10941 + }, + { + "epoch": 1.37, + "grad_norm": 28.115360260009766, + "learning_rate": 1.0845500564782665e-05, + "loss": 1.155, + "step": 10942 + }, + { + "epoch": 1.37, + "grad_norm": 14.936105728149414, + "learning_rate": 1.0844663849725975e-05, + "loss": 2.4056, + "step": 10943 + }, + { + "epoch": 1.37, + "grad_norm": 13.039700508117676, + "learning_rate": 1.0843827134669289e-05, + "loss": 0.8834, + "step": 10944 + }, + { + "epoch": 1.37, + "grad_norm": 3.926374673843384, + "learning_rate": 1.0842990419612602e-05, + "loss": 1.1018, + "step": 10945 + }, + { + "epoch": 1.37, + "grad_norm": 16.927934646606445, + "learning_rate": 1.0842153704555913e-05, + "loss": 2.5262, + "step": 10946 + }, + { + "epoch": 1.37, + "grad_norm": 23.388479232788086, + "learning_rate": 1.0841316989499226e-05, + "loss": 2.0981, + "step": 10947 + }, + { + "epoch": 1.37, + "grad_norm": 5.1547932624816895, + "learning_rate": 1.084048027444254e-05, + "loss": 0.466, + "step": 10948 + }, + { + "epoch": 1.37, + "grad_norm": 16.516647338867188, + "learning_rate": 1.0839643559385854e-05, + "loss": 0.9563, + "step": 10949 + }, + { + "epoch": 1.37, + "grad_norm": 4.277315616607666, + "learning_rate": 1.0838806844329164e-05, + "loss": 0.377, + "step": 10950 + }, + { + "epoch": 1.37, + "grad_norm": 29.24924659729004, + "learning_rate": 1.0837970129272478e-05, + "loss": 0.8384, + "step": 10951 + }, + { + "epoch": 1.37, + "grad_norm": 7.615672588348389, + "learning_rate": 1.083713341421579e-05, + "loss": 1.0544, + "step": 10952 + }, + { + "epoch": 1.37, + "grad_norm": 15.231378555297852, + "learning_rate": 1.0836296699159101e-05, + "loss": 3.7036, + "step": 10953 + }, + { + "epoch": 1.37, + "grad_norm": 3.526764154434204, + "learning_rate": 1.0835459984102415e-05, + "loss": 0.1095, + "step": 10954 + }, + { + "epoch": 1.37, + "grad_norm": 67.74472045898438, + "learning_rate": 1.0834623269045727e-05, + "loss": 1.8468, + "step": 10955 + }, + { + "epoch": 1.37, + "grad_norm": 14.515176773071289, + "learning_rate": 1.083378655398904e-05, + "loss": 0.5935, + "step": 10956 + }, + { + "epoch": 1.38, + "grad_norm": 28.489931106567383, + "learning_rate": 1.0832949838932351e-05, + "loss": 2.053, + "step": 10957 + }, + { + "epoch": 1.38, + "grad_norm": 5.610323905944824, + "learning_rate": 1.0832113123875665e-05, + "loss": 0.5024, + "step": 10958 + }, + { + "epoch": 1.38, + "grad_norm": 141.2397003173828, + "learning_rate": 1.0831276408818978e-05, + "loss": 1.8588, + "step": 10959 + }, + { + "epoch": 1.38, + "grad_norm": 23.450443267822266, + "learning_rate": 1.0830439693762289e-05, + "loss": 1.5275, + "step": 10960 + }, + { + "epoch": 1.38, + "grad_norm": 33.665977478027344, + "learning_rate": 1.0829602978705602e-05, + "loss": 2.2573, + "step": 10961 + }, + { + "epoch": 1.38, + "grad_norm": 4.286736965179443, + "learning_rate": 1.0828766263648916e-05, + "loss": 0.3182, + "step": 10962 + }, + { + "epoch": 1.38, + "grad_norm": 30.770402908325195, + "learning_rate": 1.082792954859223e-05, + "loss": 1.2403, + "step": 10963 + }, + { + "epoch": 1.38, + "grad_norm": 7.3374247550964355, + "learning_rate": 1.082709283353554e-05, + "loss": 1.1554, + "step": 10964 + }, + { + "epoch": 1.38, + "grad_norm": 28.713409423828125, + "learning_rate": 1.0826256118478853e-05, + "loss": 1.6937, + "step": 10965 + }, + { + "epoch": 1.38, + "grad_norm": 6.346406936645508, + "learning_rate": 1.0825419403422165e-05, + "loss": 2.4099, + "step": 10966 + }, + { + "epoch": 1.38, + "grad_norm": 12.938896179199219, + "learning_rate": 1.0824582688365477e-05, + "loss": 1.9321, + "step": 10967 + }, + { + "epoch": 1.38, + "grad_norm": 14.221976280212402, + "learning_rate": 1.0823745973308791e-05, + "loss": 0.5429, + "step": 10968 + }, + { + "epoch": 1.38, + "grad_norm": 9.394267082214355, + "learning_rate": 1.0822909258252103e-05, + "loss": 0.7491, + "step": 10969 + }, + { + "epoch": 1.38, + "grad_norm": 6.654685020446777, + "learning_rate": 1.0822072543195417e-05, + "loss": 0.8189, + "step": 10970 + }, + { + "epoch": 1.38, + "grad_norm": 38.056480407714844, + "learning_rate": 1.0821235828138727e-05, + "loss": 1.2402, + "step": 10971 + }, + { + "epoch": 1.38, + "grad_norm": 9.358619689941406, + "learning_rate": 1.082039911308204e-05, + "loss": 0.9531, + "step": 10972 + }, + { + "epoch": 1.38, + "grad_norm": 15.162528991699219, + "learning_rate": 1.0819562398025354e-05, + "loss": 1.2263, + "step": 10973 + }, + { + "epoch": 1.38, + "grad_norm": 26.586856842041016, + "learning_rate": 1.0818725682968664e-05, + "loss": 2.359, + "step": 10974 + }, + { + "epoch": 1.38, + "grad_norm": 7.965980529785156, + "learning_rate": 1.0817888967911978e-05, + "loss": 0.2239, + "step": 10975 + }, + { + "epoch": 1.38, + "grad_norm": 26.117725372314453, + "learning_rate": 1.0817052252855292e-05, + "loss": 2.2138, + "step": 10976 + }, + { + "epoch": 1.38, + "grad_norm": 14.616991996765137, + "learning_rate": 1.0816215537798604e-05, + "loss": 0.8846, + "step": 10977 + }, + { + "epoch": 1.38, + "grad_norm": 9.697427749633789, + "learning_rate": 1.0815378822741916e-05, + "loss": 0.7569, + "step": 10978 + }, + { + "epoch": 1.38, + "grad_norm": 12.512846946716309, + "learning_rate": 1.081454210768523e-05, + "loss": 0.5427, + "step": 10979 + }, + { + "epoch": 1.38, + "grad_norm": 8.949728012084961, + "learning_rate": 1.0813705392628541e-05, + "loss": 1.1092, + "step": 10980 + }, + { + "epoch": 1.38, + "grad_norm": 19.04229164123535, + "learning_rate": 1.0812868677571853e-05, + "loss": 0.796, + "step": 10981 + }, + { + "epoch": 1.38, + "grad_norm": 12.645513534545898, + "learning_rate": 1.0812031962515165e-05, + "loss": 1.5419, + "step": 10982 + }, + { + "epoch": 1.38, + "grad_norm": 12.557134628295898, + "learning_rate": 1.0811195247458479e-05, + "loss": 1.5873, + "step": 10983 + }, + { + "epoch": 1.38, + "grad_norm": 7.657785892486572, + "learning_rate": 1.0810358532401792e-05, + "loss": 0.8016, + "step": 10984 + }, + { + "epoch": 1.38, + "grad_norm": 15.9237642288208, + "learning_rate": 1.0809521817345103e-05, + "loss": 1.4037, + "step": 10985 + }, + { + "epoch": 1.38, + "grad_norm": 11.617383003234863, + "learning_rate": 1.0808685102288416e-05, + "loss": 0.4267, + "step": 10986 + }, + { + "epoch": 1.38, + "grad_norm": 28.64150047302246, + "learning_rate": 1.080784838723173e-05, + "loss": 1.7304, + "step": 10987 + }, + { + "epoch": 1.38, + "grad_norm": 12.376594543457031, + "learning_rate": 1.080701167217504e-05, + "loss": 1.46, + "step": 10988 + }, + { + "epoch": 1.38, + "grad_norm": 10.38775634765625, + "learning_rate": 1.0806174957118354e-05, + "loss": 0.4572, + "step": 10989 + }, + { + "epoch": 1.38, + "grad_norm": 11.072006225585938, + "learning_rate": 1.0805338242061668e-05, + "loss": 0.5525, + "step": 10990 + }, + { + "epoch": 1.38, + "grad_norm": 5.45393180847168, + "learning_rate": 1.080450152700498e-05, + "loss": 1.8282, + "step": 10991 + }, + { + "epoch": 1.38, + "grad_norm": 12.413850784301758, + "learning_rate": 1.0803664811948291e-05, + "loss": 0.4873, + "step": 10992 + }, + { + "epoch": 1.38, + "grad_norm": 25.184465408325195, + "learning_rate": 1.0802828096891605e-05, + "loss": 2.4874, + "step": 10993 + }, + { + "epoch": 1.38, + "grad_norm": 21.327537536621094, + "learning_rate": 1.0801991381834917e-05, + "loss": 1.5459, + "step": 10994 + }, + { + "epoch": 1.38, + "grad_norm": 16.912900924682617, + "learning_rate": 1.0801154666778229e-05, + "loss": 1.3192, + "step": 10995 + }, + { + "epoch": 1.38, + "grad_norm": 23.991470336914062, + "learning_rate": 1.0800317951721541e-05, + "loss": 2.3134, + "step": 10996 + }, + { + "epoch": 1.38, + "grad_norm": 6.955298900604248, + "learning_rate": 1.0799481236664855e-05, + "loss": 0.5381, + "step": 10997 + }, + { + "epoch": 1.38, + "grad_norm": 15.276537895202637, + "learning_rate": 1.0798644521608168e-05, + "loss": 0.6888, + "step": 10998 + }, + { + "epoch": 1.38, + "grad_norm": 14.224745750427246, + "learning_rate": 1.0797807806551479e-05, + "loss": 1.1304, + "step": 10999 + }, + { + "epoch": 1.38, + "grad_norm": 14.31340503692627, + "learning_rate": 1.0796971091494792e-05, + "loss": 1.3738, + "step": 11000 + }, + { + "epoch": 1.38, + "grad_norm": 33.715232849121094, + "learning_rate": 1.0796134376438106e-05, + "loss": 1.6522, + "step": 11001 + }, + { + "epoch": 1.38, + "grad_norm": 7.345632076263428, + "learning_rate": 1.0795297661381416e-05, + "loss": 0.1618, + "step": 11002 + }, + { + "epoch": 1.38, + "grad_norm": 23.19948387145996, + "learning_rate": 1.079446094632473e-05, + "loss": 1.6154, + "step": 11003 + }, + { + "epoch": 1.38, + "grad_norm": 18.290569305419922, + "learning_rate": 1.0793624231268043e-05, + "loss": 2.6678, + "step": 11004 + }, + { + "epoch": 1.38, + "grad_norm": 13.243331909179688, + "learning_rate": 1.0792787516211355e-05, + "loss": 0.7295, + "step": 11005 + }, + { + "epoch": 1.38, + "grad_norm": 25.255516052246094, + "learning_rate": 1.0791950801154667e-05, + "loss": 2.2459, + "step": 11006 + }, + { + "epoch": 1.38, + "grad_norm": 30.202260971069336, + "learning_rate": 1.0791114086097981e-05, + "loss": 1.402, + "step": 11007 + }, + { + "epoch": 1.38, + "grad_norm": 9.659870147705078, + "learning_rate": 1.0790277371041293e-05, + "loss": 0.3835, + "step": 11008 + }, + { + "epoch": 1.38, + "grad_norm": 10.047741889953613, + "learning_rate": 1.0789440655984605e-05, + "loss": 0.9404, + "step": 11009 + }, + { + "epoch": 1.38, + "grad_norm": 12.712565422058105, + "learning_rate": 1.0788603940927917e-05, + "loss": 2.0874, + "step": 11010 + }, + { + "epoch": 1.38, + "grad_norm": 4.661409378051758, + "learning_rate": 1.078776722587123e-05, + "loss": 0.3309, + "step": 11011 + }, + { + "epoch": 1.38, + "grad_norm": 20.807668685913086, + "learning_rate": 1.0786930510814544e-05, + "loss": 1.5155, + "step": 11012 + }, + { + "epoch": 1.38, + "grad_norm": 9.328977584838867, + "learning_rate": 1.0786093795757854e-05, + "loss": 0.8445, + "step": 11013 + }, + { + "epoch": 1.38, + "grad_norm": 37.37929153442383, + "learning_rate": 1.0785257080701168e-05, + "loss": 4.7872, + "step": 11014 + }, + { + "epoch": 1.38, + "grad_norm": 9.015878677368164, + "learning_rate": 1.0784420365644482e-05, + "loss": 2.3438, + "step": 11015 + }, + { + "epoch": 1.38, + "grad_norm": 26.955360412597656, + "learning_rate": 1.0783583650587792e-05, + "loss": 1.3095, + "step": 11016 + }, + { + "epoch": 1.38, + "grad_norm": 23.8460693359375, + "learning_rate": 1.0782746935531106e-05, + "loss": 1.2315, + "step": 11017 + }, + { + "epoch": 1.38, + "grad_norm": 13.200667381286621, + "learning_rate": 1.078191022047442e-05, + "loss": 2.8354, + "step": 11018 + }, + { + "epoch": 1.38, + "grad_norm": 29.953876495361328, + "learning_rate": 1.0781073505417731e-05, + "loss": 0.7924, + "step": 11019 + }, + { + "epoch": 1.38, + "grad_norm": 5.776995658874512, + "learning_rate": 1.0780236790361043e-05, + "loss": 1.5278, + "step": 11020 + }, + { + "epoch": 1.38, + "grad_norm": 12.799798011779785, + "learning_rate": 1.0779400075304357e-05, + "loss": 2.6669, + "step": 11021 + }, + { + "epoch": 1.38, + "grad_norm": 12.158820152282715, + "learning_rate": 1.0778563360247669e-05, + "loss": 1.147, + "step": 11022 + }, + { + "epoch": 1.38, + "grad_norm": 5.1400532722473145, + "learning_rate": 1.077772664519098e-05, + "loss": 1.771, + "step": 11023 + }, + { + "epoch": 1.38, + "grad_norm": 11.69130802154541, + "learning_rate": 1.0776889930134293e-05, + "loss": 0.685, + "step": 11024 + }, + { + "epoch": 1.38, + "grad_norm": 25.547908782958984, + "learning_rate": 1.0776053215077606e-05, + "loss": 1.9566, + "step": 11025 + }, + { + "epoch": 1.38, + "grad_norm": 9.598379135131836, + "learning_rate": 1.077521650002092e-05, + "loss": 0.4463, + "step": 11026 + }, + { + "epoch": 1.38, + "grad_norm": 25.001876831054688, + "learning_rate": 1.077437978496423e-05, + "loss": 0.8894, + "step": 11027 + }, + { + "epoch": 1.38, + "grad_norm": 12.50566577911377, + "learning_rate": 1.0773543069907544e-05, + "loss": 0.563, + "step": 11028 + }, + { + "epoch": 1.38, + "grad_norm": 11.333840370178223, + "learning_rate": 1.0772706354850857e-05, + "loss": 0.8401, + "step": 11029 + }, + { + "epoch": 1.38, + "grad_norm": 49.985897064208984, + "learning_rate": 1.0771869639794168e-05, + "loss": 1.3542, + "step": 11030 + }, + { + "epoch": 1.38, + "grad_norm": 20.68662452697754, + "learning_rate": 1.0771032924737481e-05, + "loss": 1.6074, + "step": 11031 + }, + { + "epoch": 1.38, + "grad_norm": 5.764721393585205, + "learning_rate": 1.0770196209680795e-05, + "loss": 1.8161, + "step": 11032 + }, + { + "epoch": 1.38, + "grad_norm": 21.266937255859375, + "learning_rate": 1.0769359494624107e-05, + "loss": 1.4947, + "step": 11033 + }, + { + "epoch": 1.38, + "grad_norm": 22.41715431213379, + "learning_rate": 1.0768522779567419e-05, + "loss": 2.498, + "step": 11034 + }, + { + "epoch": 1.38, + "grad_norm": 45.06110382080078, + "learning_rate": 1.0767686064510731e-05, + "loss": 3.1562, + "step": 11035 + }, + { + "epoch": 1.38, + "grad_norm": 10.207090377807617, + "learning_rate": 1.0766849349454045e-05, + "loss": 0.7283, + "step": 11036 + }, + { + "epoch": 1.39, + "grad_norm": 32.75028610229492, + "learning_rate": 1.0766012634397357e-05, + "loss": 0.9633, + "step": 11037 + }, + { + "epoch": 1.39, + "grad_norm": 11.14129638671875, + "learning_rate": 1.0765175919340668e-05, + "loss": 1.9259, + "step": 11038 + }, + { + "epoch": 1.39, + "grad_norm": 9.889047622680664, + "learning_rate": 1.0764339204283982e-05, + "loss": 0.4887, + "step": 11039 + }, + { + "epoch": 1.39, + "grad_norm": 16.445697784423828, + "learning_rate": 1.0763502489227296e-05, + "loss": 0.8201, + "step": 11040 + }, + { + "epoch": 1.39, + "grad_norm": 21.291526794433594, + "learning_rate": 1.0762665774170606e-05, + "loss": 1.5882, + "step": 11041 + }, + { + "epoch": 1.39, + "grad_norm": 6.2344465255737305, + "learning_rate": 1.076182905911392e-05, + "loss": 0.3961, + "step": 11042 + }, + { + "epoch": 1.39, + "grad_norm": 17.333131790161133, + "learning_rate": 1.0760992344057233e-05, + "loss": 1.5523, + "step": 11043 + }, + { + "epoch": 1.39, + "grad_norm": 5.335277557373047, + "learning_rate": 1.0760155629000544e-05, + "loss": 1.0519, + "step": 11044 + }, + { + "epoch": 1.39, + "grad_norm": 7.969385147094727, + "learning_rate": 1.0759318913943857e-05, + "loss": 0.338, + "step": 11045 + }, + { + "epoch": 1.39, + "grad_norm": 8.689923286437988, + "learning_rate": 1.0758482198887171e-05, + "loss": 0.7622, + "step": 11046 + }, + { + "epoch": 1.39, + "grad_norm": 17.758501052856445, + "learning_rate": 1.0757645483830483e-05, + "loss": 1.8925, + "step": 11047 + }, + { + "epoch": 1.39, + "grad_norm": 8.584065437316895, + "learning_rate": 1.0756808768773795e-05, + "loss": 0.677, + "step": 11048 + }, + { + "epoch": 1.39, + "grad_norm": 11.227311134338379, + "learning_rate": 1.0755972053717107e-05, + "loss": 2.2452, + "step": 11049 + }, + { + "epoch": 1.39, + "grad_norm": 31.221769332885742, + "learning_rate": 1.075513533866042e-05, + "loss": 0.7633, + "step": 11050 + }, + { + "epoch": 1.39, + "grad_norm": 20.232498168945312, + "learning_rate": 1.0754298623603732e-05, + "loss": 2.8231, + "step": 11051 + }, + { + "epoch": 1.39, + "grad_norm": 11.236428260803223, + "learning_rate": 1.0753461908547044e-05, + "loss": 1.6659, + "step": 11052 + }, + { + "epoch": 1.39, + "grad_norm": 5.119511604309082, + "learning_rate": 1.0752625193490358e-05, + "loss": 2.6232, + "step": 11053 + }, + { + "epoch": 1.39, + "grad_norm": 27.271800994873047, + "learning_rate": 1.0751788478433672e-05, + "loss": 1.7496, + "step": 11054 + }, + { + "epoch": 1.39, + "grad_norm": 33.42025375366211, + "learning_rate": 1.0750951763376982e-05, + "loss": 0.9294, + "step": 11055 + }, + { + "epoch": 1.39, + "grad_norm": 41.25328063964844, + "learning_rate": 1.0750115048320296e-05, + "loss": 2.0432, + "step": 11056 + }, + { + "epoch": 1.39, + "grad_norm": 8.878971099853516, + "learning_rate": 1.074927833326361e-05, + "loss": 0.5169, + "step": 11057 + }, + { + "epoch": 1.39, + "grad_norm": 14.663554191589355, + "learning_rate": 1.074844161820692e-05, + "loss": 1.2426, + "step": 11058 + }, + { + "epoch": 1.39, + "grad_norm": 13.726330757141113, + "learning_rate": 1.0747604903150233e-05, + "loss": 1.5593, + "step": 11059 + }, + { + "epoch": 1.39, + "grad_norm": 17.34848976135254, + "learning_rate": 1.0746768188093547e-05, + "loss": 0.9445, + "step": 11060 + }, + { + "epoch": 1.39, + "grad_norm": 13.294726371765137, + "learning_rate": 1.0745931473036859e-05, + "loss": 2.4288, + "step": 11061 + }, + { + "epoch": 1.39, + "grad_norm": 9.532093048095703, + "learning_rate": 1.074509475798017e-05, + "loss": 2.0992, + "step": 11062 + }, + { + "epoch": 1.39, + "grad_norm": 18.581283569335938, + "learning_rate": 1.0744258042923483e-05, + "loss": 3.5613, + "step": 11063 + }, + { + "epoch": 1.39, + "grad_norm": 13.983625411987305, + "learning_rate": 1.0743421327866796e-05, + "loss": 1.7882, + "step": 11064 + }, + { + "epoch": 1.39, + "grad_norm": 15.763188362121582, + "learning_rate": 1.0742584612810108e-05, + "loss": 1.6749, + "step": 11065 + }, + { + "epoch": 1.39, + "grad_norm": 13.039581298828125, + "learning_rate": 1.074174789775342e-05, + "loss": 2.1293, + "step": 11066 + }, + { + "epoch": 1.39, + "grad_norm": 10.097599983215332, + "learning_rate": 1.0740911182696734e-05, + "loss": 0.3257, + "step": 11067 + }, + { + "epoch": 1.39, + "grad_norm": 10.246635437011719, + "learning_rate": 1.0740074467640047e-05, + "loss": 1.0761, + "step": 11068 + }, + { + "epoch": 1.39, + "grad_norm": 18.120683670043945, + "learning_rate": 1.0739237752583358e-05, + "loss": 2.1921, + "step": 11069 + }, + { + "epoch": 1.39, + "grad_norm": 10.108755111694336, + "learning_rate": 1.0738401037526671e-05, + "loss": 1.1033, + "step": 11070 + }, + { + "epoch": 1.39, + "grad_norm": 20.601940155029297, + "learning_rate": 1.0737564322469985e-05, + "loss": 0.8256, + "step": 11071 + }, + { + "epoch": 1.39, + "grad_norm": 13.79460334777832, + "learning_rate": 1.0736727607413295e-05, + "loss": 1.4347, + "step": 11072 + }, + { + "epoch": 1.39, + "grad_norm": 25.632383346557617, + "learning_rate": 1.0735890892356609e-05, + "loss": 1.1222, + "step": 11073 + }, + { + "epoch": 1.39, + "grad_norm": 2.9022481441497803, + "learning_rate": 1.0735054177299923e-05, + "loss": 0.0795, + "step": 11074 + }, + { + "epoch": 1.39, + "grad_norm": 10.55939769744873, + "learning_rate": 1.0734217462243235e-05, + "loss": 2.9203, + "step": 11075 + }, + { + "epoch": 1.39, + "grad_norm": 41.440025329589844, + "learning_rate": 1.0733380747186546e-05, + "loss": 2.0544, + "step": 11076 + }, + { + "epoch": 1.39, + "grad_norm": 2.9340813159942627, + "learning_rate": 1.0732544032129858e-05, + "loss": 0.0388, + "step": 11077 + }, + { + "epoch": 1.39, + "grad_norm": 19.389799118041992, + "learning_rate": 1.0731707317073172e-05, + "loss": 0.9938, + "step": 11078 + }, + { + "epoch": 1.39, + "grad_norm": 10.55417251586914, + "learning_rate": 1.0730870602016484e-05, + "loss": 1.8281, + "step": 11079 + }, + { + "epoch": 1.39, + "grad_norm": 9.094369888305664, + "learning_rate": 1.0730033886959796e-05, + "loss": 0.5223, + "step": 11080 + }, + { + "epoch": 1.39, + "grad_norm": 12.736441612243652, + "learning_rate": 1.072919717190311e-05, + "loss": 0.9671, + "step": 11081 + }, + { + "epoch": 1.39, + "grad_norm": 37.989707946777344, + "learning_rate": 1.0728360456846423e-05, + "loss": 1.6481, + "step": 11082 + }, + { + "epoch": 1.39, + "grad_norm": 8.87106704711914, + "learning_rate": 1.0727523741789734e-05, + "loss": 1.3616, + "step": 11083 + }, + { + "epoch": 1.39, + "grad_norm": 14.872847557067871, + "learning_rate": 1.0726687026733047e-05, + "loss": 0.4853, + "step": 11084 + }, + { + "epoch": 1.39, + "grad_norm": 36.0228385925293, + "learning_rate": 1.072585031167636e-05, + "loss": 1.675, + "step": 11085 + }, + { + "epoch": 1.39, + "grad_norm": 14.692157745361328, + "learning_rate": 1.0725013596619671e-05, + "loss": 1.6181, + "step": 11086 + }, + { + "epoch": 1.39, + "grad_norm": 7.602644920349121, + "learning_rate": 1.0724176881562985e-05, + "loss": 0.9902, + "step": 11087 + }, + { + "epoch": 1.39, + "grad_norm": 51.961585998535156, + "learning_rate": 1.0723340166506297e-05, + "loss": 1.2465, + "step": 11088 + }, + { + "epoch": 1.39, + "grad_norm": 20.320905685424805, + "learning_rate": 1.072250345144961e-05, + "loss": 2.1883, + "step": 11089 + }, + { + "epoch": 1.39, + "grad_norm": 9.149676322937012, + "learning_rate": 1.0721666736392922e-05, + "loss": 0.6142, + "step": 11090 + }, + { + "epoch": 1.39, + "grad_norm": 14.76880931854248, + "learning_rate": 1.0720830021336234e-05, + "loss": 1.689, + "step": 11091 + }, + { + "epoch": 1.39, + "grad_norm": 13.005758285522461, + "learning_rate": 1.0719993306279548e-05, + "loss": 1.1633, + "step": 11092 + }, + { + "epoch": 1.39, + "grad_norm": 34.54526901245117, + "learning_rate": 1.071915659122286e-05, + "loss": 1.3603, + "step": 11093 + }, + { + "epoch": 1.39, + "grad_norm": 54.09306335449219, + "learning_rate": 1.0718319876166172e-05, + "loss": 1.089, + "step": 11094 + }, + { + "epoch": 1.39, + "grad_norm": 13.492703437805176, + "learning_rate": 1.0717483161109485e-05, + "loss": 0.705, + "step": 11095 + }, + { + "epoch": 1.39, + "grad_norm": 10.318517684936523, + "learning_rate": 1.0716646446052799e-05, + "loss": 0.9415, + "step": 11096 + }, + { + "epoch": 1.39, + "grad_norm": 8.012618064880371, + "learning_rate": 1.071580973099611e-05, + "loss": 0.3978, + "step": 11097 + }, + { + "epoch": 1.39, + "grad_norm": 9.157938003540039, + "learning_rate": 1.0714973015939423e-05, + "loss": 0.9549, + "step": 11098 + }, + { + "epoch": 1.39, + "grad_norm": 15.60629940032959, + "learning_rate": 1.0714136300882737e-05, + "loss": 1.1989, + "step": 11099 + }, + { + "epoch": 1.39, + "grad_norm": 11.698266983032227, + "learning_rate": 1.0713299585826047e-05, + "loss": 0.4457, + "step": 11100 + }, + { + "epoch": 1.39, + "grad_norm": 9.133544921875, + "learning_rate": 1.071246287076936e-05, + "loss": 0.5908, + "step": 11101 + }, + { + "epoch": 1.39, + "grad_norm": 19.813976287841797, + "learning_rate": 1.0711626155712673e-05, + "loss": 1.8152, + "step": 11102 + }, + { + "epoch": 1.39, + "grad_norm": 22.051944732666016, + "learning_rate": 1.0710789440655986e-05, + "loss": 1.7517, + "step": 11103 + }, + { + "epoch": 1.39, + "grad_norm": 163.89105224609375, + "learning_rate": 1.0709952725599298e-05, + "loss": 1.6949, + "step": 11104 + }, + { + "epoch": 1.39, + "grad_norm": 6.643505096435547, + "learning_rate": 1.070911601054261e-05, + "loss": 0.0331, + "step": 11105 + }, + { + "epoch": 1.39, + "grad_norm": 7.673321723937988, + "learning_rate": 1.0708279295485924e-05, + "loss": 1.1346, + "step": 11106 + }, + { + "epoch": 1.39, + "grad_norm": 25.48453140258789, + "learning_rate": 1.0707442580429234e-05, + "loss": 1.3993, + "step": 11107 + }, + { + "epoch": 1.39, + "grad_norm": 54.76433563232422, + "learning_rate": 1.0706605865372548e-05, + "loss": 1.6626, + "step": 11108 + }, + { + "epoch": 1.39, + "grad_norm": 10.953798294067383, + "learning_rate": 1.0705769150315861e-05, + "loss": 2.0811, + "step": 11109 + }, + { + "epoch": 1.39, + "grad_norm": 43.57255554199219, + "learning_rate": 1.0704932435259175e-05, + "loss": 2.039, + "step": 11110 + }, + { + "epoch": 1.39, + "grad_norm": 6.343830108642578, + "learning_rate": 1.0704095720202485e-05, + "loss": 0.3893, + "step": 11111 + }, + { + "epoch": 1.39, + "grad_norm": 12.723817825317383, + "learning_rate": 1.0703259005145799e-05, + "loss": 1.3809, + "step": 11112 + }, + { + "epoch": 1.39, + "grad_norm": 10.008481979370117, + "learning_rate": 1.0702422290089113e-05, + "loss": 0.3705, + "step": 11113 + }, + { + "epoch": 1.39, + "grad_norm": 16.619836807250977, + "learning_rate": 1.0701585575032423e-05, + "loss": 0.363, + "step": 11114 + }, + { + "epoch": 1.39, + "grad_norm": 29.2028865814209, + "learning_rate": 1.0700748859975736e-05, + "loss": 1.9747, + "step": 11115 + }, + { + "epoch": 1.4, + "grad_norm": 12.458796501159668, + "learning_rate": 1.0699912144919048e-05, + "loss": 1.2696, + "step": 11116 + }, + { + "epoch": 1.4, + "grad_norm": 7.526585102081299, + "learning_rate": 1.0699075429862362e-05, + "loss": 0.1859, + "step": 11117 + }, + { + "epoch": 1.4, + "grad_norm": 7.33280086517334, + "learning_rate": 1.0698238714805674e-05, + "loss": 1.7716, + "step": 11118 + }, + { + "epoch": 1.4, + "grad_norm": 7.18589973449707, + "learning_rate": 1.0697401999748986e-05, + "loss": 1.2223, + "step": 11119 + }, + { + "epoch": 1.4, + "grad_norm": 7.9433746337890625, + "learning_rate": 1.06965652846923e-05, + "loss": 1.883, + "step": 11120 + }, + { + "epoch": 1.4, + "grad_norm": 6.184037208557129, + "learning_rate": 1.069572856963561e-05, + "loss": 0.7084, + "step": 11121 + }, + { + "epoch": 1.4, + "grad_norm": 5.708268165588379, + "learning_rate": 1.0694891854578923e-05, + "loss": 0.48, + "step": 11122 + }, + { + "epoch": 1.4, + "grad_norm": 21.270519256591797, + "learning_rate": 1.0694055139522237e-05, + "loss": 1.8448, + "step": 11123 + }, + { + "epoch": 1.4, + "grad_norm": 26.80547332763672, + "learning_rate": 1.069321842446555e-05, + "loss": 2.0975, + "step": 11124 + }, + { + "epoch": 1.4, + "grad_norm": 8.265152931213379, + "learning_rate": 1.0692381709408861e-05, + "loss": 2.047, + "step": 11125 + }, + { + "epoch": 1.4, + "grad_norm": 7.684669494628906, + "learning_rate": 1.0691544994352175e-05, + "loss": 1.9874, + "step": 11126 + }, + { + "epoch": 1.4, + "grad_norm": 6.357775688171387, + "learning_rate": 1.0690708279295487e-05, + "loss": 0.3144, + "step": 11127 + }, + { + "epoch": 1.4, + "grad_norm": 6.835781097412109, + "learning_rate": 1.0689871564238799e-05, + "loss": 0.5789, + "step": 11128 + }, + { + "epoch": 1.4, + "grad_norm": 29.526168823242188, + "learning_rate": 1.0689034849182112e-05, + "loss": 1.5356, + "step": 11129 + }, + { + "epoch": 1.4, + "grad_norm": 14.135659217834473, + "learning_rate": 1.0688198134125424e-05, + "loss": 0.3501, + "step": 11130 + }, + { + "epoch": 1.4, + "grad_norm": 41.2221565246582, + "learning_rate": 1.0687361419068738e-05, + "loss": 2.5301, + "step": 11131 + }, + { + "epoch": 1.4, + "grad_norm": 25.79400634765625, + "learning_rate": 1.068652470401205e-05, + "loss": 3.3588, + "step": 11132 + }, + { + "epoch": 1.4, + "grad_norm": 10.162888526916504, + "learning_rate": 1.0685687988955362e-05, + "loss": 0.6071, + "step": 11133 + }, + { + "epoch": 1.4, + "grad_norm": 42.55033493041992, + "learning_rate": 1.0684851273898675e-05, + "loss": 2.1959, + "step": 11134 + }, + { + "epoch": 1.4, + "grad_norm": 4.78655481338501, + "learning_rate": 1.0684014558841986e-05, + "loss": 0.3336, + "step": 11135 + }, + { + "epoch": 1.4, + "grad_norm": 28.380346298217773, + "learning_rate": 1.06831778437853e-05, + "loss": 1.1981, + "step": 11136 + }, + { + "epoch": 1.4, + "grad_norm": 25.562061309814453, + "learning_rate": 1.0682341128728613e-05, + "loss": 0.5148, + "step": 11137 + }, + { + "epoch": 1.4, + "grad_norm": 12.578266143798828, + "learning_rate": 1.0681504413671927e-05, + "loss": 2.066, + "step": 11138 + }, + { + "epoch": 1.4, + "grad_norm": 9.793967247009277, + "learning_rate": 1.0680667698615237e-05, + "loss": 1.0688, + "step": 11139 + }, + { + "epoch": 1.4, + "grad_norm": 10.32377815246582, + "learning_rate": 1.067983098355855e-05, + "loss": 1.2779, + "step": 11140 + }, + { + "epoch": 1.4, + "grad_norm": 9.000517845153809, + "learning_rate": 1.0678994268501862e-05, + "loss": 1.5666, + "step": 11141 + }, + { + "epoch": 1.4, + "grad_norm": 8.126484870910645, + "learning_rate": 1.0678157553445174e-05, + "loss": 1.2407, + "step": 11142 + }, + { + "epoch": 1.4, + "grad_norm": 16.945219039916992, + "learning_rate": 1.0677320838388488e-05, + "loss": 2.471, + "step": 11143 + }, + { + "epoch": 1.4, + "grad_norm": 7.179367542266846, + "learning_rate": 1.06764841233318e-05, + "loss": 0.2999, + "step": 11144 + }, + { + "epoch": 1.4, + "grad_norm": 18.401262283325195, + "learning_rate": 1.0675647408275114e-05, + "loss": 1.4733, + "step": 11145 + }, + { + "epoch": 1.4, + "grad_norm": 13.894248008728027, + "learning_rate": 1.0674810693218424e-05, + "loss": 1.1163, + "step": 11146 + }, + { + "epoch": 1.4, + "grad_norm": 11.486808776855469, + "learning_rate": 1.0673973978161738e-05, + "loss": 2.0351, + "step": 11147 + }, + { + "epoch": 1.4, + "grad_norm": 12.675158500671387, + "learning_rate": 1.0673137263105051e-05, + "loss": 1.801, + "step": 11148 + }, + { + "epoch": 1.4, + "grad_norm": 18.01869773864746, + "learning_rate": 1.0672300548048362e-05, + "loss": 1.8798, + "step": 11149 + }, + { + "epoch": 1.4, + "grad_norm": 26.31698989868164, + "learning_rate": 1.0671463832991675e-05, + "loss": 1.3533, + "step": 11150 + }, + { + "epoch": 1.4, + "grad_norm": 21.525146484375, + "learning_rate": 1.0670627117934989e-05, + "loss": 1.5077, + "step": 11151 + }, + { + "epoch": 1.4, + "grad_norm": 13.209208488464355, + "learning_rate": 1.0669790402878302e-05, + "loss": 1.3362, + "step": 11152 + }, + { + "epoch": 1.4, + "grad_norm": 18.235010147094727, + "learning_rate": 1.0668953687821613e-05, + "loss": 2.3339, + "step": 11153 + }, + { + "epoch": 1.4, + "grad_norm": 3.60266375541687, + "learning_rate": 1.0668116972764926e-05, + "loss": 0.0966, + "step": 11154 + }, + { + "epoch": 1.4, + "grad_norm": 5.538740158081055, + "learning_rate": 1.0667280257708238e-05, + "loss": 0.6292, + "step": 11155 + }, + { + "epoch": 1.4, + "grad_norm": 10.977618217468262, + "learning_rate": 1.066644354265155e-05, + "loss": 0.8987, + "step": 11156 + }, + { + "epoch": 1.4, + "grad_norm": 7.504702091217041, + "learning_rate": 1.0665606827594864e-05, + "loss": 1.1534, + "step": 11157 + }, + { + "epoch": 1.4, + "grad_norm": 14.682522773742676, + "learning_rate": 1.0664770112538176e-05, + "loss": 0.4943, + "step": 11158 + }, + { + "epoch": 1.4, + "grad_norm": 28.776384353637695, + "learning_rate": 1.066393339748149e-05, + "loss": 3.0167, + "step": 11159 + }, + { + "epoch": 1.4, + "grad_norm": 19.993257522583008, + "learning_rate": 1.06630966824248e-05, + "loss": 2.2105, + "step": 11160 + }, + { + "epoch": 1.4, + "grad_norm": 3.3054468631744385, + "learning_rate": 1.0662259967368113e-05, + "loss": 0.3531, + "step": 11161 + }, + { + "epoch": 1.4, + "grad_norm": 18.72955322265625, + "learning_rate": 1.0661423252311427e-05, + "loss": 2.1389, + "step": 11162 + }, + { + "epoch": 1.4, + "grad_norm": 40.49760818481445, + "learning_rate": 1.0660586537254737e-05, + "loss": 2.056, + "step": 11163 + }, + { + "epoch": 1.4, + "grad_norm": 22.261398315429688, + "learning_rate": 1.0659749822198051e-05, + "loss": 2.8086, + "step": 11164 + }, + { + "epoch": 1.4, + "grad_norm": 7.372966766357422, + "learning_rate": 1.0658913107141365e-05, + "loss": 0.9135, + "step": 11165 + }, + { + "epoch": 1.4, + "grad_norm": 51.850547790527344, + "learning_rate": 1.0658076392084678e-05, + "loss": 1.1674, + "step": 11166 + }, + { + "epoch": 1.4, + "grad_norm": 14.914164543151855, + "learning_rate": 1.0657239677027989e-05, + "loss": 0.8428, + "step": 11167 + }, + { + "epoch": 1.4, + "grad_norm": 11.955036163330078, + "learning_rate": 1.0656402961971302e-05, + "loss": 1.8275, + "step": 11168 + }, + { + "epoch": 1.4, + "grad_norm": 2.5064961910247803, + "learning_rate": 1.0655566246914614e-05, + "loss": 0.1438, + "step": 11169 + }, + { + "epoch": 1.4, + "grad_norm": 26.521604537963867, + "learning_rate": 1.0654729531857926e-05, + "loss": 0.3152, + "step": 11170 + }, + { + "epoch": 1.4, + "grad_norm": 46.38266372680664, + "learning_rate": 1.065389281680124e-05, + "loss": 1.2401, + "step": 11171 + }, + { + "epoch": 1.4, + "grad_norm": 13.363889694213867, + "learning_rate": 1.0653056101744552e-05, + "loss": 1.3346, + "step": 11172 + }, + { + "epoch": 1.4, + "grad_norm": 52.94842529296875, + "learning_rate": 1.0652219386687865e-05, + "loss": 1.834, + "step": 11173 + }, + { + "epoch": 1.4, + "grad_norm": 101.2232666015625, + "learning_rate": 1.0651382671631176e-05, + "loss": 1.8024, + "step": 11174 + }, + { + "epoch": 1.4, + "grad_norm": 3.6217756271362305, + "learning_rate": 1.065054595657449e-05, + "loss": 0.08, + "step": 11175 + }, + { + "epoch": 1.4, + "grad_norm": 9.53453254699707, + "learning_rate": 1.0649709241517803e-05, + "loss": 0.6623, + "step": 11176 + }, + { + "epoch": 1.4, + "grad_norm": 38.03897476196289, + "learning_rate": 1.0648872526461113e-05, + "loss": 1.3396, + "step": 11177 + }, + { + "epoch": 1.4, + "grad_norm": 18.410459518432617, + "learning_rate": 1.0648035811404427e-05, + "loss": 1.9926, + "step": 11178 + }, + { + "epoch": 1.4, + "grad_norm": 17.08340072631836, + "learning_rate": 1.064719909634774e-05, + "loss": 1.7154, + "step": 11179 + }, + { + "epoch": 1.4, + "grad_norm": 9.814212799072266, + "learning_rate": 1.0646362381291052e-05, + "loss": 1.2624, + "step": 11180 + }, + { + "epoch": 1.4, + "grad_norm": 20.458986282348633, + "learning_rate": 1.0645525666234364e-05, + "loss": 1.3171, + "step": 11181 + }, + { + "epoch": 1.4, + "grad_norm": 22.185209274291992, + "learning_rate": 1.0644688951177678e-05, + "loss": 2.7426, + "step": 11182 + }, + { + "epoch": 1.4, + "grad_norm": 10.910929679870605, + "learning_rate": 1.064385223612099e-05, + "loss": 2.198, + "step": 11183 + }, + { + "epoch": 1.4, + "grad_norm": 22.389389038085938, + "learning_rate": 1.0643015521064302e-05, + "loss": 1.3108, + "step": 11184 + }, + { + "epoch": 1.4, + "grad_norm": 11.566834449768066, + "learning_rate": 1.0642178806007616e-05, + "loss": 0.5679, + "step": 11185 + }, + { + "epoch": 1.4, + "grad_norm": 6.975268363952637, + "learning_rate": 1.0641342090950928e-05, + "loss": 1.1047, + "step": 11186 + }, + { + "epoch": 1.4, + "grad_norm": 143.4335174560547, + "learning_rate": 1.0640505375894241e-05, + "loss": 1.254, + "step": 11187 + }, + { + "epoch": 1.4, + "grad_norm": 14.49937915802002, + "learning_rate": 1.0639668660837551e-05, + "loss": 1.3799, + "step": 11188 + }, + { + "epoch": 1.4, + "grad_norm": 11.432032585144043, + "learning_rate": 1.0638831945780865e-05, + "loss": 0.7361, + "step": 11189 + }, + { + "epoch": 1.4, + "grad_norm": 4.690389156341553, + "learning_rate": 1.0637995230724179e-05, + "loss": 1.2973, + "step": 11190 + }, + { + "epoch": 1.4, + "grad_norm": 6.342746257781982, + "learning_rate": 1.0637158515667489e-05, + "loss": 0.4781, + "step": 11191 + }, + { + "epoch": 1.4, + "grad_norm": 24.245380401611328, + "learning_rate": 1.0636321800610803e-05, + "loss": 2.3932, + "step": 11192 + }, + { + "epoch": 1.4, + "grad_norm": 19.726337432861328, + "learning_rate": 1.0635485085554116e-05, + "loss": 1.4064, + "step": 11193 + }, + { + "epoch": 1.4, + "grad_norm": 9.738777160644531, + "learning_rate": 1.0634648370497428e-05, + "loss": 0.4974, + "step": 11194 + }, + { + "epoch": 1.4, + "grad_norm": 16.016401290893555, + "learning_rate": 1.063381165544074e-05, + "loss": 1.3654, + "step": 11195 + }, + { + "epoch": 1.41, + "grad_norm": 29.034860610961914, + "learning_rate": 1.0632974940384054e-05, + "loss": 1.8016, + "step": 11196 + }, + { + "epoch": 1.41, + "grad_norm": 8.642669677734375, + "learning_rate": 1.0632138225327366e-05, + "loss": 1.2493, + "step": 11197 + }, + { + "epoch": 1.41, + "grad_norm": 18.040664672851562, + "learning_rate": 1.0631301510270678e-05, + "loss": 1.14, + "step": 11198 + }, + { + "epoch": 1.41, + "grad_norm": 12.934800148010254, + "learning_rate": 1.063046479521399e-05, + "loss": 0.777, + "step": 11199 + }, + { + "epoch": 1.41, + "grad_norm": 39.06425094604492, + "learning_rate": 1.0629628080157303e-05, + "loss": 2.6454, + "step": 11200 + }, + { + "epoch": 1.41, + "eval_loss": 0.0833502933382988, + "eval_runtime": 94.902, + "eval_samples_per_second": 37.323, + "eval_steps_per_second": 37.323, + "step": 11200 + }, + { + "epoch": 1.41, + "grad_norm": 24.45269012451172, + "learning_rate": 1.0628791365100617e-05, + "loss": 2.3683, + "step": 11201 + }, + { + "epoch": 1.41, + "grad_norm": 12.949420928955078, + "learning_rate": 1.0627954650043927e-05, + "loss": 1.3383, + "step": 11202 + }, + { + "epoch": 1.41, + "grad_norm": 5.550119400024414, + "learning_rate": 1.0627117934987241e-05, + "loss": 0.2897, + "step": 11203 + }, + { + "epoch": 1.41, + "grad_norm": 11.11974048614502, + "learning_rate": 1.0626281219930555e-05, + "loss": 1.0486, + "step": 11204 + }, + { + "epoch": 1.41, + "grad_norm": 11.411757469177246, + "learning_rate": 1.0625444504873865e-05, + "loss": 1.0877, + "step": 11205 + }, + { + "epoch": 1.41, + "grad_norm": 0.8041766285896301, + "learning_rate": 1.0624607789817179e-05, + "loss": 0.014, + "step": 11206 + }, + { + "epoch": 1.41, + "grad_norm": 9.608766555786133, + "learning_rate": 1.0623771074760492e-05, + "loss": 1.3749, + "step": 11207 + }, + { + "epoch": 1.41, + "grad_norm": 29.028156280517578, + "learning_rate": 1.0622934359703804e-05, + "loss": 1.3058, + "step": 11208 + }, + { + "epoch": 1.41, + "grad_norm": 14.396626472473145, + "learning_rate": 1.0622097644647116e-05, + "loss": 0.8126, + "step": 11209 + }, + { + "epoch": 1.41, + "grad_norm": 11.106209754943848, + "learning_rate": 1.062126092959043e-05, + "loss": 0.8738, + "step": 11210 + }, + { + "epoch": 1.41, + "grad_norm": 6.783996105194092, + "learning_rate": 1.0620424214533742e-05, + "loss": 0.5723, + "step": 11211 + }, + { + "epoch": 1.41, + "grad_norm": 126.74786376953125, + "learning_rate": 1.0619587499477054e-05, + "loss": 0.8913, + "step": 11212 + }, + { + "epoch": 1.41, + "grad_norm": 14.129014015197754, + "learning_rate": 1.0618750784420366e-05, + "loss": 1.6854, + "step": 11213 + }, + { + "epoch": 1.41, + "grad_norm": 56.18551254272461, + "learning_rate": 1.061791406936368e-05, + "loss": 3.4491, + "step": 11214 + }, + { + "epoch": 1.41, + "grad_norm": 8.337648391723633, + "learning_rate": 1.0617077354306993e-05, + "loss": 0.7548, + "step": 11215 + }, + { + "epoch": 1.41, + "grad_norm": 24.816200256347656, + "learning_rate": 1.0616240639250303e-05, + "loss": 1.7799, + "step": 11216 + }, + { + "epoch": 1.41, + "grad_norm": 15.044414520263672, + "learning_rate": 1.0615403924193617e-05, + "loss": 0.9788, + "step": 11217 + }, + { + "epoch": 1.41, + "grad_norm": 30.718883514404297, + "learning_rate": 1.061456720913693e-05, + "loss": 1.8779, + "step": 11218 + }, + { + "epoch": 1.41, + "grad_norm": 32.816646575927734, + "learning_rate": 1.061373049408024e-05, + "loss": 1.4192, + "step": 11219 + }, + { + "epoch": 1.41, + "grad_norm": 12.458331108093262, + "learning_rate": 1.0612893779023554e-05, + "loss": 0.6344, + "step": 11220 + }, + { + "epoch": 1.41, + "grad_norm": 18.22350311279297, + "learning_rate": 1.0612057063966868e-05, + "loss": 0.9284, + "step": 11221 + }, + { + "epoch": 1.41, + "grad_norm": 16.123369216918945, + "learning_rate": 1.061122034891018e-05, + "loss": 0.6967, + "step": 11222 + }, + { + "epoch": 1.41, + "grad_norm": 12.499308586120605, + "learning_rate": 1.0610383633853492e-05, + "loss": 1.1679, + "step": 11223 + }, + { + "epoch": 1.41, + "grad_norm": 4.924980640411377, + "learning_rate": 1.0609546918796806e-05, + "loss": 0.9552, + "step": 11224 + }, + { + "epoch": 1.41, + "grad_norm": 18.533519744873047, + "learning_rate": 1.0608710203740118e-05, + "loss": 1.2606, + "step": 11225 + }, + { + "epoch": 1.41, + "grad_norm": 12.271876335144043, + "learning_rate": 1.060787348868343e-05, + "loss": 0.6614, + "step": 11226 + }, + { + "epoch": 1.41, + "grad_norm": 31.398971557617188, + "learning_rate": 1.0607036773626741e-05, + "loss": 2.1384, + "step": 11227 + }, + { + "epoch": 1.41, + "grad_norm": 22.711267471313477, + "learning_rate": 1.0606200058570055e-05, + "loss": 1.2964, + "step": 11228 + }, + { + "epoch": 1.41, + "grad_norm": 6.177417278289795, + "learning_rate": 1.0605363343513369e-05, + "loss": 0.7655, + "step": 11229 + }, + { + "epoch": 1.41, + "grad_norm": 6.442416667938232, + "learning_rate": 1.0604526628456679e-05, + "loss": 0.4298, + "step": 11230 + }, + { + "epoch": 1.41, + "grad_norm": 8.38805866241455, + "learning_rate": 1.0603689913399993e-05, + "loss": 0.7203, + "step": 11231 + }, + { + "epoch": 1.41, + "grad_norm": 11.661182403564453, + "learning_rate": 1.0602853198343306e-05, + "loss": 1.6595, + "step": 11232 + }, + { + "epoch": 1.41, + "grad_norm": 16.503828048706055, + "learning_rate": 1.0602016483286617e-05, + "loss": 1.3584, + "step": 11233 + }, + { + "epoch": 1.41, + "grad_norm": 10.20715618133545, + "learning_rate": 1.060117976822993e-05, + "loss": 0.533, + "step": 11234 + }, + { + "epoch": 1.41, + "grad_norm": 8.280577659606934, + "learning_rate": 1.0600343053173244e-05, + "loss": 0.7534, + "step": 11235 + }, + { + "epoch": 1.41, + "grad_norm": 35.52324676513672, + "learning_rate": 1.0599506338116556e-05, + "loss": 3.1192, + "step": 11236 + }, + { + "epoch": 1.41, + "grad_norm": 48.30099868774414, + "learning_rate": 1.0598669623059868e-05, + "loss": 2.1817, + "step": 11237 + }, + { + "epoch": 1.41, + "grad_norm": 10.228086471557617, + "learning_rate": 1.0597832908003181e-05, + "loss": 0.8891, + "step": 11238 + }, + { + "epoch": 1.41, + "grad_norm": 17.013261795043945, + "learning_rate": 1.0596996192946493e-05, + "loss": 2.3246, + "step": 11239 + }, + { + "epoch": 1.41, + "grad_norm": 75.33102416992188, + "learning_rate": 1.0596159477889805e-05, + "loss": 2.7242, + "step": 11240 + }, + { + "epoch": 1.41, + "grad_norm": 20.065509796142578, + "learning_rate": 1.0595322762833117e-05, + "loss": 1.4334, + "step": 11241 + }, + { + "epoch": 1.41, + "grad_norm": 28.460887908935547, + "learning_rate": 1.0594486047776431e-05, + "loss": 0.3438, + "step": 11242 + }, + { + "epoch": 1.41, + "grad_norm": 15.307353019714355, + "learning_rate": 1.0593649332719743e-05, + "loss": 0.8457, + "step": 11243 + }, + { + "epoch": 1.41, + "grad_norm": 9.002567291259766, + "learning_rate": 1.0592812617663055e-05, + "loss": 0.7748, + "step": 11244 + }, + { + "epoch": 1.41, + "grad_norm": 5.05307674407959, + "learning_rate": 1.0591975902606368e-05, + "loss": 0.5225, + "step": 11245 + }, + { + "epoch": 1.41, + "grad_norm": 30.22115135192871, + "learning_rate": 1.0591139187549682e-05, + "loss": 2.7615, + "step": 11246 + }, + { + "epoch": 1.41, + "grad_norm": 18.640111923217773, + "learning_rate": 1.0590302472492992e-05, + "loss": 0.8867, + "step": 11247 + }, + { + "epoch": 1.41, + "grad_norm": 12.942728996276855, + "learning_rate": 1.0589465757436306e-05, + "loss": 0.8576, + "step": 11248 + }, + { + "epoch": 1.41, + "grad_norm": 7.473307132720947, + "learning_rate": 1.058862904237962e-05, + "loss": 0.7675, + "step": 11249 + }, + { + "epoch": 1.41, + "grad_norm": 6.772233486175537, + "learning_rate": 1.058779232732293e-05, + "loss": 0.3126, + "step": 11250 + }, + { + "epoch": 1.41, + "grad_norm": 28.95512580871582, + "learning_rate": 1.0586955612266244e-05, + "loss": 1.0426, + "step": 11251 + }, + { + "epoch": 1.41, + "grad_norm": 7.875672817230225, + "learning_rate": 1.0586118897209556e-05, + "loss": 0.6801, + "step": 11252 + }, + { + "epoch": 1.41, + "grad_norm": 7.985882759094238, + "learning_rate": 1.058528218215287e-05, + "loss": 1.1323, + "step": 11253 + }, + { + "epoch": 1.41, + "grad_norm": 9.809475898742676, + "learning_rate": 1.0584445467096181e-05, + "loss": 1.0171, + "step": 11254 + }, + { + "epoch": 1.41, + "grad_norm": 9.97216510772705, + "learning_rate": 1.0583608752039493e-05, + "loss": 0.4769, + "step": 11255 + }, + { + "epoch": 1.41, + "grad_norm": 20.22585105895996, + "learning_rate": 1.0582772036982807e-05, + "loss": 0.7561, + "step": 11256 + }, + { + "epoch": 1.41, + "grad_norm": 13.346476554870605, + "learning_rate": 1.0581935321926117e-05, + "loss": 1.3258, + "step": 11257 + }, + { + "epoch": 1.41, + "grad_norm": 17.68963050842285, + "learning_rate": 1.058109860686943e-05, + "loss": 2.4481, + "step": 11258 + }, + { + "epoch": 1.41, + "grad_norm": 15.243430137634277, + "learning_rate": 1.0580261891812744e-05, + "loss": 0.9186, + "step": 11259 + }, + { + "epoch": 1.41, + "grad_norm": 10.015549659729004, + "learning_rate": 1.0579425176756058e-05, + "loss": 0.648, + "step": 11260 + }, + { + "epoch": 1.41, + "grad_norm": 6.309828758239746, + "learning_rate": 1.0578588461699368e-05, + "loss": 0.7387, + "step": 11261 + }, + { + "epoch": 1.41, + "grad_norm": 7.163145542144775, + "learning_rate": 1.0577751746642682e-05, + "loss": 0.3292, + "step": 11262 + }, + { + "epoch": 1.41, + "grad_norm": 40.61732864379883, + "learning_rate": 1.0576915031585996e-05, + "loss": 2.8839, + "step": 11263 + }, + { + "epoch": 1.41, + "grad_norm": 33.57455062866211, + "learning_rate": 1.0576078316529306e-05, + "loss": 1.0886, + "step": 11264 + }, + { + "epoch": 1.41, + "grad_norm": 7.607451438903809, + "learning_rate": 1.057524160147262e-05, + "loss": 0.6742, + "step": 11265 + }, + { + "epoch": 1.41, + "grad_norm": 7.639183521270752, + "learning_rate": 1.0574404886415931e-05, + "loss": 2.3329, + "step": 11266 + }, + { + "epoch": 1.41, + "grad_norm": 6.122906684875488, + "learning_rate": 1.0573568171359245e-05, + "loss": 0.4457, + "step": 11267 + }, + { + "epoch": 1.41, + "grad_norm": 11.732542037963867, + "learning_rate": 1.0572731456302557e-05, + "loss": 1.0847, + "step": 11268 + }, + { + "epoch": 1.41, + "grad_norm": 18.822357177734375, + "learning_rate": 1.0571894741245869e-05, + "loss": 1.259, + "step": 11269 + }, + { + "epoch": 1.41, + "grad_norm": 14.801311492919922, + "learning_rate": 1.0571058026189183e-05, + "loss": 1.6006, + "step": 11270 + }, + { + "epoch": 1.41, + "grad_norm": 12.911343574523926, + "learning_rate": 1.0570221311132493e-05, + "loss": 1.0157, + "step": 11271 + }, + { + "epoch": 1.41, + "grad_norm": 8.495565414428711, + "learning_rate": 1.0569384596075806e-05, + "loss": 0.7486, + "step": 11272 + }, + { + "epoch": 1.41, + "grad_norm": 15.057435989379883, + "learning_rate": 1.056854788101912e-05, + "loss": 1.025, + "step": 11273 + }, + { + "epoch": 1.41, + "grad_norm": 14.769805908203125, + "learning_rate": 1.0567711165962434e-05, + "loss": 1.022, + "step": 11274 + }, + { + "epoch": 1.41, + "grad_norm": 11.487122535705566, + "learning_rate": 1.0566874450905744e-05, + "loss": 0.6012, + "step": 11275 + }, + { + "epoch": 1.42, + "grad_norm": 13.399752616882324, + "learning_rate": 1.0566037735849058e-05, + "loss": 1.006, + "step": 11276 + }, + { + "epoch": 1.42, + "grad_norm": 17.783620834350586, + "learning_rate": 1.0565201020792371e-05, + "loss": 0.493, + "step": 11277 + }, + { + "epoch": 1.42, + "grad_norm": 39.72786331176758, + "learning_rate": 1.0564364305735682e-05, + "loss": 1.5032, + "step": 11278 + }, + { + "epoch": 1.42, + "grad_norm": 10.015669822692871, + "learning_rate": 1.0563527590678995e-05, + "loss": 1.5298, + "step": 11279 + }, + { + "epoch": 1.42, + "grad_norm": 16.162492752075195, + "learning_rate": 1.0562690875622307e-05, + "loss": 1.7034, + "step": 11280 + }, + { + "epoch": 1.42, + "grad_norm": 42.08264923095703, + "learning_rate": 1.0561854160565621e-05, + "loss": 3.0007, + "step": 11281 + }, + { + "epoch": 1.42, + "grad_norm": 14.679417610168457, + "learning_rate": 1.0561017445508933e-05, + "loss": 1.5274, + "step": 11282 + }, + { + "epoch": 1.42, + "grad_norm": 22.4543514251709, + "learning_rate": 1.0560180730452245e-05, + "loss": 1.086, + "step": 11283 + }, + { + "epoch": 1.42, + "grad_norm": 19.550586700439453, + "learning_rate": 1.0559344015395558e-05, + "loss": 2.1566, + "step": 11284 + }, + { + "epoch": 1.42, + "grad_norm": 14.509361267089844, + "learning_rate": 1.0558507300338869e-05, + "loss": 1.9627, + "step": 11285 + }, + { + "epoch": 1.42, + "grad_norm": 37.67739486694336, + "learning_rate": 1.0557670585282182e-05, + "loss": 0.7337, + "step": 11286 + }, + { + "epoch": 1.42, + "grad_norm": 13.477119445800781, + "learning_rate": 1.0556833870225496e-05, + "loss": 1.9903, + "step": 11287 + }, + { + "epoch": 1.42, + "grad_norm": 9.830741882324219, + "learning_rate": 1.055599715516881e-05, + "loss": 0.3425, + "step": 11288 + }, + { + "epoch": 1.42, + "grad_norm": 4.542238235473633, + "learning_rate": 1.055516044011212e-05, + "loss": 0.7427, + "step": 11289 + }, + { + "epoch": 1.42, + "grad_norm": 4.7642998695373535, + "learning_rate": 1.0554323725055434e-05, + "loss": 0.3059, + "step": 11290 + }, + { + "epoch": 1.42, + "grad_norm": 26.863927841186523, + "learning_rate": 1.0553487009998745e-05, + "loss": 2.5606, + "step": 11291 + }, + { + "epoch": 1.42, + "grad_norm": 7.374013423919678, + "learning_rate": 1.0552650294942057e-05, + "loss": 0.6605, + "step": 11292 + }, + { + "epoch": 1.42, + "grad_norm": 11.80407428741455, + "learning_rate": 1.0551813579885371e-05, + "loss": 1.5701, + "step": 11293 + }, + { + "epoch": 1.42, + "grad_norm": 19.342987060546875, + "learning_rate": 1.0550976864828683e-05, + "loss": 1.057, + "step": 11294 + }, + { + "epoch": 1.42, + "grad_norm": 16.646146774291992, + "learning_rate": 1.0550140149771997e-05, + "loss": 1.1462, + "step": 11295 + }, + { + "epoch": 1.42, + "grad_norm": 6.888421058654785, + "learning_rate": 1.0549303434715309e-05, + "loss": 1.4076, + "step": 11296 + }, + { + "epoch": 1.42, + "grad_norm": 7.04971170425415, + "learning_rate": 1.054846671965862e-05, + "loss": 1.6847, + "step": 11297 + }, + { + "epoch": 1.42, + "grad_norm": 41.948150634765625, + "learning_rate": 1.0547630004601934e-05, + "loss": 1.7488, + "step": 11298 + }, + { + "epoch": 1.42, + "grad_norm": 6.25320291519165, + "learning_rate": 1.0546793289545245e-05, + "loss": 1.3213, + "step": 11299 + }, + { + "epoch": 1.42, + "grad_norm": 28.94004249572754, + "learning_rate": 1.0545956574488558e-05, + "loss": 1.1052, + "step": 11300 + }, + { + "epoch": 1.42, + "grad_norm": 41.42261505126953, + "learning_rate": 1.0545119859431872e-05, + "loss": 1.3953, + "step": 11301 + }, + { + "epoch": 1.42, + "grad_norm": 15.843208312988281, + "learning_rate": 1.0544283144375185e-05, + "loss": 0.9795, + "step": 11302 + }, + { + "epoch": 1.42, + "grad_norm": 11.01613998413086, + "learning_rate": 1.0543446429318496e-05, + "loss": 2.2015, + "step": 11303 + }, + { + "epoch": 1.42, + "grad_norm": 13.635912895202637, + "learning_rate": 1.054260971426181e-05, + "loss": 1.0478, + "step": 11304 + }, + { + "epoch": 1.42, + "grad_norm": 22.0612850189209, + "learning_rate": 1.0541772999205121e-05, + "loss": 1.3575, + "step": 11305 + }, + { + "epoch": 1.42, + "grad_norm": 9.714583396911621, + "learning_rate": 1.0540936284148433e-05, + "loss": 0.6664, + "step": 11306 + }, + { + "epoch": 1.42, + "grad_norm": 20.06359100341797, + "learning_rate": 1.0540099569091747e-05, + "loss": 0.3482, + "step": 11307 + }, + { + "epoch": 1.42, + "grad_norm": 15.595358848571777, + "learning_rate": 1.0539262854035059e-05, + "loss": 1.8518, + "step": 11308 + }, + { + "epoch": 1.42, + "grad_norm": 60.89354705810547, + "learning_rate": 1.0538426138978373e-05, + "loss": 1.7748, + "step": 11309 + }, + { + "epoch": 1.42, + "grad_norm": 11.46818733215332, + "learning_rate": 1.0537589423921683e-05, + "loss": 0.4397, + "step": 11310 + }, + { + "epoch": 1.42, + "grad_norm": 22.049875259399414, + "learning_rate": 1.0536752708864996e-05, + "loss": 1.8015, + "step": 11311 + }, + { + "epoch": 1.42, + "grad_norm": 11.30109977722168, + "learning_rate": 1.053591599380831e-05, + "loss": 0.3747, + "step": 11312 + }, + { + "epoch": 1.42, + "grad_norm": 12.697015762329102, + "learning_rate": 1.053507927875162e-05, + "loss": 0.8116, + "step": 11313 + }, + { + "epoch": 1.42, + "grad_norm": 14.568947792053223, + "learning_rate": 1.0534242563694934e-05, + "loss": 1.4176, + "step": 11314 + }, + { + "epoch": 1.42, + "grad_norm": 20.654478073120117, + "learning_rate": 1.0533405848638248e-05, + "loss": 1.3716, + "step": 11315 + }, + { + "epoch": 1.42, + "grad_norm": 119.02867889404297, + "learning_rate": 1.0532569133581561e-05, + "loss": 1.7289, + "step": 11316 + }, + { + "epoch": 1.42, + "grad_norm": 8.667135238647461, + "learning_rate": 1.0531732418524872e-05, + "loss": 1.968, + "step": 11317 + }, + { + "epoch": 1.42, + "grad_norm": 22.299379348754883, + "learning_rate": 1.0530895703468185e-05, + "loss": 1.5755, + "step": 11318 + }, + { + "epoch": 1.42, + "grad_norm": 14.984415054321289, + "learning_rate": 1.0530058988411497e-05, + "loss": 0.8569, + "step": 11319 + }, + { + "epoch": 1.42, + "grad_norm": 92.09161376953125, + "learning_rate": 1.0529222273354809e-05, + "loss": 3.5519, + "step": 11320 + }, + { + "epoch": 1.42, + "grad_norm": 32.287445068359375, + "learning_rate": 1.0528385558298123e-05, + "loss": 1.8257, + "step": 11321 + }, + { + "epoch": 1.42, + "grad_norm": 432.5163879394531, + "learning_rate": 1.0527548843241435e-05, + "loss": 1.5465, + "step": 11322 + }, + { + "epoch": 1.42, + "grad_norm": 15.813030242919922, + "learning_rate": 1.0526712128184748e-05, + "loss": 0.9823, + "step": 11323 + }, + { + "epoch": 1.42, + "grad_norm": 17.160110473632812, + "learning_rate": 1.0525875413128059e-05, + "loss": 2.3561, + "step": 11324 + }, + { + "epoch": 1.42, + "grad_norm": 5.169161319732666, + "learning_rate": 1.0525038698071372e-05, + "loss": 0.2158, + "step": 11325 + }, + { + "epoch": 1.42, + "grad_norm": 12.82089614868164, + "learning_rate": 1.0524201983014686e-05, + "loss": 1.0913, + "step": 11326 + }, + { + "epoch": 1.42, + "grad_norm": 55.144737243652344, + "learning_rate": 1.0523365267957996e-05, + "loss": 1.4059, + "step": 11327 + }, + { + "epoch": 1.42, + "grad_norm": 16.044841766357422, + "learning_rate": 1.052252855290131e-05, + "loss": 1.3942, + "step": 11328 + }, + { + "epoch": 1.42, + "grad_norm": 12.018836975097656, + "learning_rate": 1.0521691837844623e-05, + "loss": 1.4321, + "step": 11329 + }, + { + "epoch": 1.42, + "grad_norm": 15.334610939025879, + "learning_rate": 1.0520855122787937e-05, + "loss": 1.2405, + "step": 11330 + }, + { + "epoch": 1.42, + "grad_norm": 11.532013893127441, + "learning_rate": 1.0520018407731247e-05, + "loss": 0.6072, + "step": 11331 + }, + { + "epoch": 1.42, + "grad_norm": 15.10476303100586, + "learning_rate": 1.0519181692674561e-05, + "loss": 0.8413, + "step": 11332 + }, + { + "epoch": 1.42, + "grad_norm": 19.77190399169922, + "learning_rate": 1.0518344977617873e-05, + "loss": 0.9303, + "step": 11333 + }, + { + "epoch": 1.42, + "grad_norm": 11.441058158874512, + "learning_rate": 1.0517508262561185e-05, + "loss": 1.1298, + "step": 11334 + }, + { + "epoch": 1.42, + "grad_norm": 6.324008941650391, + "learning_rate": 1.0516671547504499e-05, + "loss": 1.5995, + "step": 11335 + }, + { + "epoch": 1.42, + "grad_norm": 5.152180194854736, + "learning_rate": 1.051583483244781e-05, + "loss": 0.3173, + "step": 11336 + }, + { + "epoch": 1.42, + "grad_norm": 7.624914646148682, + "learning_rate": 1.0514998117391124e-05, + "loss": 0.7505, + "step": 11337 + }, + { + "epoch": 1.42, + "grad_norm": 16.759044647216797, + "learning_rate": 1.0514161402334434e-05, + "loss": 1.0097, + "step": 11338 + }, + { + "epoch": 1.42, + "grad_norm": 18.782703399658203, + "learning_rate": 1.0513324687277748e-05, + "loss": 1.6375, + "step": 11339 + }, + { + "epoch": 1.42, + "grad_norm": 4.0037994384765625, + "learning_rate": 1.0512487972221062e-05, + "loss": 0.1343, + "step": 11340 + }, + { + "epoch": 1.42, + "grad_norm": 19.290443420410156, + "learning_rate": 1.0511651257164372e-05, + "loss": 1.1627, + "step": 11341 + }, + { + "epoch": 1.42, + "grad_norm": 9.35206413269043, + "learning_rate": 1.0510814542107686e-05, + "loss": 1.6862, + "step": 11342 + }, + { + "epoch": 1.42, + "grad_norm": 73.9336166381836, + "learning_rate": 1.0509977827051e-05, + "loss": 1.1326, + "step": 11343 + }, + { + "epoch": 1.42, + "grad_norm": 8.469733238220215, + "learning_rate": 1.0509141111994311e-05, + "loss": 0.8905, + "step": 11344 + }, + { + "epoch": 1.42, + "grad_norm": 41.667808532714844, + "learning_rate": 1.0508304396937623e-05, + "loss": 1.9519, + "step": 11345 + }, + { + "epoch": 1.42, + "grad_norm": 6.640600681304932, + "learning_rate": 1.0507467681880937e-05, + "loss": 0.2494, + "step": 11346 + }, + { + "epoch": 1.42, + "grad_norm": 13.451955795288086, + "learning_rate": 1.0506630966824249e-05, + "loss": 1.1055, + "step": 11347 + }, + { + "epoch": 1.42, + "grad_norm": 27.924213409423828, + "learning_rate": 1.050579425176756e-05, + "loss": 1.7074, + "step": 11348 + }, + { + "epoch": 1.42, + "grad_norm": 9.30042839050293, + "learning_rate": 1.0504957536710874e-05, + "loss": 0.8538, + "step": 11349 + }, + { + "epoch": 1.42, + "grad_norm": 7.299258708953857, + "learning_rate": 1.0504120821654186e-05, + "loss": 1.7927, + "step": 11350 + }, + { + "epoch": 1.42, + "grad_norm": 21.970739364624023, + "learning_rate": 1.05032841065975e-05, + "loss": 1.4202, + "step": 11351 + }, + { + "epoch": 1.42, + "grad_norm": 26.46619415283203, + "learning_rate": 1.050244739154081e-05, + "loss": 2.9561, + "step": 11352 + }, + { + "epoch": 1.42, + "grad_norm": 17.56070327758789, + "learning_rate": 1.0501610676484124e-05, + "loss": 1.0894, + "step": 11353 + }, + { + "epoch": 1.42, + "grad_norm": 16.233503341674805, + "learning_rate": 1.0500773961427438e-05, + "loss": 1.1073, + "step": 11354 + }, + { + "epoch": 1.43, + "grad_norm": 11.83210277557373, + "learning_rate": 1.0499937246370748e-05, + "loss": 1.1373, + "step": 11355 + }, + { + "epoch": 1.43, + "grad_norm": 28.475740432739258, + "learning_rate": 1.0499100531314062e-05, + "loss": 1.1082, + "step": 11356 + }, + { + "epoch": 1.43, + "grad_norm": 11.7533540725708, + "learning_rate": 1.0498263816257375e-05, + "loss": 1.259, + "step": 11357 + }, + { + "epoch": 1.43, + "grad_norm": 124.90774536132812, + "learning_rate": 1.0497427101200687e-05, + "loss": 0.9487, + "step": 11358 + }, + { + "epoch": 1.43, + "grad_norm": 10.858553886413574, + "learning_rate": 1.0496590386143999e-05, + "loss": 0.9905, + "step": 11359 + }, + { + "epoch": 1.43, + "grad_norm": 22.065006256103516, + "learning_rate": 1.0495753671087313e-05, + "loss": 1.4661, + "step": 11360 + }, + { + "epoch": 1.43, + "grad_norm": 12.489311218261719, + "learning_rate": 1.0494916956030625e-05, + "loss": 1.2853, + "step": 11361 + }, + { + "epoch": 1.43, + "grad_norm": 15.446892738342285, + "learning_rate": 1.0494080240973937e-05, + "loss": 0.8781, + "step": 11362 + }, + { + "epoch": 1.43, + "grad_norm": 8.910172462463379, + "learning_rate": 1.0493243525917249e-05, + "loss": 0.9655, + "step": 11363 + }, + { + "epoch": 1.43, + "grad_norm": 6.618316173553467, + "learning_rate": 1.0492406810860562e-05, + "loss": 0.7508, + "step": 11364 + }, + { + "epoch": 1.43, + "grad_norm": 10.195474624633789, + "learning_rate": 1.0491570095803876e-05, + "loss": 1.1058, + "step": 11365 + }, + { + "epoch": 1.43, + "grad_norm": 26.93873405456543, + "learning_rate": 1.0490733380747186e-05, + "loss": 1.4308, + "step": 11366 + }, + { + "epoch": 1.43, + "grad_norm": 10.582056045532227, + "learning_rate": 1.04898966656905e-05, + "loss": 1.334, + "step": 11367 + }, + { + "epoch": 1.43, + "grad_norm": 3.099984645843506, + "learning_rate": 1.0489059950633813e-05, + "loss": 0.1451, + "step": 11368 + }, + { + "epoch": 1.43, + "grad_norm": 10.376474380493164, + "learning_rate": 1.0488223235577124e-05, + "loss": 0.6118, + "step": 11369 + }, + { + "epoch": 1.43, + "grad_norm": 15.648659706115723, + "learning_rate": 1.0487386520520437e-05, + "loss": 1.2294, + "step": 11370 + }, + { + "epoch": 1.43, + "grad_norm": 44.50442123413086, + "learning_rate": 1.0486549805463751e-05, + "loss": 1.3816, + "step": 11371 + }, + { + "epoch": 1.43, + "grad_norm": 28.336875915527344, + "learning_rate": 1.0485713090407063e-05, + "loss": 0.7956, + "step": 11372 + }, + { + "epoch": 1.43, + "grad_norm": 6.759217262268066, + "learning_rate": 1.0484876375350375e-05, + "loss": 1.1031, + "step": 11373 + }, + { + "epoch": 1.43, + "grad_norm": 16.726430892944336, + "learning_rate": 1.0484039660293689e-05, + "loss": 1.4283, + "step": 11374 + }, + { + "epoch": 1.43, + "grad_norm": 12.317497253417969, + "learning_rate": 1.0483202945237e-05, + "loss": 1.1722, + "step": 11375 + }, + { + "epoch": 1.43, + "grad_norm": 12.657833099365234, + "learning_rate": 1.0482366230180312e-05, + "loss": 0.4978, + "step": 11376 + }, + { + "epoch": 1.43, + "grad_norm": 15.169766426086426, + "learning_rate": 1.0481529515123624e-05, + "loss": 1.3068, + "step": 11377 + }, + { + "epoch": 1.43, + "grad_norm": 4.539684295654297, + "learning_rate": 1.0480692800066938e-05, + "loss": 0.3645, + "step": 11378 + }, + { + "epoch": 1.43, + "grad_norm": 19.042705535888672, + "learning_rate": 1.0479856085010252e-05, + "loss": 1.455, + "step": 11379 + }, + { + "epoch": 1.43, + "grad_norm": 7.281583786010742, + "learning_rate": 1.0479019369953562e-05, + "loss": 0.2026, + "step": 11380 + }, + { + "epoch": 1.43, + "grad_norm": 17.259510040283203, + "learning_rate": 1.0478182654896876e-05, + "loss": 1.4423, + "step": 11381 + }, + { + "epoch": 1.43, + "grad_norm": 15.83186149597168, + "learning_rate": 1.047734593984019e-05, + "loss": 1.578, + "step": 11382 + }, + { + "epoch": 1.43, + "grad_norm": 7.90648889541626, + "learning_rate": 1.04765092247835e-05, + "loss": 1.1944, + "step": 11383 + }, + { + "epoch": 1.43, + "grad_norm": 21.07403564453125, + "learning_rate": 1.0475672509726813e-05, + "loss": 1.6348, + "step": 11384 + }, + { + "epoch": 1.43, + "grad_norm": 12.450182914733887, + "learning_rate": 1.0474835794670127e-05, + "loss": 2.2169, + "step": 11385 + }, + { + "epoch": 1.43, + "grad_norm": 6.377939224243164, + "learning_rate": 1.0473999079613439e-05, + "loss": 0.4044, + "step": 11386 + }, + { + "epoch": 1.43, + "grad_norm": 8.51230525970459, + "learning_rate": 1.047316236455675e-05, + "loss": 0.7925, + "step": 11387 + }, + { + "epoch": 1.43, + "grad_norm": 56.4589958190918, + "learning_rate": 1.0472325649500064e-05, + "loss": 0.9871, + "step": 11388 + }, + { + "epoch": 1.43, + "grad_norm": 26.939266204833984, + "learning_rate": 1.0471488934443376e-05, + "loss": 1.4693, + "step": 11389 + }, + { + "epoch": 1.43, + "grad_norm": 18.43749237060547, + "learning_rate": 1.0470652219386688e-05, + "loss": 1.4306, + "step": 11390 + }, + { + "epoch": 1.43, + "grad_norm": 28.045595169067383, + "learning_rate": 1.046981550433e-05, + "loss": 1.5361, + "step": 11391 + }, + { + "epoch": 1.43, + "grad_norm": 10.949115753173828, + "learning_rate": 1.0468978789273314e-05, + "loss": 1.523, + "step": 11392 + }, + { + "epoch": 1.43, + "grad_norm": 51.4660758972168, + "learning_rate": 1.0468142074216628e-05, + "loss": 2.6285, + "step": 11393 + }, + { + "epoch": 1.43, + "grad_norm": 9.311626434326172, + "learning_rate": 1.0467305359159938e-05, + "loss": 1.2782, + "step": 11394 + }, + { + "epoch": 1.43, + "grad_norm": 18.409934997558594, + "learning_rate": 1.0466468644103251e-05, + "loss": 1.2012, + "step": 11395 + }, + { + "epoch": 1.43, + "grad_norm": 14.347053527832031, + "learning_rate": 1.0465631929046565e-05, + "loss": 1.9738, + "step": 11396 + }, + { + "epoch": 1.43, + "grad_norm": 16.370105743408203, + "learning_rate": 1.0464795213989875e-05, + "loss": 0.9552, + "step": 11397 + }, + { + "epoch": 1.43, + "grad_norm": 7.3930182456970215, + "learning_rate": 1.0463958498933189e-05, + "loss": 0.4915, + "step": 11398 + }, + { + "epoch": 1.43, + "grad_norm": 15.962390899658203, + "learning_rate": 1.0463121783876503e-05, + "loss": 1.4147, + "step": 11399 + }, + { + "epoch": 1.43, + "grad_norm": 42.52497482299805, + "learning_rate": 1.0462285068819815e-05, + "loss": 2.0916, + "step": 11400 + }, + { + "epoch": 1.43, + "grad_norm": 34.833011627197266, + "learning_rate": 1.0461448353763127e-05, + "loss": 2.3191, + "step": 11401 + }, + { + "epoch": 1.43, + "grad_norm": 19.962411880493164, + "learning_rate": 1.0460611638706439e-05, + "loss": 1.586, + "step": 11402 + }, + { + "epoch": 1.43, + "grad_norm": 8.79418659210205, + "learning_rate": 1.0459774923649752e-05, + "loss": 0.8382, + "step": 11403 + }, + { + "epoch": 1.43, + "grad_norm": 32.29881286621094, + "learning_rate": 1.0458938208593064e-05, + "loss": 1.0914, + "step": 11404 + }, + { + "epoch": 1.43, + "grad_norm": 20.271190643310547, + "learning_rate": 1.0458101493536376e-05, + "loss": 1.0341, + "step": 11405 + }, + { + "epoch": 1.43, + "grad_norm": 43.88380432128906, + "learning_rate": 1.045726477847969e-05, + "loss": 0.8645, + "step": 11406 + }, + { + "epoch": 1.43, + "grad_norm": 15.907600402832031, + "learning_rate": 1.0456428063423003e-05, + "loss": 1.7109, + "step": 11407 + }, + { + "epoch": 1.43, + "grad_norm": 12.382284164428711, + "learning_rate": 1.0455591348366314e-05, + "loss": 1.8094, + "step": 11408 + }, + { + "epoch": 1.43, + "grad_norm": 10.19265079498291, + "learning_rate": 1.0454754633309627e-05, + "loss": 0.7449, + "step": 11409 + }, + { + "epoch": 1.43, + "grad_norm": 9.750264167785645, + "learning_rate": 1.0453917918252941e-05, + "loss": 0.4312, + "step": 11410 + }, + { + "epoch": 1.43, + "grad_norm": 2.6973702907562256, + "learning_rate": 1.0453081203196251e-05, + "loss": 0.1651, + "step": 11411 + }, + { + "epoch": 1.43, + "grad_norm": 4.4212212562561035, + "learning_rate": 1.0452244488139565e-05, + "loss": 0.5799, + "step": 11412 + }, + { + "epoch": 1.43, + "grad_norm": 9.936755180358887, + "learning_rate": 1.0451407773082879e-05, + "loss": 1.1073, + "step": 11413 + }, + { + "epoch": 1.43, + "grad_norm": 11.346553802490234, + "learning_rate": 1.045057105802619e-05, + "loss": 1.0439, + "step": 11414 + }, + { + "epoch": 1.43, + "grad_norm": 11.141072273254395, + "learning_rate": 1.0449734342969502e-05, + "loss": 0.6386, + "step": 11415 + }, + { + "epoch": 1.43, + "grad_norm": 16.572446823120117, + "learning_rate": 1.0448897627912814e-05, + "loss": 0.7752, + "step": 11416 + }, + { + "epoch": 1.43, + "grad_norm": 11.637643814086914, + "learning_rate": 1.0448060912856128e-05, + "loss": 0.4291, + "step": 11417 + }, + { + "epoch": 1.43, + "grad_norm": 15.866935729980469, + "learning_rate": 1.044722419779944e-05, + "loss": 0.701, + "step": 11418 + }, + { + "epoch": 1.43, + "grad_norm": 7.4652018547058105, + "learning_rate": 1.0446387482742752e-05, + "loss": 0.3932, + "step": 11419 + }, + { + "epoch": 1.43, + "grad_norm": 10.4305419921875, + "learning_rate": 1.0445550767686066e-05, + "loss": 1.7346, + "step": 11420 + }, + { + "epoch": 1.43, + "grad_norm": 30.32324981689453, + "learning_rate": 1.044471405262938e-05, + "loss": 1.9951, + "step": 11421 + }, + { + "epoch": 1.43, + "grad_norm": 8.34090518951416, + "learning_rate": 1.044387733757269e-05, + "loss": 0.8096, + "step": 11422 + }, + { + "epoch": 1.43, + "grad_norm": 5.801850318908691, + "learning_rate": 1.0443040622516003e-05, + "loss": 0.6392, + "step": 11423 + }, + { + "epoch": 1.43, + "grad_norm": 15.56621265411377, + "learning_rate": 1.0442203907459317e-05, + "loss": 0.9069, + "step": 11424 + }, + { + "epoch": 1.43, + "grad_norm": 8.361141204833984, + "learning_rate": 1.0441367192402627e-05, + "loss": 0.6857, + "step": 11425 + }, + { + "epoch": 1.43, + "grad_norm": 11.270186424255371, + "learning_rate": 1.044053047734594e-05, + "loss": 1.0766, + "step": 11426 + }, + { + "epoch": 1.43, + "grad_norm": 23.772802352905273, + "learning_rate": 1.0439693762289254e-05, + "loss": 1.2279, + "step": 11427 + }, + { + "epoch": 1.43, + "grad_norm": 5.569838047027588, + "learning_rate": 1.0438857047232566e-05, + "loss": 0.1386, + "step": 11428 + }, + { + "epoch": 1.43, + "grad_norm": 113.14727783203125, + "learning_rate": 1.0438020332175878e-05, + "loss": 1.6265, + "step": 11429 + }, + { + "epoch": 1.43, + "grad_norm": 8.468873977661133, + "learning_rate": 1.043718361711919e-05, + "loss": 0.8927, + "step": 11430 + }, + { + "epoch": 1.43, + "grad_norm": 21.616683959960938, + "learning_rate": 1.0436346902062504e-05, + "loss": 0.6899, + "step": 11431 + }, + { + "epoch": 1.43, + "grad_norm": 14.02167797088623, + "learning_rate": 1.0435510187005816e-05, + "loss": 1.9948, + "step": 11432 + }, + { + "epoch": 1.43, + "grad_norm": 8.072164535522461, + "learning_rate": 1.0434673471949128e-05, + "loss": 2.6065, + "step": 11433 + }, + { + "epoch": 1.43, + "grad_norm": 31.57025718688965, + "learning_rate": 1.0433836756892441e-05, + "loss": 3.0612, + "step": 11434 + }, + { + "epoch": 1.44, + "grad_norm": 5.659616470336914, + "learning_rate": 1.0433000041835755e-05, + "loss": 0.4403, + "step": 11435 + }, + { + "epoch": 1.44, + "grad_norm": 11.711050987243652, + "learning_rate": 1.0432163326779065e-05, + "loss": 1.7349, + "step": 11436 + }, + { + "epoch": 1.44, + "grad_norm": 10.188949584960938, + "learning_rate": 1.0431326611722379e-05, + "loss": 1.0138, + "step": 11437 + }, + { + "epoch": 1.44, + "grad_norm": 13.183706283569336, + "learning_rate": 1.0430489896665693e-05, + "loss": 0.8548, + "step": 11438 + }, + { + "epoch": 1.44, + "grad_norm": 7.962393283843994, + "learning_rate": 1.0429653181609003e-05, + "loss": 0.26, + "step": 11439 + }, + { + "epoch": 1.44, + "grad_norm": 9.664677619934082, + "learning_rate": 1.0428816466552317e-05, + "loss": 0.7845, + "step": 11440 + }, + { + "epoch": 1.44, + "grad_norm": 14.836224555969238, + "learning_rate": 1.042797975149563e-05, + "loss": 1.2228, + "step": 11441 + }, + { + "epoch": 1.44, + "grad_norm": 8.407991409301758, + "learning_rate": 1.0427143036438942e-05, + "loss": 0.6964, + "step": 11442 + }, + { + "epoch": 1.44, + "grad_norm": 32.23660659790039, + "learning_rate": 1.0426306321382254e-05, + "loss": 2.0007, + "step": 11443 + }, + { + "epoch": 1.44, + "grad_norm": 10.350091934204102, + "learning_rate": 1.0425469606325566e-05, + "loss": 1.1657, + "step": 11444 + }, + { + "epoch": 1.44, + "grad_norm": 6.385477542877197, + "learning_rate": 1.042463289126888e-05, + "loss": 1.4644, + "step": 11445 + }, + { + "epoch": 1.44, + "grad_norm": 12.992371559143066, + "learning_rate": 1.0423796176212192e-05, + "loss": 0.805, + "step": 11446 + }, + { + "epoch": 1.44, + "grad_norm": 16.413516998291016, + "learning_rate": 1.0422959461155504e-05, + "loss": 1.0079, + "step": 11447 + }, + { + "epoch": 1.44, + "grad_norm": 12.926766395568848, + "learning_rate": 1.0422122746098817e-05, + "loss": 1.6126, + "step": 11448 + }, + { + "epoch": 1.44, + "grad_norm": 8.546387672424316, + "learning_rate": 1.0421286031042131e-05, + "loss": 0.6899, + "step": 11449 + }, + { + "epoch": 1.44, + "grad_norm": 66.18031311035156, + "learning_rate": 1.0420449315985441e-05, + "loss": 0.4709, + "step": 11450 + }, + { + "epoch": 1.44, + "grad_norm": 24.744321823120117, + "learning_rate": 1.0419612600928755e-05, + "loss": 2.5164, + "step": 11451 + }, + { + "epoch": 1.44, + "grad_norm": 25.81559181213379, + "learning_rate": 1.0418775885872068e-05, + "loss": 1.6183, + "step": 11452 + }, + { + "epoch": 1.44, + "grad_norm": 16.952499389648438, + "learning_rate": 1.0417939170815379e-05, + "loss": 2.4686, + "step": 11453 + }, + { + "epoch": 1.44, + "grad_norm": 4.533487796783447, + "learning_rate": 1.0417102455758692e-05, + "loss": 0.341, + "step": 11454 + }, + { + "epoch": 1.44, + "grad_norm": 40.2689208984375, + "learning_rate": 1.0416265740702004e-05, + "loss": 1.9866, + "step": 11455 + }, + { + "epoch": 1.44, + "grad_norm": 12.823920249938965, + "learning_rate": 1.0415429025645318e-05, + "loss": 1.1613, + "step": 11456 + }, + { + "epoch": 1.44, + "grad_norm": 12.001120567321777, + "learning_rate": 1.041459231058863e-05, + "loss": 0.9691, + "step": 11457 + }, + { + "epoch": 1.44, + "grad_norm": 8.59779167175293, + "learning_rate": 1.0413755595531942e-05, + "loss": 2.6485, + "step": 11458 + }, + { + "epoch": 1.44, + "grad_norm": 51.74159240722656, + "learning_rate": 1.0412918880475256e-05, + "loss": 0.967, + "step": 11459 + }, + { + "epoch": 1.44, + "grad_norm": 18.291950225830078, + "learning_rate": 1.0412082165418567e-05, + "loss": 1.0599, + "step": 11460 + }, + { + "epoch": 1.44, + "grad_norm": 6.537196636199951, + "learning_rate": 1.041124545036188e-05, + "loss": 0.7339, + "step": 11461 + }, + { + "epoch": 1.44, + "grad_norm": 62.63052749633789, + "learning_rate": 1.0410408735305193e-05, + "loss": 2.2704, + "step": 11462 + }, + { + "epoch": 1.44, + "grad_norm": 9.313514709472656, + "learning_rate": 1.0409572020248507e-05, + "loss": 1.0498, + "step": 11463 + }, + { + "epoch": 1.44, + "grad_norm": 21.436363220214844, + "learning_rate": 1.0408735305191817e-05, + "loss": 2.0415, + "step": 11464 + }, + { + "epoch": 1.44, + "grad_norm": 24.772859573364258, + "learning_rate": 1.040789859013513e-05, + "loss": 1.6862, + "step": 11465 + }, + { + "epoch": 1.44, + "grad_norm": 14.833982467651367, + "learning_rate": 1.0407061875078444e-05, + "loss": 3.0364, + "step": 11466 + }, + { + "epoch": 1.44, + "grad_norm": 28.4583797454834, + "learning_rate": 1.0406225160021755e-05, + "loss": 1.5662, + "step": 11467 + }, + { + "epoch": 1.44, + "grad_norm": 20.74558448791504, + "learning_rate": 1.0405388444965068e-05, + "loss": 2.1873, + "step": 11468 + }, + { + "epoch": 1.44, + "grad_norm": 11.81302547454834, + "learning_rate": 1.040455172990838e-05, + "loss": 1.2035, + "step": 11469 + }, + { + "epoch": 1.44, + "grad_norm": 21.872716903686523, + "learning_rate": 1.0403715014851694e-05, + "loss": 3.0724, + "step": 11470 + }, + { + "epoch": 1.44, + "grad_norm": 31.094301223754883, + "learning_rate": 1.0402878299795006e-05, + "loss": 1.7347, + "step": 11471 + }, + { + "epoch": 1.44, + "grad_norm": 9.362553596496582, + "learning_rate": 1.0402041584738318e-05, + "loss": 0.4224, + "step": 11472 + }, + { + "epoch": 1.44, + "grad_norm": 16.24514389038086, + "learning_rate": 1.0401204869681631e-05, + "loss": 2.101, + "step": 11473 + }, + { + "epoch": 1.44, + "grad_norm": 12.963959693908691, + "learning_rate": 1.0400368154624942e-05, + "loss": 0.8612, + "step": 11474 + }, + { + "epoch": 1.44, + "grad_norm": 14.617683410644531, + "learning_rate": 1.0399531439568255e-05, + "loss": 1.9835, + "step": 11475 + }, + { + "epoch": 1.44, + "grad_norm": 13.395903587341309, + "learning_rate": 1.0398694724511569e-05, + "loss": 1.2385, + "step": 11476 + }, + { + "epoch": 1.44, + "grad_norm": 8.507243156433105, + "learning_rate": 1.0397858009454883e-05, + "loss": 2.0466, + "step": 11477 + }, + { + "epoch": 1.44, + "grad_norm": 12.856938362121582, + "learning_rate": 1.0397021294398193e-05, + "loss": 0.7299, + "step": 11478 + }, + { + "epoch": 1.44, + "grad_norm": 3.7521162033081055, + "learning_rate": 1.0396184579341506e-05, + "loss": 0.196, + "step": 11479 + }, + { + "epoch": 1.44, + "grad_norm": 14.700419425964355, + "learning_rate": 1.039534786428482e-05, + "loss": 0.9978, + "step": 11480 + }, + { + "epoch": 1.44, + "grad_norm": 7.636246681213379, + "learning_rate": 1.039451114922813e-05, + "loss": 1.7426, + "step": 11481 + }, + { + "epoch": 1.44, + "grad_norm": 10.989336013793945, + "learning_rate": 1.0393674434171444e-05, + "loss": 0.488, + "step": 11482 + }, + { + "epoch": 1.44, + "grad_norm": 27.758371353149414, + "learning_rate": 1.0392837719114756e-05, + "loss": 0.6158, + "step": 11483 + }, + { + "epoch": 1.44, + "grad_norm": 78.2485122680664, + "learning_rate": 1.039200100405807e-05, + "loss": 2.7756, + "step": 11484 + }, + { + "epoch": 1.44, + "grad_norm": 21.227731704711914, + "learning_rate": 1.0391164289001382e-05, + "loss": 1.2272, + "step": 11485 + }, + { + "epoch": 1.44, + "grad_norm": 7.953176021575928, + "learning_rate": 1.0390327573944694e-05, + "loss": 0.5091, + "step": 11486 + }, + { + "epoch": 1.44, + "grad_norm": 12.84647274017334, + "learning_rate": 1.0389490858888007e-05, + "loss": 1.1015, + "step": 11487 + }, + { + "epoch": 1.44, + "grad_norm": 10.707669258117676, + "learning_rate": 1.0388654143831317e-05, + "loss": 0.7951, + "step": 11488 + }, + { + "epoch": 1.44, + "grad_norm": 13.149401664733887, + "learning_rate": 1.0387817428774631e-05, + "loss": 1.0076, + "step": 11489 + }, + { + "epoch": 1.44, + "grad_norm": 7.7071709632873535, + "learning_rate": 1.0386980713717945e-05, + "loss": 0.8714, + "step": 11490 + }, + { + "epoch": 1.44, + "grad_norm": 10.428049087524414, + "learning_rate": 1.0386143998661258e-05, + "loss": 0.8757, + "step": 11491 + }, + { + "epoch": 1.44, + "grad_norm": 17.69566535949707, + "learning_rate": 1.0385307283604569e-05, + "loss": 0.6535, + "step": 11492 + }, + { + "epoch": 1.44, + "grad_norm": 33.63081359863281, + "learning_rate": 1.0384470568547882e-05, + "loss": 1.939, + "step": 11493 + }, + { + "epoch": 1.44, + "grad_norm": 6.924600124359131, + "learning_rate": 1.0383633853491196e-05, + "loss": 0.4562, + "step": 11494 + }, + { + "epoch": 1.44, + "grad_norm": 26.665849685668945, + "learning_rate": 1.0382797138434506e-05, + "loss": 2.8667, + "step": 11495 + }, + { + "epoch": 1.44, + "grad_norm": 20.90088653564453, + "learning_rate": 1.038196042337782e-05, + "loss": 2.0433, + "step": 11496 + }, + { + "epoch": 1.44, + "grad_norm": 3.360067367553711, + "learning_rate": 1.0381123708321132e-05, + "loss": 0.1102, + "step": 11497 + }, + { + "epoch": 1.44, + "grad_norm": 63.318931579589844, + "learning_rate": 1.0380286993264445e-05, + "loss": 1.4949, + "step": 11498 + }, + { + "epoch": 1.44, + "grad_norm": 9.656840324401855, + "learning_rate": 1.0379450278207757e-05, + "loss": 0.7261, + "step": 11499 + }, + { + "epoch": 1.44, + "grad_norm": 17.443967819213867, + "learning_rate": 1.037861356315107e-05, + "loss": 1.3823, + "step": 11500 + }, + { + "epoch": 1.44, + "grad_norm": 17.237642288208008, + "learning_rate": 1.0377776848094383e-05, + "loss": 0.8791, + "step": 11501 + }, + { + "epoch": 1.44, + "grad_norm": 7.85336971282959, + "learning_rate": 1.0376940133037693e-05, + "loss": 0.6561, + "step": 11502 + }, + { + "epoch": 1.44, + "grad_norm": 5.917194366455078, + "learning_rate": 1.0376103417981007e-05, + "loss": 0.2258, + "step": 11503 + }, + { + "epoch": 1.44, + "grad_norm": 25.715185165405273, + "learning_rate": 1.037526670292432e-05, + "loss": 2.5617, + "step": 11504 + }, + { + "epoch": 1.44, + "grad_norm": 4.817445278167725, + "learning_rate": 1.0374429987867634e-05, + "loss": 0.4108, + "step": 11505 + }, + { + "epoch": 1.44, + "grad_norm": 103.81026458740234, + "learning_rate": 1.0373593272810945e-05, + "loss": 1.3861, + "step": 11506 + }, + { + "epoch": 1.44, + "grad_norm": 9.29235553741455, + "learning_rate": 1.0372756557754258e-05, + "loss": 0.8288, + "step": 11507 + }, + { + "epoch": 1.44, + "grad_norm": 30.044078826904297, + "learning_rate": 1.037191984269757e-05, + "loss": 0.7739, + "step": 11508 + }, + { + "epoch": 1.44, + "grad_norm": 5.862963676452637, + "learning_rate": 1.0371083127640882e-05, + "loss": 0.4256, + "step": 11509 + }, + { + "epoch": 1.44, + "grad_norm": 20.0323543548584, + "learning_rate": 1.0370246412584196e-05, + "loss": 1.6705, + "step": 11510 + }, + { + "epoch": 1.44, + "grad_norm": 27.138111114501953, + "learning_rate": 1.0369409697527508e-05, + "loss": 0.9324, + "step": 11511 + }, + { + "epoch": 1.44, + "grad_norm": 24.705583572387695, + "learning_rate": 1.0368572982470821e-05, + "loss": 2.9491, + "step": 11512 + }, + { + "epoch": 1.44, + "grad_norm": 7.6751837730407715, + "learning_rate": 1.0367736267414132e-05, + "loss": 0.2387, + "step": 11513 + }, + { + "epoch": 1.44, + "grad_norm": 12.83401107788086, + "learning_rate": 1.0366899552357445e-05, + "loss": 0.5735, + "step": 11514 + }, + { + "epoch": 1.45, + "grad_norm": 8.1944580078125, + "learning_rate": 1.0366062837300759e-05, + "loss": 2.7075, + "step": 11515 + }, + { + "epoch": 1.45, + "grad_norm": 31.396709442138672, + "learning_rate": 1.0365226122244069e-05, + "loss": 0.9407, + "step": 11516 + }, + { + "epoch": 1.45, + "grad_norm": 17.157712936401367, + "learning_rate": 1.0364389407187383e-05, + "loss": 1.2364, + "step": 11517 + }, + { + "epoch": 1.45, + "grad_norm": 11.019781112670898, + "learning_rate": 1.0363552692130696e-05, + "loss": 1.5119, + "step": 11518 + }, + { + "epoch": 1.45, + "grad_norm": 14.416360855102539, + "learning_rate": 1.036271597707401e-05, + "loss": 0.9504, + "step": 11519 + }, + { + "epoch": 1.45, + "grad_norm": 6.151710033416748, + "learning_rate": 1.036187926201732e-05, + "loss": 0.5117, + "step": 11520 + }, + { + "epoch": 1.45, + "grad_norm": 13.85291576385498, + "learning_rate": 1.0361042546960634e-05, + "loss": 1.6905, + "step": 11521 + }, + { + "epoch": 1.45, + "grad_norm": 4.2402143478393555, + "learning_rate": 1.0360205831903946e-05, + "loss": 0.3068, + "step": 11522 + }, + { + "epoch": 1.45, + "grad_norm": 10.13919734954834, + "learning_rate": 1.0359369116847258e-05, + "loss": 0.7341, + "step": 11523 + }, + { + "epoch": 1.45, + "grad_norm": 8.106743812561035, + "learning_rate": 1.0358532401790572e-05, + "loss": 1.064, + "step": 11524 + }, + { + "epoch": 1.45, + "grad_norm": 9.152956008911133, + "learning_rate": 1.0357695686733884e-05, + "loss": 1.9516, + "step": 11525 + }, + { + "epoch": 1.45, + "grad_norm": 17.164810180664062, + "learning_rate": 1.0356858971677197e-05, + "loss": 1.3683, + "step": 11526 + }, + { + "epoch": 1.45, + "grad_norm": 10.938882827758789, + "learning_rate": 1.0356022256620507e-05, + "loss": 0.4788, + "step": 11527 + }, + { + "epoch": 1.45, + "grad_norm": 26.89095687866211, + "learning_rate": 1.0355185541563821e-05, + "loss": 3.3631, + "step": 11528 + }, + { + "epoch": 1.45, + "grad_norm": 7.967386722564697, + "learning_rate": 1.0354348826507135e-05, + "loss": 1.0033, + "step": 11529 + }, + { + "epoch": 1.45, + "grad_norm": 13.68557357788086, + "learning_rate": 1.0353512111450445e-05, + "loss": 1.1689, + "step": 11530 + }, + { + "epoch": 1.45, + "grad_norm": 15.132197380065918, + "learning_rate": 1.0352675396393759e-05, + "loss": 1.3014, + "step": 11531 + }, + { + "epoch": 1.45, + "grad_norm": 45.730281829833984, + "learning_rate": 1.0351838681337072e-05, + "loss": 2.1266, + "step": 11532 + }, + { + "epoch": 1.45, + "grad_norm": 82.78030395507812, + "learning_rate": 1.0351001966280386e-05, + "loss": 0.9793, + "step": 11533 + }, + { + "epoch": 1.45, + "grad_norm": 7.21171236038208, + "learning_rate": 1.0350165251223696e-05, + "loss": 1.0636, + "step": 11534 + }, + { + "epoch": 1.45, + "grad_norm": 908.1190185546875, + "learning_rate": 1.034932853616701e-05, + "loss": 2.1047, + "step": 11535 + }, + { + "epoch": 1.45, + "grad_norm": 20.707496643066406, + "learning_rate": 1.0348491821110322e-05, + "loss": 2.0304, + "step": 11536 + }, + { + "epoch": 1.45, + "grad_norm": 181.55435180664062, + "learning_rate": 1.0347655106053634e-05, + "loss": 1.5087, + "step": 11537 + }, + { + "epoch": 1.45, + "grad_norm": 20.19994354248047, + "learning_rate": 1.0346818390996947e-05, + "loss": 2.1974, + "step": 11538 + }, + { + "epoch": 1.45, + "grad_norm": 12.254837989807129, + "learning_rate": 1.034598167594026e-05, + "loss": 1.0284, + "step": 11539 + }, + { + "epoch": 1.45, + "grad_norm": 10.827713966369629, + "learning_rate": 1.0345144960883573e-05, + "loss": 0.7259, + "step": 11540 + }, + { + "epoch": 1.45, + "grad_norm": 18.657577514648438, + "learning_rate": 1.0344308245826883e-05, + "loss": 1.1016, + "step": 11541 + }, + { + "epoch": 1.45, + "grad_norm": 26.329896926879883, + "learning_rate": 1.0343471530770197e-05, + "loss": 1.3192, + "step": 11542 + }, + { + "epoch": 1.45, + "grad_norm": 13.237192153930664, + "learning_rate": 1.034263481571351e-05, + "loss": 0.7066, + "step": 11543 + }, + { + "epoch": 1.45, + "grad_norm": 12.47546672821045, + "learning_rate": 1.034179810065682e-05, + "loss": 0.7557, + "step": 11544 + }, + { + "epoch": 1.45, + "grad_norm": 159.52273559570312, + "learning_rate": 1.0340961385600134e-05, + "loss": 1.6982, + "step": 11545 + }, + { + "epoch": 1.45, + "grad_norm": 6.027006149291992, + "learning_rate": 1.0340124670543448e-05, + "loss": 0.5325, + "step": 11546 + }, + { + "epoch": 1.45, + "grad_norm": 13.722332954406738, + "learning_rate": 1.033928795548676e-05, + "loss": 2.528, + "step": 11547 + }, + { + "epoch": 1.45, + "grad_norm": 18.110464096069336, + "learning_rate": 1.0338451240430072e-05, + "loss": 0.572, + "step": 11548 + }, + { + "epoch": 1.45, + "grad_norm": 11.07425308227539, + "learning_rate": 1.0337614525373386e-05, + "loss": 0.8301, + "step": 11549 + }, + { + "epoch": 1.45, + "grad_norm": 33.45741653442383, + "learning_rate": 1.0336777810316698e-05, + "loss": 1.9811, + "step": 11550 + }, + { + "epoch": 1.45, + "grad_norm": 13.170585632324219, + "learning_rate": 1.033594109526001e-05, + "loss": 0.5509, + "step": 11551 + }, + { + "epoch": 1.45, + "grad_norm": 6.260337829589844, + "learning_rate": 1.0335104380203323e-05, + "loss": 0.1017, + "step": 11552 + }, + { + "epoch": 1.45, + "grad_norm": 8.019852638244629, + "learning_rate": 1.0334267665146635e-05, + "loss": 0.6926, + "step": 11553 + }, + { + "epoch": 1.45, + "grad_norm": 8.174002647399902, + "learning_rate": 1.0333430950089949e-05, + "loss": 0.3792, + "step": 11554 + }, + { + "epoch": 1.45, + "grad_norm": 35.70268249511719, + "learning_rate": 1.0332594235033259e-05, + "loss": 2.7399, + "step": 11555 + }, + { + "epoch": 1.45, + "grad_norm": 7.015445709228516, + "learning_rate": 1.0331757519976573e-05, + "loss": 0.2739, + "step": 11556 + }, + { + "epoch": 1.45, + "grad_norm": 9.410852432250977, + "learning_rate": 1.0330920804919886e-05, + "loss": 1.3884, + "step": 11557 + }, + { + "epoch": 1.45, + "grad_norm": 9.94278621673584, + "learning_rate": 1.0330084089863197e-05, + "loss": 1.0147, + "step": 11558 + }, + { + "epoch": 1.45, + "grad_norm": 20.14786148071289, + "learning_rate": 1.032924737480651e-05, + "loss": 1.0886, + "step": 11559 + }, + { + "epoch": 1.45, + "grad_norm": 10.37377643585205, + "learning_rate": 1.0328410659749824e-05, + "loss": 0.7988, + "step": 11560 + }, + { + "epoch": 1.45, + "grad_norm": 7.83779239654541, + "learning_rate": 1.0327573944693136e-05, + "loss": 1.6752, + "step": 11561 + }, + { + "epoch": 1.45, + "grad_norm": 15.421342849731445, + "learning_rate": 1.0326737229636448e-05, + "loss": 1.7482, + "step": 11562 + }, + { + "epoch": 1.45, + "grad_norm": 12.408710479736328, + "learning_rate": 1.0325900514579762e-05, + "loss": 0.3391, + "step": 11563 + }, + { + "epoch": 1.45, + "grad_norm": 20.381467819213867, + "learning_rate": 1.0325063799523073e-05, + "loss": 2.3143, + "step": 11564 + }, + { + "epoch": 1.45, + "grad_norm": 13.723130226135254, + "learning_rate": 1.0324227084466385e-05, + "loss": 1.6073, + "step": 11565 + }, + { + "epoch": 1.45, + "grad_norm": 4.7990264892578125, + "learning_rate": 1.0323390369409697e-05, + "loss": 0.5784, + "step": 11566 + }, + { + "epoch": 1.45, + "grad_norm": 23.338542938232422, + "learning_rate": 1.0322553654353011e-05, + "loss": 0.5699, + "step": 11567 + }, + { + "epoch": 1.45, + "grad_norm": 25.580055236816406, + "learning_rate": 1.0321716939296325e-05, + "loss": 0.2767, + "step": 11568 + }, + { + "epoch": 1.45, + "grad_norm": 18.603832244873047, + "learning_rate": 1.0320880224239635e-05, + "loss": 1.0041, + "step": 11569 + }, + { + "epoch": 1.45, + "grad_norm": 15.5408353805542, + "learning_rate": 1.0320043509182949e-05, + "loss": 1.151, + "step": 11570 + }, + { + "epoch": 1.45, + "grad_norm": 20.128328323364258, + "learning_rate": 1.0319206794126262e-05, + "loss": 1.1813, + "step": 11571 + }, + { + "epoch": 1.45, + "grad_norm": 79.19524383544922, + "learning_rate": 1.0318370079069572e-05, + "loss": 3.1646, + "step": 11572 + }, + { + "epoch": 1.45, + "grad_norm": 18.595197677612305, + "learning_rate": 1.0317533364012886e-05, + "loss": 2.187, + "step": 11573 + }, + { + "epoch": 1.45, + "grad_norm": 6.652132034301758, + "learning_rate": 1.03166966489562e-05, + "loss": 0.3347, + "step": 11574 + }, + { + "epoch": 1.45, + "grad_norm": 33.59093475341797, + "learning_rate": 1.0315859933899512e-05, + "loss": 3.074, + "step": 11575 + }, + { + "epoch": 1.45, + "grad_norm": 9.482759475708008, + "learning_rate": 1.0315023218842824e-05, + "loss": 0.7033, + "step": 11576 + }, + { + "epoch": 1.45, + "grad_norm": 89.27581024169922, + "learning_rate": 1.0314186503786137e-05, + "loss": 4.2024, + "step": 11577 + }, + { + "epoch": 1.45, + "grad_norm": 17.355918884277344, + "learning_rate": 1.031334978872945e-05, + "loss": 0.8522, + "step": 11578 + }, + { + "epoch": 1.45, + "grad_norm": 9.63662052154541, + "learning_rate": 1.0312513073672761e-05, + "loss": 0.4806, + "step": 11579 + }, + { + "epoch": 1.45, + "grad_norm": 4.776221752166748, + "learning_rate": 1.0311676358616073e-05, + "loss": 0.1719, + "step": 11580 + }, + { + "epoch": 1.45, + "grad_norm": 5.802083969116211, + "learning_rate": 1.0310839643559387e-05, + "loss": 1.5277, + "step": 11581 + }, + { + "epoch": 1.45, + "grad_norm": 4.599855899810791, + "learning_rate": 1.03100029285027e-05, + "loss": 0.6048, + "step": 11582 + }, + { + "epoch": 1.45, + "grad_norm": 9.42330551147461, + "learning_rate": 1.030916621344601e-05, + "loss": 0.913, + "step": 11583 + }, + { + "epoch": 1.45, + "grad_norm": 19.721864700317383, + "learning_rate": 1.0308329498389324e-05, + "loss": 1.861, + "step": 11584 + }, + { + "epoch": 1.45, + "grad_norm": 15.7676362991333, + "learning_rate": 1.0307492783332638e-05, + "loss": 1.5734, + "step": 11585 + }, + { + "epoch": 1.45, + "grad_norm": 16.899751663208008, + "learning_rate": 1.0306656068275948e-05, + "loss": 0.8004, + "step": 11586 + }, + { + "epoch": 1.45, + "grad_norm": 16.90408706665039, + "learning_rate": 1.0305819353219262e-05, + "loss": 1.9167, + "step": 11587 + }, + { + "epoch": 1.45, + "grad_norm": 5.868137359619141, + "learning_rate": 1.0304982638162576e-05, + "loss": 0.3671, + "step": 11588 + }, + { + "epoch": 1.45, + "grad_norm": 7.04013204574585, + "learning_rate": 1.0304145923105888e-05, + "loss": 1.5718, + "step": 11589 + }, + { + "epoch": 1.45, + "grad_norm": 9.058857917785645, + "learning_rate": 1.03033092080492e-05, + "loss": 0.279, + "step": 11590 + }, + { + "epoch": 1.45, + "grad_norm": 10.890242576599121, + "learning_rate": 1.0302472492992513e-05, + "loss": 1.5068, + "step": 11591 + }, + { + "epoch": 1.45, + "grad_norm": 11.963504791259766, + "learning_rate": 1.0301635777935825e-05, + "loss": 0.6266, + "step": 11592 + }, + { + "epoch": 1.45, + "grad_norm": 70.46163177490234, + "learning_rate": 1.0300799062879137e-05, + "loss": 0.6158, + "step": 11593 + }, + { + "epoch": 1.46, + "grad_norm": 14.41070556640625, + "learning_rate": 1.0299962347822449e-05, + "loss": 1.2228, + "step": 11594 + }, + { + "epoch": 1.46, + "grad_norm": 16.657543182373047, + "learning_rate": 1.0299125632765763e-05, + "loss": 2.6929, + "step": 11595 + }, + { + "epoch": 1.46, + "grad_norm": 6.03270149230957, + "learning_rate": 1.0298288917709075e-05, + "loss": 0.5633, + "step": 11596 + }, + { + "epoch": 1.46, + "grad_norm": 10.365429878234863, + "learning_rate": 1.0297452202652387e-05, + "loss": 0.3843, + "step": 11597 + }, + { + "epoch": 1.46, + "grad_norm": 4.687175273895264, + "learning_rate": 1.02966154875957e-05, + "loss": 0.2333, + "step": 11598 + }, + { + "epoch": 1.46, + "grad_norm": 20.45831298828125, + "learning_rate": 1.0295778772539014e-05, + "loss": 0.893, + "step": 11599 + }, + { + "epoch": 1.46, + "grad_norm": 9.342596054077148, + "learning_rate": 1.0294942057482324e-05, + "loss": 0.4243, + "step": 11600 + }, + { + "epoch": 1.46, + "eval_loss": 0.0890585333108902, + "eval_runtime": 95.654, + "eval_samples_per_second": 37.029, + "eval_steps_per_second": 37.029, + "step": 11600 + }, + { + "epoch": 1.46, + "grad_norm": 17.461505889892578, + "learning_rate": 1.0294105342425638e-05, + "loss": 1.1512, + "step": 11601 + }, + { + "epoch": 1.46, + "grad_norm": 20.406530380249023, + "learning_rate": 1.0293268627368951e-05, + "loss": 1.7207, + "step": 11602 + }, + { + "epoch": 1.46, + "grad_norm": 17.772703170776367, + "learning_rate": 1.0292431912312262e-05, + "loss": 2.3751, + "step": 11603 + }, + { + "epoch": 1.46, + "grad_norm": 22.063190460205078, + "learning_rate": 1.0291595197255575e-05, + "loss": 1.8715, + "step": 11604 + }, + { + "epoch": 1.46, + "grad_norm": 15.428668022155762, + "learning_rate": 1.0290758482198889e-05, + "loss": 1.2716, + "step": 11605 + }, + { + "epoch": 1.46, + "grad_norm": 13.446057319641113, + "learning_rate": 1.0289921767142201e-05, + "loss": 0.7319, + "step": 11606 + }, + { + "epoch": 1.46, + "grad_norm": 12.71236515045166, + "learning_rate": 1.0289085052085513e-05, + "loss": 1.4276, + "step": 11607 + }, + { + "epoch": 1.46, + "grad_norm": 25.381593704223633, + "learning_rate": 1.0288248337028825e-05, + "loss": 0.5396, + "step": 11608 + }, + { + "epoch": 1.46, + "grad_norm": 5.780124187469482, + "learning_rate": 1.0287411621972139e-05, + "loss": 0.513, + "step": 11609 + }, + { + "epoch": 1.46, + "grad_norm": 4.070555210113525, + "learning_rate": 1.028657490691545e-05, + "loss": 1.0341, + "step": 11610 + }, + { + "epoch": 1.46, + "grad_norm": 12.839869499206543, + "learning_rate": 1.0285738191858762e-05, + "loss": 1.2353, + "step": 11611 + }, + { + "epoch": 1.46, + "grad_norm": 30.747859954833984, + "learning_rate": 1.0284901476802076e-05, + "loss": 1.0197, + "step": 11612 + }, + { + "epoch": 1.46, + "grad_norm": 18.160499572753906, + "learning_rate": 1.028406476174539e-05, + "loss": 1.733, + "step": 11613 + }, + { + "epoch": 1.46, + "grad_norm": 13.152453422546387, + "learning_rate": 1.02832280466887e-05, + "loss": 0.8829, + "step": 11614 + }, + { + "epoch": 1.46, + "grad_norm": 17.087106704711914, + "learning_rate": 1.0282391331632014e-05, + "loss": 0.381, + "step": 11615 + }, + { + "epoch": 1.46, + "grad_norm": 12.67386245727539, + "learning_rate": 1.0281554616575327e-05, + "loss": 1.6281, + "step": 11616 + }, + { + "epoch": 1.46, + "grad_norm": 13.952394485473633, + "learning_rate": 1.0280717901518638e-05, + "loss": 2.1248, + "step": 11617 + }, + { + "epoch": 1.46, + "grad_norm": 32.914794921875, + "learning_rate": 1.0279881186461951e-05, + "loss": 0.4341, + "step": 11618 + }, + { + "epoch": 1.46, + "grad_norm": 91.3016586303711, + "learning_rate": 1.0279044471405263e-05, + "loss": 0.8319, + "step": 11619 + }, + { + "epoch": 1.46, + "grad_norm": 21.45525550842285, + "learning_rate": 1.0278207756348577e-05, + "loss": 1.3928, + "step": 11620 + }, + { + "epoch": 1.46, + "grad_norm": 6.4357380867004395, + "learning_rate": 1.0277371041291889e-05, + "loss": 0.0834, + "step": 11621 + }, + { + "epoch": 1.46, + "grad_norm": 137.15390014648438, + "learning_rate": 1.02765343262352e-05, + "loss": 1.8562, + "step": 11622 + }, + { + "epoch": 1.46, + "grad_norm": 14.621424674987793, + "learning_rate": 1.0275697611178514e-05, + "loss": 0.7229, + "step": 11623 + }, + { + "epoch": 1.46, + "grad_norm": 191.5143280029297, + "learning_rate": 1.0274860896121825e-05, + "loss": 2.8116, + "step": 11624 + }, + { + "epoch": 1.46, + "grad_norm": 4.596951961517334, + "learning_rate": 1.0274024181065138e-05, + "loss": 0.1174, + "step": 11625 + }, + { + "epoch": 1.46, + "grad_norm": 19.01567268371582, + "learning_rate": 1.0273187466008452e-05, + "loss": 0.8861, + "step": 11626 + }, + { + "epoch": 1.46, + "grad_norm": 16.008623123168945, + "learning_rate": 1.0272350750951766e-05, + "loss": 1.0557, + "step": 11627 + }, + { + "epoch": 1.46, + "grad_norm": 12.518770217895508, + "learning_rate": 1.0271514035895076e-05, + "loss": 0.4727, + "step": 11628 + }, + { + "epoch": 1.46, + "grad_norm": 6.306007385253906, + "learning_rate": 1.027067732083839e-05, + "loss": 0.7893, + "step": 11629 + }, + { + "epoch": 1.46, + "grad_norm": 20.78609275817871, + "learning_rate": 1.0269840605781703e-05, + "loss": 1.0765, + "step": 11630 + }, + { + "epoch": 1.46, + "grad_norm": 5.802038669586182, + "learning_rate": 1.0269003890725013e-05, + "loss": 1.2745, + "step": 11631 + }, + { + "epoch": 1.46, + "grad_norm": 40.01700210571289, + "learning_rate": 1.0268167175668327e-05, + "loss": 1.4334, + "step": 11632 + }, + { + "epoch": 1.46, + "grad_norm": 16.79182243347168, + "learning_rate": 1.0267330460611639e-05, + "loss": 1.6118, + "step": 11633 + }, + { + "epoch": 1.46, + "grad_norm": 14.402016639709473, + "learning_rate": 1.0266493745554953e-05, + "loss": 0.9737, + "step": 11634 + }, + { + "epoch": 1.46, + "grad_norm": 90.98631286621094, + "learning_rate": 1.0265657030498265e-05, + "loss": 0.965, + "step": 11635 + }, + { + "epoch": 1.46, + "grad_norm": 13.878726959228516, + "learning_rate": 1.0264820315441577e-05, + "loss": 1.6883, + "step": 11636 + }, + { + "epoch": 1.46, + "grad_norm": 14.278339385986328, + "learning_rate": 1.026398360038489e-05, + "loss": 1.9722, + "step": 11637 + }, + { + "epoch": 1.46, + "grad_norm": 14.49180793762207, + "learning_rate": 1.02631468853282e-05, + "loss": 1.575, + "step": 11638 + }, + { + "epoch": 1.46, + "grad_norm": 9.01636791229248, + "learning_rate": 1.0262310170271514e-05, + "loss": 1.0054, + "step": 11639 + }, + { + "epoch": 1.46, + "grad_norm": 16.10940933227539, + "learning_rate": 1.0261473455214828e-05, + "loss": 2.1795, + "step": 11640 + }, + { + "epoch": 1.46, + "grad_norm": 11.715025901794434, + "learning_rate": 1.0260636740158141e-05, + "loss": 1.2947, + "step": 11641 + }, + { + "epoch": 1.46, + "grad_norm": 8.381911277770996, + "learning_rate": 1.0259800025101452e-05, + "loss": 0.3407, + "step": 11642 + }, + { + "epoch": 1.46, + "grad_norm": 7.727115154266357, + "learning_rate": 1.0258963310044765e-05, + "loss": 0.5429, + "step": 11643 + }, + { + "epoch": 1.46, + "grad_norm": 9.851998329162598, + "learning_rate": 1.0258126594988079e-05, + "loss": 0.8527, + "step": 11644 + }, + { + "epoch": 1.46, + "grad_norm": 42.08799743652344, + "learning_rate": 1.025728987993139e-05, + "loss": 1.1845, + "step": 11645 + }, + { + "epoch": 1.46, + "grad_norm": 3.101151704788208, + "learning_rate": 1.0256453164874703e-05, + "loss": 0.1981, + "step": 11646 + }, + { + "epoch": 1.46, + "grad_norm": 9.115474700927734, + "learning_rate": 1.0255616449818015e-05, + "loss": 0.5998, + "step": 11647 + }, + { + "epoch": 1.46, + "grad_norm": 10.739004135131836, + "learning_rate": 1.0254779734761328e-05, + "loss": 0.6968, + "step": 11648 + }, + { + "epoch": 1.46, + "grad_norm": 23.58644676208496, + "learning_rate": 1.025394301970464e-05, + "loss": 1.9058, + "step": 11649 + }, + { + "epoch": 1.46, + "grad_norm": 44.03055191040039, + "learning_rate": 1.0253106304647952e-05, + "loss": 1.0997, + "step": 11650 + }, + { + "epoch": 1.46, + "grad_norm": 17.007707595825195, + "learning_rate": 1.0252269589591266e-05, + "loss": 1.3267, + "step": 11651 + }, + { + "epoch": 1.46, + "grad_norm": 6.813281536102295, + "learning_rate": 1.0251432874534576e-05, + "loss": 0.6019, + "step": 11652 + }, + { + "epoch": 1.46, + "grad_norm": 7.804632186889648, + "learning_rate": 1.025059615947789e-05, + "loss": 0.4428, + "step": 11653 + }, + { + "epoch": 1.46, + "grad_norm": 51.19334030151367, + "learning_rate": 1.0249759444421204e-05, + "loss": 3.2358, + "step": 11654 + }, + { + "epoch": 1.46, + "grad_norm": 95.2354965209961, + "learning_rate": 1.0248922729364517e-05, + "loss": 3.8895, + "step": 11655 + }, + { + "epoch": 1.46, + "grad_norm": 31.65764808654785, + "learning_rate": 1.0248086014307828e-05, + "loss": 1.4485, + "step": 11656 + }, + { + "epoch": 1.46, + "grad_norm": 15.785746574401855, + "learning_rate": 1.0247249299251141e-05, + "loss": 0.3068, + "step": 11657 + }, + { + "epoch": 1.46, + "grad_norm": 9.289689064025879, + "learning_rate": 1.0246412584194453e-05, + "loss": 1.1379, + "step": 11658 + }, + { + "epoch": 1.46, + "grad_norm": 15.987249374389648, + "learning_rate": 1.0245575869137765e-05, + "loss": 1.2005, + "step": 11659 + }, + { + "epoch": 1.46, + "grad_norm": 17.456310272216797, + "learning_rate": 1.0244739154081079e-05, + "loss": 1.5594, + "step": 11660 + }, + { + "epoch": 1.46, + "grad_norm": 15.669245719909668, + "learning_rate": 1.024390243902439e-05, + "loss": 1.0324, + "step": 11661 + }, + { + "epoch": 1.46, + "grad_norm": 6.8458757400512695, + "learning_rate": 1.0243065723967704e-05, + "loss": 0.7236, + "step": 11662 + }, + { + "epoch": 1.46, + "grad_norm": 16.892261505126953, + "learning_rate": 1.0242229008911016e-05, + "loss": 0.7325, + "step": 11663 + }, + { + "epoch": 1.46, + "grad_norm": 10.99298095703125, + "learning_rate": 1.0241392293854328e-05, + "loss": 1.5616, + "step": 11664 + }, + { + "epoch": 1.46, + "grad_norm": 18.590248107910156, + "learning_rate": 1.0240555578797642e-05, + "loss": 0.6841, + "step": 11665 + }, + { + "epoch": 1.46, + "grad_norm": 37.73554229736328, + "learning_rate": 1.0239718863740952e-05, + "loss": 2.2575, + "step": 11666 + }, + { + "epoch": 1.46, + "grad_norm": 23.0293025970459, + "learning_rate": 1.0238882148684266e-05, + "loss": 1.0603, + "step": 11667 + }, + { + "epoch": 1.46, + "grad_norm": 6.224422931671143, + "learning_rate": 1.023804543362758e-05, + "loss": 0.7531, + "step": 11668 + }, + { + "epoch": 1.46, + "grad_norm": 14.43862533569336, + "learning_rate": 1.0237208718570893e-05, + "loss": 1.8931, + "step": 11669 + }, + { + "epoch": 1.46, + "grad_norm": 34.57533645629883, + "learning_rate": 1.0236372003514203e-05, + "loss": 0.9463, + "step": 11670 + }, + { + "epoch": 1.46, + "grad_norm": 18.28287124633789, + "learning_rate": 1.0235535288457517e-05, + "loss": 1.6143, + "step": 11671 + }, + { + "epoch": 1.46, + "grad_norm": 20.80086898803711, + "learning_rate": 1.0234698573400829e-05, + "loss": 3.2864, + "step": 11672 + }, + { + "epoch": 1.46, + "grad_norm": 13.415273666381836, + "learning_rate": 1.0233861858344141e-05, + "loss": 0.6393, + "step": 11673 + }, + { + "epoch": 1.47, + "grad_norm": 12.326879501342773, + "learning_rate": 1.0233025143287455e-05, + "loss": 2.4646, + "step": 11674 + }, + { + "epoch": 1.47, + "grad_norm": 17.38701629638672, + "learning_rate": 1.0232188428230767e-05, + "loss": 1.5567, + "step": 11675 + }, + { + "epoch": 1.47, + "grad_norm": 8.564024925231934, + "learning_rate": 1.023135171317408e-05, + "loss": 1.2473, + "step": 11676 + }, + { + "epoch": 1.47, + "grad_norm": 9.077590942382812, + "learning_rate": 1.023051499811739e-05, + "loss": 1.4386, + "step": 11677 + }, + { + "epoch": 1.47, + "grad_norm": 11.411284446716309, + "learning_rate": 1.0229678283060704e-05, + "loss": 1.2173, + "step": 11678 + }, + { + "epoch": 1.47, + "grad_norm": 14.678159713745117, + "learning_rate": 1.0228841568004018e-05, + "loss": 0.8474, + "step": 11679 + }, + { + "epoch": 1.47, + "grad_norm": 6.511855125427246, + "learning_rate": 1.0228004852947328e-05, + "loss": 0.5764, + "step": 11680 + }, + { + "epoch": 1.47, + "grad_norm": 22.524940490722656, + "learning_rate": 1.0227168137890642e-05, + "loss": 1.9294, + "step": 11681 + }, + { + "epoch": 1.47, + "grad_norm": 8.448037147521973, + "learning_rate": 1.0226331422833955e-05, + "loss": 1.3012, + "step": 11682 + }, + { + "epoch": 1.47, + "grad_norm": 16.592552185058594, + "learning_rate": 1.0225494707777269e-05, + "loss": 1.6058, + "step": 11683 + }, + { + "epoch": 1.47, + "grad_norm": 145.56182861328125, + "learning_rate": 1.022465799272058e-05, + "loss": 3.3452, + "step": 11684 + }, + { + "epoch": 1.47, + "grad_norm": 19.358047485351562, + "learning_rate": 1.0223821277663893e-05, + "loss": 1.5458, + "step": 11685 + }, + { + "epoch": 1.47, + "grad_norm": 7.307069301605225, + "learning_rate": 1.0222984562607205e-05, + "loss": 0.6352, + "step": 11686 + }, + { + "epoch": 1.47, + "grad_norm": 23.20427131652832, + "learning_rate": 1.0222147847550517e-05, + "loss": 2.1221, + "step": 11687 + }, + { + "epoch": 1.47, + "grad_norm": 13.527782440185547, + "learning_rate": 1.022131113249383e-05, + "loss": 1.2126, + "step": 11688 + }, + { + "epoch": 1.47, + "grad_norm": 133.5238494873047, + "learning_rate": 1.0220474417437142e-05, + "loss": 1.5857, + "step": 11689 + }, + { + "epoch": 1.47, + "grad_norm": 21.087724685668945, + "learning_rate": 1.0219637702380456e-05, + "loss": 1.7069, + "step": 11690 + }, + { + "epoch": 1.47, + "grad_norm": 60.96128463745117, + "learning_rate": 1.0218800987323766e-05, + "loss": 1.9141, + "step": 11691 + }, + { + "epoch": 1.47, + "grad_norm": 9.16915225982666, + "learning_rate": 1.021796427226708e-05, + "loss": 0.2113, + "step": 11692 + }, + { + "epoch": 1.47, + "grad_norm": 14.474244117736816, + "learning_rate": 1.0217127557210394e-05, + "loss": 0.7099, + "step": 11693 + }, + { + "epoch": 1.47, + "grad_norm": 13.534197807312012, + "learning_rate": 1.0216290842153704e-05, + "loss": 0.325, + "step": 11694 + }, + { + "epoch": 1.47, + "grad_norm": 11.091919898986816, + "learning_rate": 1.0215454127097017e-05, + "loss": 1.1304, + "step": 11695 + }, + { + "epoch": 1.47, + "grad_norm": 8.387577056884766, + "learning_rate": 1.0214617412040331e-05, + "loss": 0.4027, + "step": 11696 + }, + { + "epoch": 1.47, + "grad_norm": 20.336320877075195, + "learning_rate": 1.0213780696983645e-05, + "loss": 0.6635, + "step": 11697 + }, + { + "epoch": 1.47, + "grad_norm": 5.192074298858643, + "learning_rate": 1.0212943981926955e-05, + "loss": 1.2929, + "step": 11698 + }, + { + "epoch": 1.47, + "grad_norm": 27.580411911010742, + "learning_rate": 1.0212107266870269e-05, + "loss": 2.0222, + "step": 11699 + }, + { + "epoch": 1.47, + "grad_norm": 30.743709564208984, + "learning_rate": 1.021127055181358e-05, + "loss": 2.0479, + "step": 11700 + }, + { + "epoch": 1.47, + "grad_norm": 67.473388671875, + "learning_rate": 1.0210433836756893e-05, + "loss": 2.0701, + "step": 11701 + }, + { + "epoch": 1.47, + "grad_norm": 37.5473518371582, + "learning_rate": 1.0209597121700206e-05, + "loss": 1.7707, + "step": 11702 + }, + { + "epoch": 1.47, + "grad_norm": 29.353015899658203, + "learning_rate": 1.0208760406643518e-05, + "loss": 2.1933, + "step": 11703 + }, + { + "epoch": 1.47, + "grad_norm": 7.22294282913208, + "learning_rate": 1.0207923691586832e-05, + "loss": 0.662, + "step": 11704 + }, + { + "epoch": 1.47, + "grad_norm": 12.743423461914062, + "learning_rate": 1.0207086976530142e-05, + "loss": 0.8054, + "step": 11705 + }, + { + "epoch": 1.47, + "grad_norm": 7.4735307693481445, + "learning_rate": 1.0206250261473456e-05, + "loss": 0.7347, + "step": 11706 + }, + { + "epoch": 1.47, + "grad_norm": 7.776078224182129, + "learning_rate": 1.020541354641677e-05, + "loss": 0.4297, + "step": 11707 + }, + { + "epoch": 1.47, + "grad_norm": 13.706832885742188, + "learning_rate": 1.020457683136008e-05, + "loss": 0.3821, + "step": 11708 + }, + { + "epoch": 1.47, + "grad_norm": 18.440196990966797, + "learning_rate": 1.0203740116303393e-05, + "loss": 0.7917, + "step": 11709 + }, + { + "epoch": 1.47, + "grad_norm": 38.80813980102539, + "learning_rate": 1.0202903401246707e-05, + "loss": 1.9899, + "step": 11710 + }, + { + "epoch": 1.47, + "grad_norm": 13.319124221801758, + "learning_rate": 1.0202066686190019e-05, + "loss": 1.7305, + "step": 11711 + }, + { + "epoch": 1.47, + "grad_norm": 7.006318092346191, + "learning_rate": 1.0201229971133331e-05, + "loss": 2.9458, + "step": 11712 + }, + { + "epoch": 1.47, + "grad_norm": 15.509191513061523, + "learning_rate": 1.0200393256076645e-05, + "loss": 0.6453, + "step": 11713 + }, + { + "epoch": 1.47, + "grad_norm": 12.68134593963623, + "learning_rate": 1.0199556541019956e-05, + "loss": 0.6937, + "step": 11714 + }, + { + "epoch": 1.47, + "grad_norm": 11.146038055419922, + "learning_rate": 1.0198719825963268e-05, + "loss": 0.6571, + "step": 11715 + }, + { + "epoch": 1.47, + "grad_norm": 26.770910263061523, + "learning_rate": 1.0197883110906582e-05, + "loss": 2.2434, + "step": 11716 + }, + { + "epoch": 1.47, + "grad_norm": 21.979290008544922, + "learning_rate": 1.0197046395849894e-05, + "loss": 2.1595, + "step": 11717 + }, + { + "epoch": 1.47, + "grad_norm": 13.98854923248291, + "learning_rate": 1.0196209680793208e-05, + "loss": 1.1727, + "step": 11718 + }, + { + "epoch": 1.47, + "grad_norm": 12.81928825378418, + "learning_rate": 1.0195372965736518e-05, + "loss": 2.488, + "step": 11719 + }, + { + "epoch": 1.47, + "grad_norm": 6.362330913543701, + "learning_rate": 1.0194536250679832e-05, + "loss": 0.8804, + "step": 11720 + }, + { + "epoch": 1.47, + "grad_norm": 17.94073486328125, + "learning_rate": 1.0193699535623145e-05, + "loss": 0.5014, + "step": 11721 + }, + { + "epoch": 1.47, + "grad_norm": 5.776565074920654, + "learning_rate": 1.0192862820566455e-05, + "loss": 0.8061, + "step": 11722 + }, + { + "epoch": 1.47, + "grad_norm": 12.142158508300781, + "learning_rate": 1.0192026105509769e-05, + "loss": 0.6426, + "step": 11723 + }, + { + "epoch": 1.47, + "grad_norm": 17.069988250732422, + "learning_rate": 1.0191189390453083e-05, + "loss": 0.7519, + "step": 11724 + }, + { + "epoch": 1.47, + "grad_norm": 7.901285171508789, + "learning_rate": 1.0190352675396395e-05, + "loss": 0.4464, + "step": 11725 + }, + { + "epoch": 1.47, + "grad_norm": 26.143484115600586, + "learning_rate": 1.0189515960339707e-05, + "loss": 1.8073, + "step": 11726 + }, + { + "epoch": 1.47, + "grad_norm": 19.156314849853516, + "learning_rate": 1.018867924528302e-05, + "loss": 1.5151, + "step": 11727 + }, + { + "epoch": 1.47, + "grad_norm": 36.94829559326172, + "learning_rate": 1.0187842530226332e-05, + "loss": 1.6474, + "step": 11728 + }, + { + "epoch": 1.47, + "grad_norm": 6.899235725402832, + "learning_rate": 1.0187005815169644e-05, + "loss": 1.2121, + "step": 11729 + }, + { + "epoch": 1.47, + "grad_norm": 12.01082992553711, + "learning_rate": 1.0186169100112956e-05, + "loss": 0.622, + "step": 11730 + }, + { + "epoch": 1.47, + "grad_norm": 4.53358268737793, + "learning_rate": 1.018533238505627e-05, + "loss": 0.4688, + "step": 11731 + }, + { + "epoch": 1.47, + "grad_norm": 21.91617774963379, + "learning_rate": 1.0184495669999584e-05, + "loss": 2.0592, + "step": 11732 + }, + { + "epoch": 1.47, + "grad_norm": 41.274417877197266, + "learning_rate": 1.0183658954942894e-05, + "loss": 0.9033, + "step": 11733 + }, + { + "epoch": 1.47, + "grad_norm": 8.857417106628418, + "learning_rate": 1.0182822239886207e-05, + "loss": 0.8782, + "step": 11734 + }, + { + "epoch": 1.47, + "grad_norm": 5.053219795227051, + "learning_rate": 1.0181985524829521e-05, + "loss": 0.6986, + "step": 11735 + }, + { + "epoch": 1.47, + "grad_norm": 31.58706283569336, + "learning_rate": 1.0181148809772831e-05, + "loss": 0.7091, + "step": 11736 + }, + { + "epoch": 1.47, + "grad_norm": 13.136528015136719, + "learning_rate": 1.0180312094716145e-05, + "loss": 0.911, + "step": 11737 + }, + { + "epoch": 1.47, + "grad_norm": 12.264043807983398, + "learning_rate": 1.0179475379659459e-05, + "loss": 1.101, + "step": 11738 + }, + { + "epoch": 1.47, + "grad_norm": 13.618952751159668, + "learning_rate": 1.017863866460277e-05, + "loss": 0.866, + "step": 11739 + }, + { + "epoch": 1.47, + "grad_norm": 24.54608154296875, + "learning_rate": 1.0177801949546083e-05, + "loss": 1.0775, + "step": 11740 + }, + { + "epoch": 1.47, + "grad_norm": 14.495532989501953, + "learning_rate": 1.0176965234489396e-05, + "loss": 1.7906, + "step": 11741 + }, + { + "epoch": 1.47, + "grad_norm": 16.60027503967285, + "learning_rate": 1.0176128519432708e-05, + "loss": 1.4876, + "step": 11742 + }, + { + "epoch": 1.47, + "grad_norm": 28.480043411254883, + "learning_rate": 1.017529180437602e-05, + "loss": 1.5874, + "step": 11743 + }, + { + "epoch": 1.47, + "grad_norm": 15.660050392150879, + "learning_rate": 1.0174455089319332e-05, + "loss": 1.6086, + "step": 11744 + }, + { + "epoch": 1.47, + "grad_norm": 9.23560619354248, + "learning_rate": 1.0173618374262646e-05, + "loss": 0.8364, + "step": 11745 + }, + { + "epoch": 1.47, + "grad_norm": 8.3224458694458, + "learning_rate": 1.017278165920596e-05, + "loss": 0.8419, + "step": 11746 + }, + { + "epoch": 1.47, + "grad_norm": 22.351551055908203, + "learning_rate": 1.017194494414927e-05, + "loss": 1.2834, + "step": 11747 + }, + { + "epoch": 1.47, + "grad_norm": 11.089461326599121, + "learning_rate": 1.0171108229092583e-05, + "loss": 1.9218, + "step": 11748 + }, + { + "epoch": 1.47, + "grad_norm": 15.574318885803223, + "learning_rate": 1.0170271514035897e-05, + "loss": 0.879, + "step": 11749 + }, + { + "epoch": 1.47, + "grad_norm": 3.7962749004364014, + "learning_rate": 1.0169434798979207e-05, + "loss": 0.4112, + "step": 11750 + }, + { + "epoch": 1.47, + "grad_norm": 6.679157733917236, + "learning_rate": 1.016859808392252e-05, + "loss": 0.5855, + "step": 11751 + }, + { + "epoch": 1.47, + "grad_norm": 17.797319412231445, + "learning_rate": 1.0167761368865834e-05, + "loss": 2.4238, + "step": 11752 + }, + { + "epoch": 1.47, + "grad_norm": 19.303560256958008, + "learning_rate": 1.0166924653809146e-05, + "loss": 1.5157, + "step": 11753 + }, + { + "epoch": 1.48, + "grad_norm": 23.224414825439453, + "learning_rate": 1.0166087938752458e-05, + "loss": 3.1815, + "step": 11754 + }, + { + "epoch": 1.48, + "grad_norm": 27.317115783691406, + "learning_rate": 1.0165251223695772e-05, + "loss": 1.9885, + "step": 11755 + }, + { + "epoch": 1.48, + "grad_norm": 16.916921615600586, + "learning_rate": 1.0164414508639084e-05, + "loss": 0.9612, + "step": 11756 + }, + { + "epoch": 1.48, + "grad_norm": 8.244033813476562, + "learning_rate": 1.0163577793582396e-05, + "loss": 0.4901, + "step": 11757 + }, + { + "epoch": 1.48, + "grad_norm": 29.34827423095703, + "learning_rate": 1.0162741078525708e-05, + "loss": 1.8972, + "step": 11758 + }, + { + "epoch": 1.48, + "grad_norm": 8.552237510681152, + "learning_rate": 1.0161904363469022e-05, + "loss": 0.9352, + "step": 11759 + }, + { + "epoch": 1.48, + "grad_norm": 17.028141021728516, + "learning_rate": 1.0161067648412335e-05, + "loss": 0.9987, + "step": 11760 + }, + { + "epoch": 1.48, + "grad_norm": 28.46116828918457, + "learning_rate": 1.0160230933355645e-05, + "loss": 1.5082, + "step": 11761 + }, + { + "epoch": 1.48, + "grad_norm": 2.872819185256958, + "learning_rate": 1.0159394218298959e-05, + "loss": 0.2531, + "step": 11762 + }, + { + "epoch": 1.48, + "grad_norm": 23.133909225463867, + "learning_rate": 1.0158557503242273e-05, + "loss": 0.9149, + "step": 11763 + }, + { + "epoch": 1.48, + "grad_norm": 20.6193904876709, + "learning_rate": 1.0157720788185583e-05, + "loss": 0.6258, + "step": 11764 + }, + { + "epoch": 1.48, + "grad_norm": 14.060830116271973, + "learning_rate": 1.0156884073128897e-05, + "loss": 0.976, + "step": 11765 + }, + { + "epoch": 1.48, + "grad_norm": 11.146706581115723, + "learning_rate": 1.015604735807221e-05, + "loss": 0.2321, + "step": 11766 + }, + { + "epoch": 1.48, + "grad_norm": 36.6379280090332, + "learning_rate": 1.0155210643015522e-05, + "loss": 2.1008, + "step": 11767 + }, + { + "epoch": 1.48, + "grad_norm": 6.3384222984313965, + "learning_rate": 1.0154373927958834e-05, + "loss": 0.1667, + "step": 11768 + }, + { + "epoch": 1.48, + "grad_norm": 35.882598876953125, + "learning_rate": 1.0153537212902146e-05, + "loss": 0.6095, + "step": 11769 + }, + { + "epoch": 1.48, + "grad_norm": 15.172981262207031, + "learning_rate": 1.015270049784546e-05, + "loss": 0.8346, + "step": 11770 + }, + { + "epoch": 1.48, + "grad_norm": 32.03757858276367, + "learning_rate": 1.0151863782788772e-05, + "loss": 1.836, + "step": 11771 + }, + { + "epoch": 1.48, + "grad_norm": 12.183686256408691, + "learning_rate": 1.0151027067732084e-05, + "loss": 0.9534, + "step": 11772 + }, + { + "epoch": 1.48, + "grad_norm": 13.156387329101562, + "learning_rate": 1.0150190352675397e-05, + "loss": 0.6516, + "step": 11773 + }, + { + "epoch": 1.48, + "grad_norm": 78.90950775146484, + "learning_rate": 1.0149353637618711e-05, + "loss": 3.9082, + "step": 11774 + }, + { + "epoch": 1.48, + "grad_norm": 67.05281066894531, + "learning_rate": 1.0148516922562021e-05, + "loss": 3.3256, + "step": 11775 + }, + { + "epoch": 1.48, + "grad_norm": 5.540832042694092, + "learning_rate": 1.0147680207505335e-05, + "loss": 0.3358, + "step": 11776 + }, + { + "epoch": 1.48, + "grad_norm": 11.787993431091309, + "learning_rate": 1.0146843492448649e-05, + "loss": 0.7127, + "step": 11777 + }, + { + "epoch": 1.48, + "grad_norm": 62.75838851928711, + "learning_rate": 1.0146006777391959e-05, + "loss": 2.0421, + "step": 11778 + }, + { + "epoch": 1.48, + "grad_norm": 24.62049674987793, + "learning_rate": 1.0145170062335272e-05, + "loss": 0.9993, + "step": 11779 + }, + { + "epoch": 1.48, + "grad_norm": 22.030406951904297, + "learning_rate": 1.0144333347278586e-05, + "loss": 2.3176, + "step": 11780 + }, + { + "epoch": 1.48, + "grad_norm": 8.144664764404297, + "learning_rate": 1.0143496632221898e-05, + "loss": 0.5781, + "step": 11781 + }, + { + "epoch": 1.48, + "grad_norm": 9.777544021606445, + "learning_rate": 1.014265991716521e-05, + "loss": 0.6678, + "step": 11782 + }, + { + "epoch": 1.48, + "grad_norm": 8.740224838256836, + "learning_rate": 1.0141823202108522e-05, + "loss": 0.853, + "step": 11783 + }, + { + "epoch": 1.48, + "grad_norm": 12.109756469726562, + "learning_rate": 1.0140986487051836e-05, + "loss": 0.6272, + "step": 11784 + }, + { + "epoch": 1.48, + "grad_norm": 14.739649772644043, + "learning_rate": 1.0140149771995148e-05, + "loss": 0.5708, + "step": 11785 + }, + { + "epoch": 1.48, + "grad_norm": 7.770986557006836, + "learning_rate": 1.013931305693846e-05, + "loss": 1.0056, + "step": 11786 + }, + { + "epoch": 1.48, + "grad_norm": 13.864361763000488, + "learning_rate": 1.0138476341881773e-05, + "loss": 0.5382, + "step": 11787 + }, + { + "epoch": 1.48, + "grad_norm": 15.930030822753906, + "learning_rate": 1.0137639626825087e-05, + "loss": 2.6395, + "step": 11788 + }, + { + "epoch": 1.48, + "grad_norm": 11.665167808532715, + "learning_rate": 1.0136802911768397e-05, + "loss": 1.5669, + "step": 11789 + }, + { + "epoch": 1.48, + "grad_norm": 11.066813468933105, + "learning_rate": 1.013596619671171e-05, + "loss": 1.2515, + "step": 11790 + }, + { + "epoch": 1.48, + "grad_norm": 59.150367736816406, + "learning_rate": 1.0135129481655024e-05, + "loss": 2.82, + "step": 11791 + }, + { + "epoch": 1.48, + "grad_norm": 8.148847579956055, + "learning_rate": 1.0134292766598335e-05, + "loss": 0.5652, + "step": 11792 + }, + { + "epoch": 1.48, + "grad_norm": 11.0548095703125, + "learning_rate": 1.0133456051541648e-05, + "loss": 1.6382, + "step": 11793 + }, + { + "epoch": 1.48, + "grad_norm": 8.091031074523926, + "learning_rate": 1.0132619336484962e-05, + "loss": 0.7803, + "step": 11794 + }, + { + "epoch": 1.48, + "grad_norm": 13.253259658813477, + "learning_rate": 1.0131782621428274e-05, + "loss": 0.6461, + "step": 11795 + }, + { + "epoch": 1.48, + "grad_norm": 33.466670989990234, + "learning_rate": 1.0130945906371586e-05, + "loss": 1.5478, + "step": 11796 + }, + { + "epoch": 1.48, + "grad_norm": 44.28044509887695, + "learning_rate": 1.0130109191314898e-05, + "loss": 1.8743, + "step": 11797 + }, + { + "epoch": 1.48, + "grad_norm": 15.960838317871094, + "learning_rate": 1.0129272476258211e-05, + "loss": 0.6709, + "step": 11798 + }, + { + "epoch": 1.48, + "grad_norm": 18.216751098632812, + "learning_rate": 1.0128435761201523e-05, + "loss": 1.6836, + "step": 11799 + }, + { + "epoch": 1.48, + "grad_norm": 23.818519592285156, + "learning_rate": 1.0127599046144835e-05, + "loss": 0.7374, + "step": 11800 + }, + { + "epoch": 1.48, + "grad_norm": 13.40870475769043, + "learning_rate": 1.0126762331088149e-05, + "loss": 0.9878, + "step": 11801 + }, + { + "epoch": 1.48, + "grad_norm": 33.63993835449219, + "learning_rate": 1.0125925616031463e-05, + "loss": 1.8692, + "step": 11802 + }, + { + "epoch": 1.48, + "grad_norm": 24.853561401367188, + "learning_rate": 1.0125088900974773e-05, + "loss": 0.7594, + "step": 11803 + }, + { + "epoch": 1.48, + "grad_norm": 4.262231826782227, + "learning_rate": 1.0124252185918087e-05, + "loss": 1.6934, + "step": 11804 + }, + { + "epoch": 1.48, + "grad_norm": 17.921628952026367, + "learning_rate": 1.01234154708614e-05, + "loss": 2.2214, + "step": 11805 + }, + { + "epoch": 1.48, + "grad_norm": 22.366456985473633, + "learning_rate": 1.012257875580471e-05, + "loss": 2.1575, + "step": 11806 + }, + { + "epoch": 1.48, + "grad_norm": 17.08475112915039, + "learning_rate": 1.0121742040748024e-05, + "loss": 1.195, + "step": 11807 + }, + { + "epoch": 1.48, + "grad_norm": 28.62654685974121, + "learning_rate": 1.0120905325691338e-05, + "loss": 1.8959, + "step": 11808 + }, + { + "epoch": 1.48, + "grad_norm": 17.814733505249023, + "learning_rate": 1.012006861063465e-05, + "loss": 1.4859, + "step": 11809 + }, + { + "epoch": 1.48, + "grad_norm": 39.33662033081055, + "learning_rate": 1.0119231895577962e-05, + "loss": 1.2965, + "step": 11810 + }, + { + "epoch": 1.48, + "grad_norm": 7.899166584014893, + "learning_rate": 1.0118395180521274e-05, + "loss": 0.5111, + "step": 11811 + }, + { + "epoch": 1.48, + "grad_norm": 4.4526448249816895, + "learning_rate": 1.0117558465464587e-05, + "loss": 0.1027, + "step": 11812 + }, + { + "epoch": 1.48, + "grad_norm": 10.049847602844238, + "learning_rate": 1.01167217504079e-05, + "loss": 1.8753, + "step": 11813 + }, + { + "epoch": 1.48, + "grad_norm": 12.997302055358887, + "learning_rate": 1.0115885035351211e-05, + "loss": 0.8018, + "step": 11814 + }, + { + "epoch": 1.48, + "grad_norm": 8.1620454788208, + "learning_rate": 1.0115048320294525e-05, + "loss": 0.5171, + "step": 11815 + }, + { + "epoch": 1.48, + "grad_norm": 19.684127807617188, + "learning_rate": 1.0114211605237839e-05, + "loss": 1.2105, + "step": 11816 + }, + { + "epoch": 1.48, + "grad_norm": 32.90684127807617, + "learning_rate": 1.0113374890181149e-05, + "loss": 1.4728, + "step": 11817 + }, + { + "epoch": 1.48, + "grad_norm": 4.621243953704834, + "learning_rate": 1.0112538175124462e-05, + "loss": 0.1001, + "step": 11818 + }, + { + "epoch": 1.48, + "grad_norm": 11.834315299987793, + "learning_rate": 1.0111701460067776e-05, + "loss": 1.2464, + "step": 11819 + }, + { + "epoch": 1.48, + "grad_norm": 48.37236404418945, + "learning_rate": 1.0110864745011086e-05, + "loss": 2.0795, + "step": 11820 + }, + { + "epoch": 1.48, + "grad_norm": 24.91161346435547, + "learning_rate": 1.01100280299544e-05, + "loss": 1.2457, + "step": 11821 + }, + { + "epoch": 1.48, + "grad_norm": 9.79609489440918, + "learning_rate": 1.0109191314897712e-05, + "loss": 0.3236, + "step": 11822 + }, + { + "epoch": 1.48, + "grad_norm": 16.0001163482666, + "learning_rate": 1.0108354599841026e-05, + "loss": 1.9801, + "step": 11823 + }, + { + "epoch": 1.48, + "grad_norm": 6.595298767089844, + "learning_rate": 1.0107517884784338e-05, + "loss": 1.3833, + "step": 11824 + }, + { + "epoch": 1.48, + "grad_norm": 20.870195388793945, + "learning_rate": 1.010668116972765e-05, + "loss": 1.9719, + "step": 11825 + }, + { + "epoch": 1.48, + "grad_norm": 30.148893356323242, + "learning_rate": 1.0105844454670963e-05, + "loss": 0.4573, + "step": 11826 + }, + { + "epoch": 1.48, + "grad_norm": 14.521225929260254, + "learning_rate": 1.0105007739614275e-05, + "loss": 0.3858, + "step": 11827 + }, + { + "epoch": 1.48, + "grad_norm": 7.839728355407715, + "learning_rate": 1.0104171024557587e-05, + "loss": 1.4502, + "step": 11828 + }, + { + "epoch": 1.48, + "grad_norm": 11.859395980834961, + "learning_rate": 1.01033343095009e-05, + "loss": 1.0715, + "step": 11829 + }, + { + "epoch": 1.48, + "grad_norm": 15.987799644470215, + "learning_rate": 1.0102497594444214e-05, + "loss": 1.0691, + "step": 11830 + }, + { + "epoch": 1.48, + "grad_norm": 7.629037857055664, + "learning_rate": 1.0101660879387525e-05, + "loss": 0.5356, + "step": 11831 + }, + { + "epoch": 1.48, + "grad_norm": 3.65165638923645, + "learning_rate": 1.0100824164330838e-05, + "loss": 0.2964, + "step": 11832 + }, + { + "epoch": 1.49, + "grad_norm": 9.224053382873535, + "learning_rate": 1.0099987449274152e-05, + "loss": 1.3201, + "step": 11833 + }, + { + "epoch": 1.49, + "grad_norm": 22.002063751220703, + "learning_rate": 1.0099150734217462e-05, + "loss": 1.6502, + "step": 11834 + }, + { + "epoch": 1.49, + "grad_norm": 13.647278785705566, + "learning_rate": 1.0098314019160776e-05, + "loss": 0.9005, + "step": 11835 + }, + { + "epoch": 1.49, + "grad_norm": 9.864240646362305, + "learning_rate": 1.0097477304104088e-05, + "loss": 0.5403, + "step": 11836 + }, + { + "epoch": 1.49, + "grad_norm": 11.673495292663574, + "learning_rate": 1.0096640589047401e-05, + "loss": 0.8332, + "step": 11837 + }, + { + "epoch": 1.49, + "grad_norm": 15.670794486999512, + "learning_rate": 1.0095803873990713e-05, + "loss": 1.6456, + "step": 11838 + }, + { + "epoch": 1.49, + "grad_norm": 33.64060592651367, + "learning_rate": 1.0094967158934025e-05, + "loss": 1.1097, + "step": 11839 + }, + { + "epoch": 1.49, + "grad_norm": 7.889852046966553, + "learning_rate": 1.0094130443877339e-05, + "loss": 0.3595, + "step": 11840 + }, + { + "epoch": 1.49, + "grad_norm": 14.493781089782715, + "learning_rate": 1.009329372882065e-05, + "loss": 0.4723, + "step": 11841 + }, + { + "epoch": 1.49, + "grad_norm": 21.568572998046875, + "learning_rate": 1.0092457013763963e-05, + "loss": 0.2411, + "step": 11842 + }, + { + "epoch": 1.49, + "grad_norm": 16.096431732177734, + "learning_rate": 1.0091620298707277e-05, + "loss": 1.5888, + "step": 11843 + }, + { + "epoch": 1.49, + "grad_norm": 11.522690773010254, + "learning_rate": 1.009078358365059e-05, + "loss": 0.1546, + "step": 11844 + }, + { + "epoch": 1.49, + "grad_norm": 44.40749740600586, + "learning_rate": 1.00899468685939e-05, + "loss": 0.5052, + "step": 11845 + }, + { + "epoch": 1.49, + "grad_norm": 12.55154037475586, + "learning_rate": 1.0089110153537214e-05, + "loss": 1.7197, + "step": 11846 + }, + { + "epoch": 1.49, + "grad_norm": 15.280006408691406, + "learning_rate": 1.0088273438480528e-05, + "loss": 1.7976, + "step": 11847 + }, + { + "epoch": 1.49, + "grad_norm": 25.104217529296875, + "learning_rate": 1.0087436723423838e-05, + "loss": 1.8859, + "step": 11848 + }, + { + "epoch": 1.49, + "grad_norm": 20.182933807373047, + "learning_rate": 1.0086600008367152e-05, + "loss": 0.8879, + "step": 11849 + }, + { + "epoch": 1.49, + "grad_norm": 10.312606811523438, + "learning_rate": 1.0085763293310464e-05, + "loss": 1.0224, + "step": 11850 + }, + { + "epoch": 1.49, + "grad_norm": 17.0831298828125, + "learning_rate": 1.0084926578253777e-05, + "loss": 1.0411, + "step": 11851 + }, + { + "epoch": 1.49, + "grad_norm": 8.599349975585938, + "learning_rate": 1.008408986319709e-05, + "loss": 1.3262, + "step": 11852 + }, + { + "epoch": 1.49, + "grad_norm": 16.35350227355957, + "learning_rate": 1.0083253148140401e-05, + "loss": 0.7316, + "step": 11853 + }, + { + "epoch": 1.49, + "grad_norm": 213.29212951660156, + "learning_rate": 1.0082416433083715e-05, + "loss": 1.3068, + "step": 11854 + }, + { + "epoch": 1.49, + "grad_norm": 63.58739471435547, + "learning_rate": 1.0081579718027025e-05, + "loss": 1.922, + "step": 11855 + }, + { + "epoch": 1.49, + "grad_norm": 41.69790267944336, + "learning_rate": 1.0080743002970339e-05, + "loss": 1.5676, + "step": 11856 + }, + { + "epoch": 1.49, + "grad_norm": 34.60993194580078, + "learning_rate": 1.0079906287913652e-05, + "loss": 0.8385, + "step": 11857 + }, + { + "epoch": 1.49, + "grad_norm": 67.94532775878906, + "learning_rate": 1.0079069572856966e-05, + "loss": 2.7407, + "step": 11858 + }, + { + "epoch": 1.49, + "grad_norm": 12.518784523010254, + "learning_rate": 1.0078232857800276e-05, + "loss": 2.2123, + "step": 11859 + }, + { + "epoch": 1.49, + "grad_norm": 19.069976806640625, + "learning_rate": 1.007739614274359e-05, + "loss": 1.5177, + "step": 11860 + }, + { + "epoch": 1.49, + "grad_norm": 92.6587142944336, + "learning_rate": 1.0076559427686904e-05, + "loss": 2.7086, + "step": 11861 + }, + { + "epoch": 1.49, + "grad_norm": 23.413423538208008, + "learning_rate": 1.0075722712630214e-05, + "loss": 1.2732, + "step": 11862 + }, + { + "epoch": 1.49, + "grad_norm": 15.536075592041016, + "learning_rate": 1.0074885997573528e-05, + "loss": 2.6961, + "step": 11863 + }, + { + "epoch": 1.49, + "grad_norm": 22.799394607543945, + "learning_rate": 1.007404928251684e-05, + "loss": 1.3043, + "step": 11864 + }, + { + "epoch": 1.49, + "grad_norm": 11.75416088104248, + "learning_rate": 1.0073212567460153e-05, + "loss": 1.0192, + "step": 11865 + }, + { + "epoch": 1.49, + "grad_norm": 19.864070892333984, + "learning_rate": 1.0072375852403465e-05, + "loss": 2.7687, + "step": 11866 + }, + { + "epoch": 1.49, + "grad_norm": 5.751663684844971, + "learning_rate": 1.0071539137346777e-05, + "loss": 0.5699, + "step": 11867 + }, + { + "epoch": 1.49, + "grad_norm": 33.524044036865234, + "learning_rate": 1.007070242229009e-05, + "loss": 1.9562, + "step": 11868 + }, + { + "epoch": 1.49, + "grad_norm": 8.924762725830078, + "learning_rate": 1.0069865707233401e-05, + "loss": 2.3523, + "step": 11869 + }, + { + "epoch": 1.49, + "grad_norm": 17.027732849121094, + "learning_rate": 1.0069028992176715e-05, + "loss": 0.8819, + "step": 11870 + }, + { + "epoch": 1.49, + "grad_norm": 4.360075950622559, + "learning_rate": 1.0068192277120028e-05, + "loss": 0.2553, + "step": 11871 + }, + { + "epoch": 1.49, + "grad_norm": 7.308431148529053, + "learning_rate": 1.0067355562063342e-05, + "loss": 0.9271, + "step": 11872 + }, + { + "epoch": 1.49, + "grad_norm": 6.431609630584717, + "learning_rate": 1.0066518847006652e-05, + "loss": 0.7778, + "step": 11873 + }, + { + "epoch": 1.49, + "grad_norm": 115.9853286743164, + "learning_rate": 1.0065682131949966e-05, + "loss": 2.2345, + "step": 11874 + }, + { + "epoch": 1.49, + "grad_norm": 18.295310974121094, + "learning_rate": 1.0064845416893278e-05, + "loss": 2.7087, + "step": 11875 + }, + { + "epoch": 1.49, + "grad_norm": 12.80589485168457, + "learning_rate": 1.006400870183659e-05, + "loss": 0.4732, + "step": 11876 + }, + { + "epoch": 1.49, + "grad_norm": 15.119963645935059, + "learning_rate": 1.0063171986779903e-05, + "loss": 1.844, + "step": 11877 + }, + { + "epoch": 1.49, + "grad_norm": 41.419559478759766, + "learning_rate": 1.0062335271723215e-05, + "loss": 2.2955, + "step": 11878 + }, + { + "epoch": 1.49, + "grad_norm": 14.625669479370117, + "learning_rate": 1.0061498556666529e-05, + "loss": 2.0124, + "step": 11879 + }, + { + "epoch": 1.49, + "grad_norm": 4.444066524505615, + "learning_rate": 1.006066184160984e-05, + "loss": 1.4116, + "step": 11880 + }, + { + "epoch": 1.49, + "grad_norm": 12.593189239501953, + "learning_rate": 1.0059825126553153e-05, + "loss": 0.9024, + "step": 11881 + }, + { + "epoch": 1.49, + "grad_norm": 9.293560028076172, + "learning_rate": 1.0058988411496467e-05, + "loss": 0.8693, + "step": 11882 + }, + { + "epoch": 1.49, + "grad_norm": 14.213584899902344, + "learning_rate": 1.0058151696439777e-05, + "loss": 1.1086, + "step": 11883 + }, + { + "epoch": 1.49, + "grad_norm": 20.005144119262695, + "learning_rate": 1.005731498138309e-05, + "loss": 1.6349, + "step": 11884 + }, + { + "epoch": 1.49, + "grad_norm": 6.507580757141113, + "learning_rate": 1.0056478266326404e-05, + "loss": 1.2457, + "step": 11885 + }, + { + "epoch": 1.49, + "grad_norm": 28.846912384033203, + "learning_rate": 1.0055641551269718e-05, + "loss": 2.2322, + "step": 11886 + }, + { + "epoch": 1.49, + "grad_norm": 16.472408294677734, + "learning_rate": 1.0054804836213028e-05, + "loss": 1.6491, + "step": 11887 + }, + { + "epoch": 1.49, + "grad_norm": 6.475840091705322, + "learning_rate": 1.0053968121156342e-05, + "loss": 1.383, + "step": 11888 + }, + { + "epoch": 1.49, + "grad_norm": 8.640379905700684, + "learning_rate": 1.0053131406099654e-05, + "loss": 1.076, + "step": 11889 + }, + { + "epoch": 1.49, + "grad_norm": 9.46177864074707, + "learning_rate": 1.0052294691042966e-05, + "loss": 0.9874, + "step": 11890 + }, + { + "epoch": 1.49, + "grad_norm": 23.13226318359375, + "learning_rate": 1.005145797598628e-05, + "loss": 1.8804, + "step": 11891 + }, + { + "epoch": 1.49, + "grad_norm": 76.4865951538086, + "learning_rate": 1.0050621260929591e-05, + "loss": 2.7426, + "step": 11892 + }, + { + "epoch": 1.49, + "grad_norm": 33.265350341796875, + "learning_rate": 1.0049784545872905e-05, + "loss": 0.8085, + "step": 11893 + }, + { + "epoch": 1.49, + "grad_norm": 11.67562484741211, + "learning_rate": 1.0048947830816215e-05, + "loss": 1.2942, + "step": 11894 + }, + { + "epoch": 1.49, + "grad_norm": 10.46027660369873, + "learning_rate": 1.0048111115759529e-05, + "loss": 0.5026, + "step": 11895 + }, + { + "epoch": 1.49, + "grad_norm": 17.637737274169922, + "learning_rate": 1.0047274400702842e-05, + "loss": 0.471, + "step": 11896 + }, + { + "epoch": 1.49, + "grad_norm": 76.73388671875, + "learning_rate": 1.0046437685646153e-05, + "loss": 1.0768, + "step": 11897 + }, + { + "epoch": 1.49, + "grad_norm": 9.723772048950195, + "learning_rate": 1.0045600970589466e-05, + "loss": 1.6841, + "step": 11898 + }, + { + "epoch": 1.49, + "grad_norm": 10.220072746276855, + "learning_rate": 1.004476425553278e-05, + "loss": 2.0881, + "step": 11899 + }, + { + "epoch": 1.49, + "grad_norm": 2.628190040588379, + "learning_rate": 1.0043927540476094e-05, + "loss": 0.1753, + "step": 11900 + }, + { + "epoch": 1.49, + "grad_norm": 10.272156715393066, + "learning_rate": 1.0043090825419404e-05, + "loss": 0.6721, + "step": 11901 + }, + { + "epoch": 1.49, + "grad_norm": 9.18677806854248, + "learning_rate": 1.0042254110362717e-05, + "loss": 1.1368, + "step": 11902 + }, + { + "epoch": 1.49, + "grad_norm": 13.048550605773926, + "learning_rate": 1.004141739530603e-05, + "loss": 0.9223, + "step": 11903 + }, + { + "epoch": 1.49, + "grad_norm": 16.681678771972656, + "learning_rate": 1.0040580680249341e-05, + "loss": 2.6372, + "step": 11904 + }, + { + "epoch": 1.49, + "grad_norm": 5.473873615264893, + "learning_rate": 1.0039743965192655e-05, + "loss": 1.0693, + "step": 11905 + }, + { + "epoch": 1.49, + "grad_norm": 14.33332633972168, + "learning_rate": 1.0038907250135967e-05, + "loss": 0.9577, + "step": 11906 + }, + { + "epoch": 1.49, + "grad_norm": 14.142997741699219, + "learning_rate": 1.003807053507928e-05, + "loss": 1.1256, + "step": 11907 + }, + { + "epoch": 1.49, + "grad_norm": 21.48374366760254, + "learning_rate": 1.0037233820022591e-05, + "loss": 2.2423, + "step": 11908 + }, + { + "epoch": 1.49, + "grad_norm": 28.318239212036133, + "learning_rate": 1.0036397104965905e-05, + "loss": 3.5075, + "step": 11909 + }, + { + "epoch": 1.49, + "grad_norm": 10.63522720336914, + "learning_rate": 1.0035560389909218e-05, + "loss": 0.3394, + "step": 11910 + }, + { + "epoch": 1.49, + "grad_norm": 15.294273376464844, + "learning_rate": 1.0034723674852528e-05, + "loss": 1.1554, + "step": 11911 + }, + { + "epoch": 1.49, + "grad_norm": 14.271414756774902, + "learning_rate": 1.0033886959795842e-05, + "loss": 0.4573, + "step": 11912 + }, + { + "epoch": 1.5, + "grad_norm": 1.3673518896102905, + "learning_rate": 1.0033050244739156e-05, + "loss": 0.037, + "step": 11913 + }, + { + "epoch": 1.5, + "grad_norm": 11.795802116394043, + "learning_rate": 1.0032213529682468e-05, + "loss": 0.5738, + "step": 11914 + }, + { + "epoch": 1.5, + "grad_norm": 182.52854919433594, + "learning_rate": 1.003137681462578e-05, + "loss": 1.6654, + "step": 11915 + }, + { + "epoch": 1.5, + "grad_norm": 16.431949615478516, + "learning_rate": 1.0030540099569093e-05, + "loss": 0.8176, + "step": 11916 + }, + { + "epoch": 1.5, + "grad_norm": 27.6535701751709, + "learning_rate": 1.0029703384512405e-05, + "loss": 2.2308, + "step": 11917 + }, + { + "epoch": 1.5, + "grad_norm": 16.68084144592285, + "learning_rate": 1.0028866669455717e-05, + "loss": 1.2446, + "step": 11918 + }, + { + "epoch": 1.5, + "grad_norm": 7.015377521514893, + "learning_rate": 1.0028029954399031e-05, + "loss": 1.03, + "step": 11919 + }, + { + "epoch": 1.5, + "grad_norm": 59.408451080322266, + "learning_rate": 1.0027193239342343e-05, + "loss": 1.3593, + "step": 11920 + }, + { + "epoch": 1.5, + "grad_norm": 112.39595794677734, + "learning_rate": 1.0026356524285656e-05, + "loss": 2.6855, + "step": 11921 + }, + { + "epoch": 1.5, + "grad_norm": 7.35781192779541, + "learning_rate": 1.0025519809228967e-05, + "loss": 2.5156, + "step": 11922 + }, + { + "epoch": 1.5, + "grad_norm": 3.9637932777404785, + "learning_rate": 1.002468309417228e-05, + "loss": 0.4721, + "step": 11923 + }, + { + "epoch": 1.5, + "grad_norm": 49.070621490478516, + "learning_rate": 1.0023846379115594e-05, + "loss": 0.9895, + "step": 11924 + }, + { + "epoch": 1.5, + "grad_norm": 17.80061912536621, + "learning_rate": 1.0023009664058904e-05, + "loss": 0.7227, + "step": 11925 + }, + { + "epoch": 1.5, + "grad_norm": 12.987116813659668, + "learning_rate": 1.0022172949002218e-05, + "loss": 0.3634, + "step": 11926 + }, + { + "epoch": 1.5, + "grad_norm": 6.7396697998046875, + "learning_rate": 1.0021336233945532e-05, + "loss": 0.936, + "step": 11927 + }, + { + "epoch": 1.5, + "grad_norm": 7.231090068817139, + "learning_rate": 1.0020499518888844e-05, + "loss": 0.8827, + "step": 11928 + }, + { + "epoch": 1.5, + "grad_norm": 4.010375499725342, + "learning_rate": 1.0019662803832155e-05, + "loss": 0.4028, + "step": 11929 + }, + { + "epoch": 1.5, + "grad_norm": 7.310550689697266, + "learning_rate": 1.0018826088775469e-05, + "loss": 0.2468, + "step": 11930 + }, + { + "epoch": 1.5, + "grad_norm": 12.716805458068848, + "learning_rate": 1.0017989373718781e-05, + "loss": 0.373, + "step": 11931 + }, + { + "epoch": 1.5, + "grad_norm": 98.79090118408203, + "learning_rate": 1.0017152658662093e-05, + "loss": 1.7562, + "step": 11932 + }, + { + "epoch": 1.5, + "grad_norm": 7.764037609100342, + "learning_rate": 1.0016315943605405e-05, + "loss": 0.3759, + "step": 11933 + }, + { + "epoch": 1.5, + "grad_norm": 6.833138465881348, + "learning_rate": 1.0015479228548719e-05, + "loss": 0.3039, + "step": 11934 + }, + { + "epoch": 1.5, + "grad_norm": 8.394643783569336, + "learning_rate": 1.0014642513492032e-05, + "loss": 0.3746, + "step": 11935 + }, + { + "epoch": 1.5, + "grad_norm": 13.278569221496582, + "learning_rate": 1.0013805798435343e-05, + "loss": 0.7913, + "step": 11936 + }, + { + "epoch": 1.5, + "grad_norm": 123.66014099121094, + "learning_rate": 1.0012969083378656e-05, + "loss": 0.8893, + "step": 11937 + }, + { + "epoch": 1.5, + "grad_norm": 4.467344284057617, + "learning_rate": 1.001213236832197e-05, + "loss": 0.2551, + "step": 11938 + }, + { + "epoch": 1.5, + "grad_norm": 34.95133590698242, + "learning_rate": 1.001129565326528e-05, + "loss": 2.1504, + "step": 11939 + }, + { + "epoch": 1.5, + "grad_norm": 34.456600189208984, + "learning_rate": 1.0010458938208594e-05, + "loss": 2.4087, + "step": 11940 + }, + { + "epoch": 1.5, + "grad_norm": 15.087719917297363, + "learning_rate": 1.0009622223151907e-05, + "loss": 1.2157, + "step": 11941 + }, + { + "epoch": 1.5, + "grad_norm": 33.45515441894531, + "learning_rate": 1.000878550809522e-05, + "loss": 1.5863, + "step": 11942 + }, + { + "epoch": 1.5, + "grad_norm": 11.268248558044434, + "learning_rate": 1.0007948793038531e-05, + "loss": 0.7718, + "step": 11943 + }, + { + "epoch": 1.5, + "grad_norm": 197.91110229492188, + "learning_rate": 1.0007112077981845e-05, + "loss": 2.3887, + "step": 11944 + }, + { + "epoch": 1.5, + "grad_norm": 10.988476753234863, + "learning_rate": 1.0006275362925157e-05, + "loss": 1.7683, + "step": 11945 + }, + { + "epoch": 1.5, + "grad_norm": 9.742283821105957, + "learning_rate": 1.0005438647868469e-05, + "loss": 1.1903, + "step": 11946 + }, + { + "epoch": 1.5, + "grad_norm": 17.585189819335938, + "learning_rate": 1.000460193281178e-05, + "loss": 1.2422, + "step": 11947 + }, + { + "epoch": 1.5, + "grad_norm": 4.5751471519470215, + "learning_rate": 1.0003765217755094e-05, + "loss": 0.0918, + "step": 11948 + }, + { + "epoch": 1.5, + "grad_norm": 12.938639640808105, + "learning_rate": 1.0002928502698408e-05, + "loss": 0.7853, + "step": 11949 + }, + { + "epoch": 1.5, + "grad_norm": 25.627052307128906, + "learning_rate": 1.0002091787641718e-05, + "loss": 2.8248, + "step": 11950 + }, + { + "epoch": 1.5, + "grad_norm": 55.588802337646484, + "learning_rate": 1.0001255072585032e-05, + "loss": 2.367, + "step": 11951 + }, + { + "epoch": 1.5, + "grad_norm": 9.790284156799316, + "learning_rate": 1.0000418357528346e-05, + "loss": 0.5722, + "step": 11952 + }, + { + "epoch": 1.5, + "grad_norm": 141.81297302246094, + "learning_rate": 9.999581642471658e-06, + "loss": 1.7636, + "step": 11953 + }, + { + "epoch": 1.5, + "grad_norm": 11.381845474243164, + "learning_rate": 9.99874492741497e-06, + "loss": 0.326, + "step": 11954 + }, + { + "epoch": 1.5, + "grad_norm": 21.792057037353516, + "learning_rate": 9.997908212358283e-06, + "loss": 1.5584, + "step": 11955 + }, + { + "epoch": 1.5, + "grad_norm": 6.758552074432373, + "learning_rate": 9.997071497301595e-06, + "loss": 0.5703, + "step": 11956 + }, + { + "epoch": 1.5, + "grad_norm": 15.068915367126465, + "learning_rate": 9.996234782244907e-06, + "loss": 2.2138, + "step": 11957 + }, + { + "epoch": 1.5, + "grad_norm": 25.37798309326172, + "learning_rate": 9.99539806718822e-06, + "loss": 2.4027, + "step": 11958 + }, + { + "epoch": 1.5, + "grad_norm": 16.0709171295166, + "learning_rate": 9.994561352131533e-06, + "loss": 1.8126, + "step": 11959 + }, + { + "epoch": 1.5, + "grad_norm": 19.48594856262207, + "learning_rate": 9.993724637074845e-06, + "loss": 1.2439, + "step": 11960 + }, + { + "epoch": 1.5, + "grad_norm": 10.05398178100586, + "learning_rate": 9.992887922018157e-06, + "loss": 0.5253, + "step": 11961 + }, + { + "epoch": 1.5, + "grad_norm": 17.040876388549805, + "learning_rate": 9.99205120696147e-06, + "loss": 0.7245, + "step": 11962 + }, + { + "epoch": 1.5, + "grad_norm": 11.07220458984375, + "learning_rate": 9.991214491904782e-06, + "loss": 1.1667, + "step": 11963 + }, + { + "epoch": 1.5, + "grad_norm": 25.889963150024414, + "learning_rate": 9.990377776848094e-06, + "loss": 1.5575, + "step": 11964 + }, + { + "epoch": 1.5, + "grad_norm": 6.3265814781188965, + "learning_rate": 9.989541061791408e-06, + "loss": 0.2746, + "step": 11965 + }, + { + "epoch": 1.5, + "grad_norm": 22.10900115966797, + "learning_rate": 9.98870434673472e-06, + "loss": 1.2605, + "step": 11966 + }, + { + "epoch": 1.5, + "grad_norm": 15.18563461303711, + "learning_rate": 9.987867631678033e-06, + "loss": 1.4546, + "step": 11967 + }, + { + "epoch": 1.5, + "grad_norm": 16.048877716064453, + "learning_rate": 9.987030916621345e-06, + "loss": 0.7405, + "step": 11968 + }, + { + "epoch": 1.5, + "grad_norm": 12.02132511138916, + "learning_rate": 9.986194201564659e-06, + "loss": 1.2132, + "step": 11969 + }, + { + "epoch": 1.5, + "grad_norm": 9.891904830932617, + "learning_rate": 9.985357486507971e-06, + "loss": 2.4866, + "step": 11970 + }, + { + "epoch": 1.5, + "grad_norm": 5.975489139556885, + "learning_rate": 9.984520771451283e-06, + "loss": 1.2783, + "step": 11971 + }, + { + "epoch": 1.5, + "grad_norm": 5.509067535400391, + "learning_rate": 9.983684056394597e-06, + "loss": 0.5775, + "step": 11972 + }, + { + "epoch": 1.5, + "grad_norm": 7.878676891326904, + "learning_rate": 9.982847341337909e-06, + "loss": 1.0309, + "step": 11973 + }, + { + "epoch": 1.5, + "grad_norm": 9.921243667602539, + "learning_rate": 9.98201062628122e-06, + "loss": 0.5512, + "step": 11974 + }, + { + "epoch": 1.5, + "grad_norm": 68.78978729248047, + "learning_rate": 9.981173911224533e-06, + "loss": 2.0138, + "step": 11975 + }, + { + "epoch": 1.5, + "grad_norm": 12.189335823059082, + "learning_rate": 9.980337196167846e-06, + "loss": 0.6528, + "step": 11976 + }, + { + "epoch": 1.5, + "grad_norm": 8.658945083618164, + "learning_rate": 9.979500481111158e-06, + "loss": 0.5057, + "step": 11977 + }, + { + "epoch": 1.5, + "grad_norm": 51.37008285522461, + "learning_rate": 9.97866376605447e-06, + "loss": 1.6466, + "step": 11978 + }, + { + "epoch": 1.5, + "grad_norm": 6.714479923248291, + "learning_rate": 9.977827050997784e-06, + "loss": 0.7203, + "step": 11979 + }, + { + "epoch": 1.5, + "grad_norm": 25.759918212890625, + "learning_rate": 9.976990335941096e-06, + "loss": 1.7984, + "step": 11980 + }, + { + "epoch": 1.5, + "grad_norm": 29.47275161743164, + "learning_rate": 9.97615362088441e-06, + "loss": 0.9928, + "step": 11981 + }, + { + "epoch": 1.5, + "grad_norm": 8.146074295043945, + "learning_rate": 9.975316905827721e-06, + "loss": 0.5818, + "step": 11982 + }, + { + "epoch": 1.5, + "grad_norm": 7.408407211303711, + "learning_rate": 9.974480190771035e-06, + "loss": 0.173, + "step": 11983 + }, + { + "epoch": 1.5, + "grad_norm": 10.282600402832031, + "learning_rate": 9.973643475714347e-06, + "loss": 1.6004, + "step": 11984 + }, + { + "epoch": 1.5, + "grad_norm": 8.530019760131836, + "learning_rate": 9.972806760657659e-06, + "loss": 0.836, + "step": 11985 + }, + { + "epoch": 1.5, + "grad_norm": 8.769789695739746, + "learning_rate": 9.97197004560097e-06, + "loss": 0.6612, + "step": 11986 + }, + { + "epoch": 1.5, + "grad_norm": 48.7542610168457, + "learning_rate": 9.971133330544284e-06, + "loss": 1.4747, + "step": 11987 + }, + { + "epoch": 1.5, + "grad_norm": 24.739336013793945, + "learning_rate": 9.970296615487596e-06, + "loss": 0.6906, + "step": 11988 + }, + { + "epoch": 1.5, + "grad_norm": 15.494034767150879, + "learning_rate": 9.969459900430908e-06, + "loss": 0.6135, + "step": 11989 + }, + { + "epoch": 1.5, + "grad_norm": 13.59770393371582, + "learning_rate": 9.968623185374222e-06, + "loss": 1.0242, + "step": 11990 + }, + { + "epoch": 1.5, + "grad_norm": 17.987462997436523, + "learning_rate": 9.967786470317534e-06, + "loss": 1.5997, + "step": 11991 + }, + { + "epoch": 1.5, + "grad_norm": 13.37804126739502, + "learning_rate": 9.966949755260846e-06, + "loss": 0.8806, + "step": 11992 + }, + { + "epoch": 1.51, + "grad_norm": 10.933453559875488, + "learning_rate": 9.96611304020416e-06, + "loss": 0.7462, + "step": 11993 + }, + { + "epoch": 1.51, + "grad_norm": 8.850107192993164, + "learning_rate": 9.965276325147472e-06, + "loss": 0.9161, + "step": 11994 + }, + { + "epoch": 1.51, + "grad_norm": 10.267704963684082, + "learning_rate": 9.964439610090785e-06, + "loss": 1.1713, + "step": 11995 + }, + { + "epoch": 1.51, + "grad_norm": 9.207832336425781, + "learning_rate": 9.963602895034097e-06, + "loss": 0.4839, + "step": 11996 + }, + { + "epoch": 1.51, + "grad_norm": 16.171329498291016, + "learning_rate": 9.96276617997741e-06, + "loss": 2.0192, + "step": 11997 + }, + { + "epoch": 1.51, + "grad_norm": 16.221424102783203, + "learning_rate": 9.961929464920723e-06, + "loss": 0.9424, + "step": 11998 + }, + { + "epoch": 1.51, + "grad_norm": 28.946025848388672, + "learning_rate": 9.961092749864035e-06, + "loss": 1.7608, + "step": 11999 + }, + { + "epoch": 1.51, + "grad_norm": 6.948938369750977, + "learning_rate": 9.960256034807347e-06, + "loss": 0.6615, + "step": 12000 + }, + { + "epoch": 1.51, + "eval_loss": 0.08853369951248169, + "eval_runtime": 95.5204, + "eval_samples_per_second": 37.081, + "eval_steps_per_second": 37.081, + "step": 12000 + }, + { + "epoch": 1.51, + "grad_norm": 8.19046401977539, + "learning_rate": 9.95941931975066e-06, + "loss": 2.1622, + "step": 12001 + }, + { + "epoch": 1.51, + "grad_norm": 11.977171897888184, + "learning_rate": 9.958582604693972e-06, + "loss": 1.024, + "step": 12002 + }, + { + "epoch": 1.51, + "grad_norm": 13.06263542175293, + "learning_rate": 9.957745889637284e-06, + "loss": 0.3926, + "step": 12003 + }, + { + "epoch": 1.51, + "grad_norm": 22.671180725097656, + "learning_rate": 9.956909174580598e-06, + "loss": 1.0962, + "step": 12004 + }, + { + "epoch": 1.51, + "grad_norm": 23.219268798828125, + "learning_rate": 9.95607245952391e-06, + "loss": 1.1272, + "step": 12005 + }, + { + "epoch": 1.51, + "grad_norm": 14.273165702819824, + "learning_rate": 9.955235744467222e-06, + "loss": 1.0359, + "step": 12006 + }, + { + "epoch": 1.51, + "grad_norm": 12.979179382324219, + "learning_rate": 9.954399029410535e-06, + "loss": 1.2394, + "step": 12007 + }, + { + "epoch": 1.51, + "grad_norm": 22.269634246826172, + "learning_rate": 9.953562314353847e-06, + "loss": 1.9392, + "step": 12008 + }, + { + "epoch": 1.51, + "grad_norm": 21.703006744384766, + "learning_rate": 9.952725599297161e-06, + "loss": 2.0548, + "step": 12009 + }, + { + "epoch": 1.51, + "grad_norm": 16.14089584350586, + "learning_rate": 9.951888884240473e-06, + "loss": 0.9636, + "step": 12010 + }, + { + "epoch": 1.51, + "grad_norm": 17.844032287597656, + "learning_rate": 9.951052169183787e-06, + "loss": 1.3561, + "step": 12011 + }, + { + "epoch": 1.51, + "grad_norm": 4.559155464172363, + "learning_rate": 9.950215454127099e-06, + "loss": 0.1477, + "step": 12012 + }, + { + "epoch": 1.51, + "grad_norm": 10.72029972076416, + "learning_rate": 9.94937873907041e-06, + "loss": 0.7245, + "step": 12013 + }, + { + "epoch": 1.51, + "grad_norm": 11.713088035583496, + "learning_rate": 9.948542024013722e-06, + "loss": 1.2342, + "step": 12014 + }, + { + "epoch": 1.51, + "grad_norm": 15.37183666229248, + "learning_rate": 9.947705308957034e-06, + "loss": 1.161, + "step": 12015 + }, + { + "epoch": 1.51, + "grad_norm": 11.082987785339355, + "learning_rate": 9.946868593900348e-06, + "loss": 1.2575, + "step": 12016 + }, + { + "epoch": 1.51, + "grad_norm": 16.346601486206055, + "learning_rate": 9.94603187884366e-06, + "loss": 1.1751, + "step": 12017 + }, + { + "epoch": 1.51, + "grad_norm": 8.998777389526367, + "learning_rate": 9.945195163786974e-06, + "loss": 2.0404, + "step": 12018 + }, + { + "epoch": 1.51, + "grad_norm": 8.834567070007324, + "learning_rate": 9.944358448730286e-06, + "loss": 1.4698, + "step": 12019 + }, + { + "epoch": 1.51, + "grad_norm": 37.00133514404297, + "learning_rate": 9.943521733673598e-06, + "loss": 2.067, + "step": 12020 + }, + { + "epoch": 1.51, + "grad_norm": 13.402094841003418, + "learning_rate": 9.942685018616911e-06, + "loss": 0.5756, + "step": 12021 + }, + { + "epoch": 1.51, + "grad_norm": 14.236288070678711, + "learning_rate": 9.941848303560223e-06, + "loss": 1.0936, + "step": 12022 + }, + { + "epoch": 1.51, + "grad_norm": 10.448648452758789, + "learning_rate": 9.941011588503537e-06, + "loss": 1.0993, + "step": 12023 + }, + { + "epoch": 1.51, + "grad_norm": 11.812121391296387, + "learning_rate": 9.940174873446849e-06, + "loss": 0.9132, + "step": 12024 + }, + { + "epoch": 1.51, + "grad_norm": 59.94866943359375, + "learning_rate": 9.939338158390162e-06, + "loss": 1.2881, + "step": 12025 + }, + { + "epoch": 1.51, + "grad_norm": 8.625561714172363, + "learning_rate": 9.938501443333474e-06, + "loss": 0.7345, + "step": 12026 + }, + { + "epoch": 1.51, + "grad_norm": 16.29551124572754, + "learning_rate": 9.937664728276786e-06, + "loss": 0.7359, + "step": 12027 + }, + { + "epoch": 1.51, + "grad_norm": 7.665041446685791, + "learning_rate": 9.936828013220098e-06, + "loss": 0.733, + "step": 12028 + }, + { + "epoch": 1.51, + "grad_norm": 11.574849128723145, + "learning_rate": 9.93599129816341e-06, + "loss": 1.3198, + "step": 12029 + }, + { + "epoch": 1.51, + "grad_norm": 6.037437438964844, + "learning_rate": 9.935154583106724e-06, + "loss": 1.5453, + "step": 12030 + }, + { + "epoch": 1.51, + "grad_norm": 14.004814147949219, + "learning_rate": 9.934317868050036e-06, + "loss": 0.5832, + "step": 12031 + }, + { + "epoch": 1.51, + "grad_norm": 15.221983909606934, + "learning_rate": 9.93348115299335e-06, + "loss": 0.9652, + "step": 12032 + }, + { + "epoch": 1.51, + "grad_norm": 5.101906776428223, + "learning_rate": 9.932644437936661e-06, + "loss": 0.4963, + "step": 12033 + }, + { + "epoch": 1.51, + "grad_norm": 10.096247673034668, + "learning_rate": 9.931807722879973e-06, + "loss": 0.887, + "step": 12034 + }, + { + "epoch": 1.51, + "grad_norm": 111.1515884399414, + "learning_rate": 9.930971007823287e-06, + "loss": 4.7195, + "step": 12035 + }, + { + "epoch": 1.51, + "grad_norm": 21.20841407775879, + "learning_rate": 9.930134292766599e-06, + "loss": 1.7936, + "step": 12036 + }, + { + "epoch": 1.51, + "grad_norm": 27.25119400024414, + "learning_rate": 9.929297577709913e-06, + "loss": 1.3645, + "step": 12037 + }, + { + "epoch": 1.51, + "grad_norm": 52.72514343261719, + "learning_rate": 9.928460862653225e-06, + "loss": 0.5084, + "step": 12038 + }, + { + "epoch": 1.51, + "grad_norm": 6.796664714813232, + "learning_rate": 9.927624147596537e-06, + "loss": 0.2302, + "step": 12039 + }, + { + "epoch": 1.51, + "grad_norm": 13.538002967834473, + "learning_rate": 9.92678743253985e-06, + "loss": 0.735, + "step": 12040 + }, + { + "epoch": 1.51, + "grad_norm": 68.86786651611328, + "learning_rate": 9.925950717483162e-06, + "loss": 3.0572, + "step": 12041 + }, + { + "epoch": 1.51, + "grad_norm": 13.591713905334473, + "learning_rate": 9.925114002426474e-06, + "loss": 2.0597, + "step": 12042 + }, + { + "epoch": 1.51, + "grad_norm": 4.568581581115723, + "learning_rate": 9.924277287369786e-06, + "loss": 1.7249, + "step": 12043 + }, + { + "epoch": 1.51, + "grad_norm": 11.677054405212402, + "learning_rate": 9.9234405723131e-06, + "loss": 1.9962, + "step": 12044 + }, + { + "epoch": 1.51, + "grad_norm": 7.531463623046875, + "learning_rate": 9.922603857256412e-06, + "loss": 0.4709, + "step": 12045 + }, + { + "epoch": 1.51, + "grad_norm": 17.33578872680664, + "learning_rate": 9.921767142199724e-06, + "loss": 0.9133, + "step": 12046 + }, + { + "epoch": 1.51, + "grad_norm": 14.018386840820312, + "learning_rate": 9.920930427143037e-06, + "loss": 1.7442, + "step": 12047 + }, + { + "epoch": 1.51, + "grad_norm": 16.10979652404785, + "learning_rate": 9.92009371208635e-06, + "loss": 0.998, + "step": 12048 + }, + { + "epoch": 1.51, + "grad_norm": 29.893558502197266, + "learning_rate": 9.919256997029663e-06, + "loss": 2.026, + "step": 12049 + }, + { + "epoch": 1.51, + "grad_norm": 8.823445320129395, + "learning_rate": 9.918420281972975e-06, + "loss": 1.3437, + "step": 12050 + }, + { + "epoch": 1.51, + "grad_norm": 13.906524658203125, + "learning_rate": 9.917583566916289e-06, + "loss": 1.0415, + "step": 12051 + }, + { + "epoch": 1.51, + "grad_norm": 5.505490303039551, + "learning_rate": 9.9167468518596e-06, + "loss": 1.6757, + "step": 12052 + }, + { + "epoch": 1.51, + "grad_norm": 59.318851470947266, + "learning_rate": 9.915910136802912e-06, + "loss": 0.7066, + "step": 12053 + }, + { + "epoch": 1.51, + "grad_norm": 12.240217208862305, + "learning_rate": 9.915073421746226e-06, + "loss": 0.3382, + "step": 12054 + }, + { + "epoch": 1.51, + "grad_norm": 30.698877334594727, + "learning_rate": 9.914236706689538e-06, + "loss": 1.1438, + "step": 12055 + }, + { + "epoch": 1.51, + "grad_norm": 17.633760452270508, + "learning_rate": 9.91339999163285e-06, + "loss": 1.299, + "step": 12056 + }, + { + "epoch": 1.51, + "grad_norm": 13.139379501342773, + "learning_rate": 9.912563276576162e-06, + "loss": 0.7885, + "step": 12057 + }, + { + "epoch": 1.51, + "grad_norm": 18.261993408203125, + "learning_rate": 9.911726561519476e-06, + "loss": 2.2095, + "step": 12058 + }, + { + "epoch": 1.51, + "grad_norm": 9.138144493103027, + "learning_rate": 9.910889846462788e-06, + "loss": 1.2991, + "step": 12059 + }, + { + "epoch": 1.51, + "grad_norm": 35.964271545410156, + "learning_rate": 9.9100531314061e-06, + "loss": 1.6079, + "step": 12060 + }, + { + "epoch": 1.51, + "grad_norm": 16.519386291503906, + "learning_rate": 9.909216416349413e-06, + "loss": 2.1746, + "step": 12061 + }, + { + "epoch": 1.51, + "grad_norm": 46.806793212890625, + "learning_rate": 9.908379701292725e-06, + "loss": 2.4845, + "step": 12062 + }, + { + "epoch": 1.51, + "grad_norm": 11.36434555053711, + "learning_rate": 9.907542986236039e-06, + "loss": 0.4807, + "step": 12063 + }, + { + "epoch": 1.51, + "grad_norm": 15.425649642944336, + "learning_rate": 9.90670627117935e-06, + "loss": 2.0082, + "step": 12064 + }, + { + "epoch": 1.51, + "grad_norm": 14.873329162597656, + "learning_rate": 9.905869556122664e-06, + "loss": 0.9925, + "step": 12065 + }, + { + "epoch": 1.51, + "grad_norm": 5.44581413269043, + "learning_rate": 9.905032841065976e-06, + "loss": 0.3184, + "step": 12066 + }, + { + "epoch": 1.51, + "grad_norm": 24.377872467041016, + "learning_rate": 9.904196126009288e-06, + "loss": 1.3422, + "step": 12067 + }, + { + "epoch": 1.51, + "grad_norm": 16.013059616088867, + "learning_rate": 9.9033594109526e-06, + "loss": 1.8209, + "step": 12068 + }, + { + "epoch": 1.51, + "grad_norm": 7.558819770812988, + "learning_rate": 9.902522695895914e-06, + "loss": 1.0303, + "step": 12069 + }, + { + "epoch": 1.51, + "grad_norm": 38.249698638916016, + "learning_rate": 9.901685980839226e-06, + "loss": 1.2263, + "step": 12070 + }, + { + "epoch": 1.51, + "grad_norm": 5.480305194854736, + "learning_rate": 9.900849265782538e-06, + "loss": 0.4937, + "step": 12071 + }, + { + "epoch": 1.52, + "grad_norm": 16.048473358154297, + "learning_rate": 9.900012550725851e-06, + "loss": 2.5661, + "step": 12072 + }, + { + "epoch": 1.52, + "grad_norm": 16.503463745117188, + "learning_rate": 9.899175835669163e-06, + "loss": 1.272, + "step": 12073 + }, + { + "epoch": 1.52, + "grad_norm": 10.952848434448242, + "learning_rate": 9.898339120612475e-06, + "loss": 0.7935, + "step": 12074 + }, + { + "epoch": 1.52, + "grad_norm": 89.3608627319336, + "learning_rate": 9.897502405555789e-06, + "loss": 1.0207, + "step": 12075 + }, + { + "epoch": 1.52, + "grad_norm": 17.02195167541504, + "learning_rate": 9.896665690499101e-06, + "loss": 1.5251, + "step": 12076 + }, + { + "epoch": 1.52, + "grad_norm": 4.599850177764893, + "learning_rate": 9.895828975442415e-06, + "loss": 0.1726, + "step": 12077 + }, + { + "epoch": 1.52, + "grad_norm": 21.801647186279297, + "learning_rate": 9.894992260385727e-06, + "loss": 1.5907, + "step": 12078 + }, + { + "epoch": 1.52, + "grad_norm": 11.036595344543457, + "learning_rate": 9.89415554532904e-06, + "loss": 0.6715, + "step": 12079 + }, + { + "epoch": 1.52, + "grad_norm": 42.203365325927734, + "learning_rate": 9.893318830272352e-06, + "loss": 1.5699, + "step": 12080 + }, + { + "epoch": 1.52, + "grad_norm": 9.643888473510742, + "learning_rate": 9.892482115215664e-06, + "loss": 1.0253, + "step": 12081 + }, + { + "epoch": 1.52, + "grad_norm": 9.382494926452637, + "learning_rate": 9.891645400158976e-06, + "loss": 0.5185, + "step": 12082 + }, + { + "epoch": 1.52, + "grad_norm": 8.488576889038086, + "learning_rate": 9.89080868510229e-06, + "loss": 0.8755, + "step": 12083 + }, + { + "epoch": 1.52, + "grad_norm": 20.551118850708008, + "learning_rate": 9.889971970045602e-06, + "loss": 3.2268, + "step": 12084 + }, + { + "epoch": 1.52, + "grad_norm": 18.992177963256836, + "learning_rate": 9.889135254988914e-06, + "loss": 1.0521, + "step": 12085 + }, + { + "epoch": 1.52, + "grad_norm": 11.187843322753906, + "learning_rate": 9.888298539932227e-06, + "loss": 1.2365, + "step": 12086 + }, + { + "epoch": 1.52, + "grad_norm": 9.308290481567383, + "learning_rate": 9.88746182487554e-06, + "loss": 0.4318, + "step": 12087 + }, + { + "epoch": 1.52, + "grad_norm": 1.222276210784912, + "learning_rate": 9.886625109818851e-06, + "loss": 0.0171, + "step": 12088 + }, + { + "epoch": 1.52, + "grad_norm": 98.22897338867188, + "learning_rate": 9.885788394762165e-06, + "loss": 0.9283, + "step": 12089 + }, + { + "epoch": 1.52, + "grad_norm": 9.085124969482422, + "learning_rate": 9.884951679705477e-06, + "loss": 0.4415, + "step": 12090 + }, + { + "epoch": 1.52, + "grad_norm": 33.56439208984375, + "learning_rate": 9.88411496464879e-06, + "loss": 2.3724, + "step": 12091 + }, + { + "epoch": 1.52, + "grad_norm": 3.0026347637176514, + "learning_rate": 9.883278249592102e-06, + "loss": 0.0828, + "step": 12092 + }, + { + "epoch": 1.52, + "grad_norm": 109.67359161376953, + "learning_rate": 9.882441534535416e-06, + "loss": 1.0562, + "step": 12093 + }, + { + "epoch": 1.52, + "grad_norm": 8.376280784606934, + "learning_rate": 9.881604819478728e-06, + "loss": 1.3205, + "step": 12094 + }, + { + "epoch": 1.52, + "grad_norm": 11.192483901977539, + "learning_rate": 9.88076810442204e-06, + "loss": 0.6056, + "step": 12095 + }, + { + "epoch": 1.52, + "grad_norm": 75.01764678955078, + "learning_rate": 9.879931389365352e-06, + "loss": 1.1883, + "step": 12096 + }, + { + "epoch": 1.52, + "grad_norm": 11.584369659423828, + "learning_rate": 9.879094674308664e-06, + "loss": 1.0283, + "step": 12097 + }, + { + "epoch": 1.52, + "grad_norm": 27.986286163330078, + "learning_rate": 9.878257959251977e-06, + "loss": 2.203, + "step": 12098 + }, + { + "epoch": 1.52, + "grad_norm": 14.805858612060547, + "learning_rate": 9.87742124419529e-06, + "loss": 0.8525, + "step": 12099 + }, + { + "epoch": 1.52, + "grad_norm": 18.489295959472656, + "learning_rate": 9.876584529138603e-06, + "loss": 1.4147, + "step": 12100 + }, + { + "epoch": 1.52, + "grad_norm": 6.825009822845459, + "learning_rate": 9.875747814081915e-06, + "loss": 0.8631, + "step": 12101 + }, + { + "epoch": 1.52, + "grad_norm": 8.646465301513672, + "learning_rate": 9.874911099025227e-06, + "loss": 0.8423, + "step": 12102 + }, + { + "epoch": 1.52, + "grad_norm": 3.3560781478881836, + "learning_rate": 9.87407438396854e-06, + "loss": 0.1448, + "step": 12103 + }, + { + "epoch": 1.52, + "grad_norm": 28.39167022705078, + "learning_rate": 9.873237668911853e-06, + "loss": 0.6005, + "step": 12104 + }, + { + "epoch": 1.52, + "grad_norm": 6.240129470825195, + "learning_rate": 9.872400953855166e-06, + "loss": 0.9824, + "step": 12105 + }, + { + "epoch": 1.52, + "grad_norm": 47.242679595947266, + "learning_rate": 9.871564238798478e-06, + "loss": 0.8535, + "step": 12106 + }, + { + "epoch": 1.52, + "grad_norm": 35.06230545043945, + "learning_rate": 9.87072752374179e-06, + "loss": 1.4405, + "step": 12107 + }, + { + "epoch": 1.52, + "grad_norm": 12.651790618896484, + "learning_rate": 9.869890808685104e-06, + "loss": 0.5753, + "step": 12108 + }, + { + "epoch": 1.52, + "grad_norm": 11.116722106933594, + "learning_rate": 9.869054093628416e-06, + "loss": 0.5279, + "step": 12109 + }, + { + "epoch": 1.52, + "grad_norm": 14.222530364990234, + "learning_rate": 9.868217378571728e-06, + "loss": 0.3965, + "step": 12110 + }, + { + "epoch": 1.52, + "grad_norm": 22.473535537719727, + "learning_rate": 9.86738066351504e-06, + "loss": 0.8926, + "step": 12111 + }, + { + "epoch": 1.52, + "grad_norm": 27.788347244262695, + "learning_rate": 9.866543948458353e-06, + "loss": 1.889, + "step": 12112 + }, + { + "epoch": 1.52, + "grad_norm": 12.01479721069336, + "learning_rate": 9.865707233401665e-06, + "loss": 0.5314, + "step": 12113 + }, + { + "epoch": 1.52, + "grad_norm": 9.29796314239502, + "learning_rate": 9.864870518344979e-06, + "loss": 1.585, + "step": 12114 + }, + { + "epoch": 1.52, + "grad_norm": 13.794572830200195, + "learning_rate": 9.864033803288291e-06, + "loss": 2.1457, + "step": 12115 + }, + { + "epoch": 1.52, + "grad_norm": 10.506523132324219, + "learning_rate": 9.863197088231603e-06, + "loss": 1.245, + "step": 12116 + }, + { + "epoch": 1.52, + "grad_norm": 9.19401741027832, + "learning_rate": 9.862360373174916e-06, + "loss": 0.4909, + "step": 12117 + }, + { + "epoch": 1.52, + "grad_norm": 17.951099395751953, + "learning_rate": 9.861523658118228e-06, + "loss": 1.224, + "step": 12118 + }, + { + "epoch": 1.52, + "grad_norm": 7.209202289581299, + "learning_rate": 9.860686943061542e-06, + "loss": 2.3347, + "step": 12119 + }, + { + "epoch": 1.52, + "grad_norm": 25.041492462158203, + "learning_rate": 9.859850228004854e-06, + "loss": 0.6295, + "step": 12120 + }, + { + "epoch": 1.52, + "grad_norm": 17.921358108520508, + "learning_rate": 9.859013512948166e-06, + "loss": 2.9414, + "step": 12121 + }, + { + "epoch": 1.52, + "grad_norm": 24.798927307128906, + "learning_rate": 9.85817679789148e-06, + "loss": 1.6251, + "step": 12122 + }, + { + "epoch": 1.52, + "grad_norm": 23.21161460876465, + "learning_rate": 9.857340082834792e-06, + "loss": 1.481, + "step": 12123 + }, + { + "epoch": 1.52, + "grad_norm": 16.68880844116211, + "learning_rate": 9.856503367778104e-06, + "loss": 2.7578, + "step": 12124 + }, + { + "epoch": 1.52, + "grad_norm": 21.62554931640625, + "learning_rate": 9.855666652721416e-06, + "loss": 1.2452, + "step": 12125 + }, + { + "epoch": 1.52, + "grad_norm": 44.3754997253418, + "learning_rate": 9.85482993766473e-06, + "loss": 1.2022, + "step": 12126 + }, + { + "epoch": 1.52, + "grad_norm": 15.518648147583008, + "learning_rate": 9.853993222608041e-06, + "loss": 1.9063, + "step": 12127 + }, + { + "epoch": 1.52, + "grad_norm": 12.686768531799316, + "learning_rate": 9.853156507551355e-06, + "loss": 2.1749, + "step": 12128 + }, + { + "epoch": 1.52, + "grad_norm": 19.951292037963867, + "learning_rate": 9.852319792494667e-06, + "loss": 1.0317, + "step": 12129 + }, + { + "epoch": 1.52, + "grad_norm": 12.443441390991211, + "learning_rate": 9.851483077437979e-06, + "loss": 0.7286, + "step": 12130 + }, + { + "epoch": 1.52, + "grad_norm": 20.19542694091797, + "learning_rate": 9.850646362381292e-06, + "loss": 1.6099, + "step": 12131 + }, + { + "epoch": 1.52, + "grad_norm": 9.66685962677002, + "learning_rate": 9.849809647324604e-06, + "loss": 1.0508, + "step": 12132 + }, + { + "epoch": 1.52, + "grad_norm": 39.90351867675781, + "learning_rate": 9.848972932267918e-06, + "loss": 1.2983, + "step": 12133 + }, + { + "epoch": 1.52, + "grad_norm": 17.983383178710938, + "learning_rate": 9.84813621721123e-06, + "loss": 0.6442, + "step": 12134 + }, + { + "epoch": 1.52, + "grad_norm": 20.91132926940918, + "learning_rate": 9.847299502154542e-06, + "loss": 2.1058, + "step": 12135 + }, + { + "epoch": 1.52, + "grad_norm": 34.87514877319336, + "learning_rate": 9.846462787097855e-06, + "loss": 3.0117, + "step": 12136 + }, + { + "epoch": 1.52, + "grad_norm": 17.203081130981445, + "learning_rate": 9.845626072041167e-06, + "loss": 0.8342, + "step": 12137 + }, + { + "epoch": 1.52, + "grad_norm": 94.10826110839844, + "learning_rate": 9.84478935698448e-06, + "loss": 1.5379, + "step": 12138 + }, + { + "epoch": 1.52, + "grad_norm": 61.7381706237793, + "learning_rate": 9.843952641927791e-06, + "loss": 2.9657, + "step": 12139 + }, + { + "epoch": 1.52, + "grad_norm": 62.37911605834961, + "learning_rate": 9.843115926871105e-06, + "loss": 1.5554, + "step": 12140 + }, + { + "epoch": 1.52, + "grad_norm": 2.827415943145752, + "learning_rate": 9.842279211814417e-06, + "loss": 0.3588, + "step": 12141 + }, + { + "epoch": 1.52, + "grad_norm": 55.246517181396484, + "learning_rate": 9.84144249675773e-06, + "loss": 1.7747, + "step": 12142 + }, + { + "epoch": 1.52, + "grad_norm": 12.423073768615723, + "learning_rate": 9.840605781701043e-06, + "loss": 0.5489, + "step": 12143 + }, + { + "epoch": 1.52, + "grad_norm": 72.16214752197266, + "learning_rate": 9.839769066644355e-06, + "loss": 2.5331, + "step": 12144 + }, + { + "epoch": 1.52, + "grad_norm": 11.032279968261719, + "learning_rate": 9.838932351587668e-06, + "loss": 0.7265, + "step": 12145 + }, + { + "epoch": 1.52, + "grad_norm": 24.480714797973633, + "learning_rate": 9.83809563653098e-06, + "loss": 1.3422, + "step": 12146 + }, + { + "epoch": 1.52, + "grad_norm": 7.498961448669434, + "learning_rate": 9.837258921474294e-06, + "loss": 0.615, + "step": 12147 + }, + { + "epoch": 1.52, + "grad_norm": 1.6980987787246704, + "learning_rate": 9.836422206417606e-06, + "loss": 0.0274, + "step": 12148 + }, + { + "epoch": 1.52, + "grad_norm": 40.88421630859375, + "learning_rate": 9.835585491360918e-06, + "loss": 1.3161, + "step": 12149 + }, + { + "epoch": 1.52, + "grad_norm": 11.074402809143066, + "learning_rate": 9.83474877630423e-06, + "loss": 1.012, + "step": 12150 + }, + { + "epoch": 1.52, + "grad_norm": 11.468755722045898, + "learning_rate": 9.833912061247543e-06, + "loss": 0.818, + "step": 12151 + }, + { + "epoch": 1.53, + "grad_norm": 22.78920555114746, + "learning_rate": 9.833075346190855e-06, + "loss": 1.5699, + "step": 12152 + }, + { + "epoch": 1.53, + "grad_norm": 6.2296528816223145, + "learning_rate": 9.832238631134167e-06, + "loss": 0.2785, + "step": 12153 + }, + { + "epoch": 1.53, + "grad_norm": 33.91434097290039, + "learning_rate": 9.83140191607748e-06, + "loss": 1.6412, + "step": 12154 + }, + { + "epoch": 1.53, + "grad_norm": 9.941302299499512, + "learning_rate": 9.830565201020793e-06, + "loss": 1.4619, + "step": 12155 + }, + { + "epoch": 1.53, + "grad_norm": 8.366573333740234, + "learning_rate": 9.829728485964106e-06, + "loss": 0.5372, + "step": 12156 + }, + { + "epoch": 1.53, + "grad_norm": 7.877475261688232, + "learning_rate": 9.828891770907418e-06, + "loss": 1.6623, + "step": 12157 + }, + { + "epoch": 1.53, + "grad_norm": 6.994818687438965, + "learning_rate": 9.82805505585073e-06, + "loss": 1.0525, + "step": 12158 + }, + { + "epoch": 1.53, + "grad_norm": 16.33742904663086, + "learning_rate": 9.827218340794044e-06, + "loss": 1.203, + "step": 12159 + }, + { + "epoch": 1.53, + "grad_norm": 9.728849411010742, + "learning_rate": 9.826381625737356e-06, + "loss": 1.2409, + "step": 12160 + }, + { + "epoch": 1.53, + "grad_norm": 8.891305923461914, + "learning_rate": 9.82554491068067e-06, + "loss": 0.1906, + "step": 12161 + }, + { + "epoch": 1.53, + "grad_norm": 17.807693481445312, + "learning_rate": 9.824708195623982e-06, + "loss": 1.5453, + "step": 12162 + }, + { + "epoch": 1.53, + "grad_norm": 18.572694778442383, + "learning_rate": 9.823871480567294e-06, + "loss": 1.3513, + "step": 12163 + }, + { + "epoch": 1.53, + "grad_norm": 12.814281463623047, + "learning_rate": 9.823034765510605e-06, + "loss": 1.9288, + "step": 12164 + }, + { + "epoch": 1.53, + "grad_norm": 10.732547760009766, + "learning_rate": 9.822198050453919e-06, + "loss": 0.8764, + "step": 12165 + }, + { + "epoch": 1.53, + "grad_norm": 23.086669921875, + "learning_rate": 9.821361335397231e-06, + "loss": 0.7889, + "step": 12166 + }, + { + "epoch": 1.53, + "grad_norm": 6.248536586761475, + "learning_rate": 9.820524620340543e-06, + "loss": 0.1673, + "step": 12167 + }, + { + "epoch": 1.53, + "grad_norm": 13.271327018737793, + "learning_rate": 9.819687905283857e-06, + "loss": 0.4454, + "step": 12168 + }, + { + "epoch": 1.53, + "grad_norm": 37.87796401977539, + "learning_rate": 9.818851190227169e-06, + "loss": 1.4341, + "step": 12169 + }, + { + "epoch": 1.53, + "grad_norm": 17.368616104125977, + "learning_rate": 9.818014475170482e-06, + "loss": 1.4188, + "step": 12170 + }, + { + "epoch": 1.53, + "grad_norm": 20.290699005126953, + "learning_rate": 9.817177760113794e-06, + "loss": 0.4338, + "step": 12171 + }, + { + "epoch": 1.53, + "grad_norm": 14.502118110656738, + "learning_rate": 9.816341045057106e-06, + "loss": 1.9969, + "step": 12172 + }, + { + "epoch": 1.53, + "grad_norm": 14.237765312194824, + "learning_rate": 9.81550433000042e-06, + "loss": 1.4782, + "step": 12173 + }, + { + "epoch": 1.53, + "grad_norm": 20.91703987121582, + "learning_rate": 9.814667614943732e-06, + "loss": 0.6778, + "step": 12174 + }, + { + "epoch": 1.53, + "grad_norm": 21.437503814697266, + "learning_rate": 9.813830899887045e-06, + "loss": 1.3012, + "step": 12175 + }, + { + "epoch": 1.53, + "grad_norm": 12.240067481994629, + "learning_rate": 9.812994184830357e-06, + "loss": 2.2071, + "step": 12176 + }, + { + "epoch": 1.53, + "grad_norm": 6.719193935394287, + "learning_rate": 9.81215746977367e-06, + "loss": 0.2437, + "step": 12177 + }, + { + "epoch": 1.53, + "grad_norm": 36.355289459228516, + "learning_rate": 9.811320754716981e-06, + "loss": 0.8935, + "step": 12178 + }, + { + "epoch": 1.53, + "grad_norm": 17.33690643310547, + "learning_rate": 9.810484039660293e-06, + "loss": 0.8497, + "step": 12179 + }, + { + "epoch": 1.53, + "grad_norm": 10.777118682861328, + "learning_rate": 9.809647324603607e-06, + "loss": 1.6085, + "step": 12180 + }, + { + "epoch": 1.53, + "grad_norm": 41.65799331665039, + "learning_rate": 9.808810609546919e-06, + "loss": 1.9959, + "step": 12181 + }, + { + "epoch": 1.53, + "grad_norm": 69.19049835205078, + "learning_rate": 9.807973894490233e-06, + "loss": 1.7897, + "step": 12182 + }, + { + "epoch": 1.53, + "grad_norm": 8.965083122253418, + "learning_rate": 9.807137179433544e-06, + "loss": 1.1403, + "step": 12183 + }, + { + "epoch": 1.53, + "grad_norm": 20.19078254699707, + "learning_rate": 9.806300464376858e-06, + "loss": 1.2492, + "step": 12184 + }, + { + "epoch": 1.53, + "grad_norm": 25.6923885345459, + "learning_rate": 9.80546374932017e-06, + "loss": 0.946, + "step": 12185 + }, + { + "epoch": 1.53, + "grad_norm": 140.13514709472656, + "learning_rate": 9.804627034263482e-06, + "loss": 1.6308, + "step": 12186 + }, + { + "epoch": 1.53, + "grad_norm": 44.56431579589844, + "learning_rate": 9.803790319206796e-06, + "loss": 1.266, + "step": 12187 + }, + { + "epoch": 1.53, + "grad_norm": 8.407170295715332, + "learning_rate": 9.802953604150108e-06, + "loss": 0.8196, + "step": 12188 + }, + { + "epoch": 1.53, + "grad_norm": 19.636802673339844, + "learning_rate": 9.80211688909342e-06, + "loss": 1.4294, + "step": 12189 + }, + { + "epoch": 1.53, + "grad_norm": 203.92636108398438, + "learning_rate": 9.801280174036733e-06, + "loss": 2.3192, + "step": 12190 + }, + { + "epoch": 1.53, + "grad_norm": 7.777228832244873, + "learning_rate": 9.800443458980045e-06, + "loss": 0.907, + "step": 12191 + }, + { + "epoch": 1.53, + "grad_norm": 10.198338508605957, + "learning_rate": 9.799606743923357e-06, + "loss": 0.3146, + "step": 12192 + }, + { + "epoch": 1.53, + "grad_norm": 28.228805541992188, + "learning_rate": 9.798770028866669e-06, + "loss": 2.2984, + "step": 12193 + }, + { + "epoch": 1.53, + "grad_norm": 8.053431510925293, + "learning_rate": 9.797933313809983e-06, + "loss": 0.3273, + "step": 12194 + }, + { + "epoch": 1.53, + "grad_norm": 14.494138717651367, + "learning_rate": 9.797096598753295e-06, + "loss": 2.056, + "step": 12195 + }, + { + "epoch": 1.53, + "grad_norm": 15.551849365234375, + "learning_rate": 9.796259883696608e-06, + "loss": 1.273, + "step": 12196 + }, + { + "epoch": 1.53, + "grad_norm": 9.078676223754883, + "learning_rate": 9.79542316863992e-06, + "loss": 1.6286, + "step": 12197 + }, + { + "epoch": 1.53, + "grad_norm": 22.557003021240234, + "learning_rate": 9.794586453583234e-06, + "loss": 0.9189, + "step": 12198 + }, + { + "epoch": 1.53, + "grad_norm": 9.545269012451172, + "learning_rate": 9.793749738526546e-06, + "loss": 0.9631, + "step": 12199 + }, + { + "epoch": 1.53, + "grad_norm": 12.034403800964355, + "learning_rate": 9.792913023469858e-06, + "loss": 0.7735, + "step": 12200 + }, + { + "epoch": 1.53, + "grad_norm": 73.09227752685547, + "learning_rate": 9.792076308413172e-06, + "loss": 1.7054, + "step": 12201 + }, + { + "epoch": 1.53, + "grad_norm": 40.55620193481445, + "learning_rate": 9.791239593356483e-06, + "loss": 1.1453, + "step": 12202 + }, + { + "epoch": 1.53, + "grad_norm": 15.765918731689453, + "learning_rate": 9.790402878299795e-06, + "loss": 0.5375, + "step": 12203 + }, + { + "epoch": 1.53, + "grad_norm": 16.58881378173828, + "learning_rate": 9.789566163243109e-06, + "loss": 0.804, + "step": 12204 + }, + { + "epoch": 1.53, + "grad_norm": 4.995907783508301, + "learning_rate": 9.788729448186421e-06, + "loss": 0.2483, + "step": 12205 + }, + { + "epoch": 1.53, + "grad_norm": 10.38837718963623, + "learning_rate": 9.787892733129733e-06, + "loss": 0.9539, + "step": 12206 + }, + { + "epoch": 1.53, + "grad_norm": 16.520999908447266, + "learning_rate": 9.787056018073045e-06, + "loss": 0.8792, + "step": 12207 + }, + { + "epoch": 1.53, + "grad_norm": 20.41761589050293, + "learning_rate": 9.786219303016359e-06, + "loss": 1.134, + "step": 12208 + }, + { + "epoch": 1.53, + "grad_norm": 207.6176300048828, + "learning_rate": 9.78538258795967e-06, + "loss": 1.5874, + "step": 12209 + }, + { + "epoch": 1.53, + "grad_norm": 17.356752395629883, + "learning_rate": 9.784545872902984e-06, + "loss": 1.6989, + "step": 12210 + }, + { + "epoch": 1.53, + "grad_norm": 17.773801803588867, + "learning_rate": 9.783709157846296e-06, + "loss": 0.7458, + "step": 12211 + }, + { + "epoch": 1.53, + "grad_norm": 5.917456150054932, + "learning_rate": 9.78287244278961e-06, + "loss": 1.4179, + "step": 12212 + }, + { + "epoch": 1.53, + "grad_norm": 55.027442932128906, + "learning_rate": 9.782035727732922e-06, + "loss": 4.6883, + "step": 12213 + }, + { + "epoch": 1.53, + "grad_norm": 33.28546905517578, + "learning_rate": 9.781199012676234e-06, + "loss": 2.6266, + "step": 12214 + }, + { + "epoch": 1.53, + "grad_norm": 11.39370059967041, + "learning_rate": 9.780362297619547e-06, + "loss": 1.6386, + "step": 12215 + }, + { + "epoch": 1.53, + "grad_norm": 71.60021209716797, + "learning_rate": 9.77952558256286e-06, + "loss": 1.3342, + "step": 12216 + }, + { + "epoch": 1.53, + "grad_norm": 2.1320292949676514, + "learning_rate": 9.778688867506171e-06, + "loss": 0.0901, + "step": 12217 + }, + { + "epoch": 1.53, + "grad_norm": 36.4505729675293, + "learning_rate": 9.777852152449483e-06, + "loss": 1.1238, + "step": 12218 + }, + { + "epoch": 1.53, + "grad_norm": 11.367158889770508, + "learning_rate": 9.777015437392797e-06, + "loss": 2.6591, + "step": 12219 + }, + { + "epoch": 1.53, + "grad_norm": 16.82743263244629, + "learning_rate": 9.776178722336109e-06, + "loss": 1.5801, + "step": 12220 + }, + { + "epoch": 1.53, + "grad_norm": 11.159351348876953, + "learning_rate": 9.77534200727942e-06, + "loss": 1.8518, + "step": 12221 + }, + { + "epoch": 1.53, + "grad_norm": 14.667262077331543, + "learning_rate": 9.774505292222734e-06, + "loss": 1.4004, + "step": 12222 + }, + { + "epoch": 1.53, + "grad_norm": 7.793519496917725, + "learning_rate": 9.773668577166046e-06, + "loss": 0.5112, + "step": 12223 + }, + { + "epoch": 1.53, + "grad_norm": 4.275669574737549, + "learning_rate": 9.77283186210936e-06, + "loss": 0.5828, + "step": 12224 + }, + { + "epoch": 1.53, + "grad_norm": 38.114418029785156, + "learning_rate": 9.771995147052672e-06, + "loss": 1.5612, + "step": 12225 + }, + { + "epoch": 1.53, + "grad_norm": 6.480970859527588, + "learning_rate": 9.771158431995984e-06, + "loss": 0.2714, + "step": 12226 + }, + { + "epoch": 1.53, + "grad_norm": 32.20256805419922, + "learning_rate": 9.770321716939298e-06, + "loss": 0.5955, + "step": 12227 + }, + { + "epoch": 1.53, + "grad_norm": 10.788589477539062, + "learning_rate": 9.76948500188261e-06, + "loss": 2.9237, + "step": 12228 + }, + { + "epoch": 1.53, + "grad_norm": 20.747760772705078, + "learning_rate": 9.768648286825923e-06, + "loss": 1.2273, + "step": 12229 + }, + { + "epoch": 1.53, + "grad_norm": 12.737183570861816, + "learning_rate": 9.767811571769235e-06, + "loss": 1.8664, + "step": 12230 + }, + { + "epoch": 1.53, + "grad_norm": 10.263840675354004, + "learning_rate": 9.766974856712547e-06, + "loss": 0.4733, + "step": 12231 + }, + { + "epoch": 1.54, + "grad_norm": 22.47993278503418, + "learning_rate": 9.766138141655859e-06, + "loss": 2.0756, + "step": 12232 + }, + { + "epoch": 1.54, + "grad_norm": 15.143706321716309, + "learning_rate": 9.765301426599173e-06, + "loss": 0.8412, + "step": 12233 + }, + { + "epoch": 1.54, + "grad_norm": 7.365832805633545, + "learning_rate": 9.764464711542485e-06, + "loss": 0.2762, + "step": 12234 + }, + { + "epoch": 1.54, + "grad_norm": 8.56747055053711, + "learning_rate": 9.763627996485797e-06, + "loss": 0.4914, + "step": 12235 + }, + { + "epoch": 1.54, + "grad_norm": 56.73297882080078, + "learning_rate": 9.76279128142911e-06, + "loss": 0.9943, + "step": 12236 + }, + { + "epoch": 1.54, + "grad_norm": 2.1040303707122803, + "learning_rate": 9.761954566372422e-06, + "loss": 0.0166, + "step": 12237 + }, + { + "epoch": 1.54, + "grad_norm": 19.92567253112793, + "learning_rate": 9.761117851315736e-06, + "loss": 0.2826, + "step": 12238 + }, + { + "epoch": 1.54, + "grad_norm": 40.35111618041992, + "learning_rate": 9.760281136259048e-06, + "loss": 2.1398, + "step": 12239 + }, + { + "epoch": 1.54, + "grad_norm": 7.14793586730957, + "learning_rate": 9.75944442120236e-06, + "loss": 1.2138, + "step": 12240 + }, + { + "epoch": 1.54, + "grad_norm": 125.70104217529297, + "learning_rate": 9.758607706145673e-06, + "loss": 1.2716, + "step": 12241 + }, + { + "epoch": 1.54, + "grad_norm": 27.32240867614746, + "learning_rate": 9.757770991088985e-06, + "loss": 3.5865, + "step": 12242 + }, + { + "epoch": 1.54, + "grad_norm": 28.979501724243164, + "learning_rate": 9.756934276032299e-06, + "loss": 1.6377, + "step": 12243 + }, + { + "epoch": 1.54, + "grad_norm": 9.755829811096191, + "learning_rate": 9.756097560975611e-06, + "loss": 1.2625, + "step": 12244 + }, + { + "epoch": 1.54, + "grad_norm": 15.504048347473145, + "learning_rate": 9.755260845918923e-06, + "loss": 1.3819, + "step": 12245 + }, + { + "epoch": 1.54, + "grad_norm": 13.636133193969727, + "learning_rate": 9.754424130862235e-06, + "loss": 1.5915, + "step": 12246 + }, + { + "epoch": 1.54, + "grad_norm": 9.099166870117188, + "learning_rate": 9.753587415805549e-06, + "loss": 1.1964, + "step": 12247 + }, + { + "epoch": 1.54, + "grad_norm": 18.510513305664062, + "learning_rate": 9.75275070074886e-06, + "loss": 1.5004, + "step": 12248 + }, + { + "epoch": 1.54, + "grad_norm": 32.700531005859375, + "learning_rate": 9.751913985692172e-06, + "loss": 1.1932, + "step": 12249 + }, + { + "epoch": 1.54, + "grad_norm": 18.577503204345703, + "learning_rate": 9.751077270635486e-06, + "loss": 1.8497, + "step": 12250 + }, + { + "epoch": 1.54, + "grad_norm": 8.545149803161621, + "learning_rate": 9.750240555578798e-06, + "loss": 1.2184, + "step": 12251 + }, + { + "epoch": 1.54, + "grad_norm": 22.767305374145508, + "learning_rate": 9.749403840522112e-06, + "loss": 1.8692, + "step": 12252 + }, + { + "epoch": 1.54, + "grad_norm": 59.19416046142578, + "learning_rate": 9.748567125465424e-06, + "loss": 2.6589, + "step": 12253 + }, + { + "epoch": 1.54, + "grad_norm": 6.677204608917236, + "learning_rate": 9.747730410408736e-06, + "loss": 0.2296, + "step": 12254 + }, + { + "epoch": 1.54, + "grad_norm": 11.923628807067871, + "learning_rate": 9.74689369535205e-06, + "loss": 1.023, + "step": 12255 + }, + { + "epoch": 1.54, + "grad_norm": 9.695693016052246, + "learning_rate": 9.746056980295361e-06, + "loss": 1.4392, + "step": 12256 + }, + { + "epoch": 1.54, + "grad_norm": 47.62434768676758, + "learning_rate": 9.745220265238675e-06, + "loss": 2.246, + "step": 12257 + }, + { + "epoch": 1.54, + "grad_norm": 3.9003381729125977, + "learning_rate": 9.744383550181987e-06, + "loss": 0.0779, + "step": 12258 + }, + { + "epoch": 1.54, + "grad_norm": 20.364971160888672, + "learning_rate": 9.743546835125299e-06, + "loss": 0.5831, + "step": 12259 + }, + { + "epoch": 1.54, + "grad_norm": 9.905691146850586, + "learning_rate": 9.74271012006861e-06, + "loss": 1.1294, + "step": 12260 + }, + { + "epoch": 1.54, + "grad_norm": 24.679933547973633, + "learning_rate": 9.741873405011923e-06, + "loss": 2.6039, + "step": 12261 + }, + { + "epoch": 1.54, + "grad_norm": 9.308293342590332, + "learning_rate": 9.741036689955236e-06, + "loss": 0.9243, + "step": 12262 + }, + { + "epoch": 1.54, + "grad_norm": 11.862445831298828, + "learning_rate": 9.740199974898548e-06, + "loss": 2.648, + "step": 12263 + }, + { + "epoch": 1.54, + "grad_norm": 12.263579368591309, + "learning_rate": 9.739363259841862e-06, + "loss": 1.5139, + "step": 12264 + }, + { + "epoch": 1.54, + "grad_norm": 17.342958450317383, + "learning_rate": 9.738526544785174e-06, + "loss": 2.0516, + "step": 12265 + }, + { + "epoch": 1.54, + "grad_norm": 12.240121841430664, + "learning_rate": 9.737689829728488e-06, + "loss": 0.8314, + "step": 12266 + }, + { + "epoch": 1.54, + "grad_norm": 8.022343635559082, + "learning_rate": 9.7368531146718e-06, + "loss": 0.6965, + "step": 12267 + }, + { + "epoch": 1.54, + "grad_norm": 554.3936767578125, + "learning_rate": 9.736016399615111e-06, + "loss": 2.0198, + "step": 12268 + }, + { + "epoch": 1.54, + "grad_norm": 36.34038543701172, + "learning_rate": 9.735179684558425e-06, + "loss": 1.6037, + "step": 12269 + }, + { + "epoch": 1.54, + "grad_norm": 19.262128829956055, + "learning_rate": 9.734342969501737e-06, + "loss": 0.7036, + "step": 12270 + }, + { + "epoch": 1.54, + "grad_norm": 61.7270622253418, + "learning_rate": 9.733506254445049e-06, + "loss": 0.816, + "step": 12271 + }, + { + "epoch": 1.54, + "grad_norm": 7.3316450119018555, + "learning_rate": 9.732669539388363e-06, + "loss": 0.7578, + "step": 12272 + }, + { + "epoch": 1.54, + "grad_norm": 26.127300262451172, + "learning_rate": 9.731832824331675e-06, + "loss": 0.7447, + "step": 12273 + }, + { + "epoch": 1.54, + "grad_norm": 28.84157371520996, + "learning_rate": 9.730996109274987e-06, + "loss": 1.1236, + "step": 12274 + }, + { + "epoch": 1.54, + "grad_norm": 16.017126083374023, + "learning_rate": 9.730159394218299e-06, + "loss": 0.8141, + "step": 12275 + }, + { + "epoch": 1.54, + "grad_norm": 3.271279811859131, + "learning_rate": 9.729322679161612e-06, + "loss": 0.2226, + "step": 12276 + }, + { + "epoch": 1.54, + "grad_norm": 12.12415885925293, + "learning_rate": 9.728485964104924e-06, + "loss": 1.3616, + "step": 12277 + }, + { + "epoch": 1.54, + "grad_norm": 10.111727714538574, + "learning_rate": 9.727649249048238e-06, + "loss": 1.0948, + "step": 12278 + }, + { + "epoch": 1.54, + "grad_norm": 10.25155258178711, + "learning_rate": 9.72681253399155e-06, + "loss": 0.543, + "step": 12279 + }, + { + "epoch": 1.54, + "grad_norm": 28.79137420654297, + "learning_rate": 9.725975818934863e-06, + "loss": 0.8573, + "step": 12280 + }, + { + "epoch": 1.54, + "grad_norm": 17.782323837280273, + "learning_rate": 9.725139103878175e-06, + "loss": 1.3399, + "step": 12281 + }, + { + "epoch": 1.54, + "grad_norm": 13.894097328186035, + "learning_rate": 9.724302388821487e-06, + "loss": 1.2142, + "step": 12282 + }, + { + "epoch": 1.54, + "grad_norm": 11.390398979187012, + "learning_rate": 9.723465673764801e-06, + "loss": 0.8592, + "step": 12283 + }, + { + "epoch": 1.54, + "grad_norm": 24.50271987915039, + "learning_rate": 9.722628958708113e-06, + "loss": 1.0732, + "step": 12284 + }, + { + "epoch": 1.54, + "grad_norm": 16.299976348876953, + "learning_rate": 9.721792243651425e-06, + "loss": 2.596, + "step": 12285 + }, + { + "epoch": 1.54, + "grad_norm": 8.97154426574707, + "learning_rate": 9.720955528594738e-06, + "loss": 1.5355, + "step": 12286 + }, + { + "epoch": 1.54, + "grad_norm": 27.228649139404297, + "learning_rate": 9.72011881353805e-06, + "loss": 1.1823, + "step": 12287 + }, + { + "epoch": 1.54, + "grad_norm": 17.106494903564453, + "learning_rate": 9.719282098481362e-06, + "loss": 0.4884, + "step": 12288 + }, + { + "epoch": 1.54, + "grad_norm": 20.892436981201172, + "learning_rate": 9.718445383424674e-06, + "loss": 1.7475, + "step": 12289 + }, + { + "epoch": 1.54, + "grad_norm": 21.541584014892578, + "learning_rate": 9.717608668367988e-06, + "loss": 2.45, + "step": 12290 + }, + { + "epoch": 1.54, + "grad_norm": 31.563264846801758, + "learning_rate": 9.7167719533113e-06, + "loss": 1.8688, + "step": 12291 + }, + { + "epoch": 1.54, + "grad_norm": 12.500767707824707, + "learning_rate": 9.715935238254614e-06, + "loss": 1.4335, + "step": 12292 + }, + { + "epoch": 1.54, + "grad_norm": 15.389634132385254, + "learning_rate": 9.715098523197926e-06, + "loss": 1.8416, + "step": 12293 + }, + { + "epoch": 1.54, + "grad_norm": 34.42812728881836, + "learning_rate": 9.71426180814124e-06, + "loss": 1.1489, + "step": 12294 + }, + { + "epoch": 1.54, + "grad_norm": 7.696777820587158, + "learning_rate": 9.713425093084551e-06, + "loss": 0.5016, + "step": 12295 + }, + { + "epoch": 1.54, + "grad_norm": 24.3090877532959, + "learning_rate": 9.712588378027863e-06, + "loss": 0.8746, + "step": 12296 + }, + { + "epoch": 1.54, + "grad_norm": 9.006400108337402, + "learning_rate": 9.711751662971177e-06, + "loss": 1.0452, + "step": 12297 + }, + { + "epoch": 1.54, + "grad_norm": 9.543442726135254, + "learning_rate": 9.710914947914489e-06, + "loss": 1.1973, + "step": 12298 + }, + { + "epoch": 1.54, + "grad_norm": 10.206832885742188, + "learning_rate": 9.7100782328578e-06, + "loss": 2.1005, + "step": 12299 + }, + { + "epoch": 1.54, + "grad_norm": 20.02605438232422, + "learning_rate": 9.709241517801113e-06, + "loss": 1.5466, + "step": 12300 + }, + { + "epoch": 1.54, + "grad_norm": 14.613446235656738, + "learning_rate": 9.708404802744426e-06, + "loss": 0.6017, + "step": 12301 + }, + { + "epoch": 1.54, + "grad_norm": 26.277055740356445, + "learning_rate": 9.707568087687738e-06, + "loss": 1.4789, + "step": 12302 + }, + { + "epoch": 1.54, + "grad_norm": 24.248592376708984, + "learning_rate": 9.70673137263105e-06, + "loss": 1.5301, + "step": 12303 + }, + { + "epoch": 1.54, + "grad_norm": 11.80296516418457, + "learning_rate": 9.705894657574364e-06, + "loss": 0.5371, + "step": 12304 + }, + { + "epoch": 1.54, + "grad_norm": 10.005497932434082, + "learning_rate": 9.705057942517676e-06, + "loss": 1.7511, + "step": 12305 + }, + { + "epoch": 1.54, + "grad_norm": 14.882233619689941, + "learning_rate": 9.70422122746099e-06, + "loss": 1.5295, + "step": 12306 + }, + { + "epoch": 1.54, + "grad_norm": 14.475922584533691, + "learning_rate": 9.703384512404301e-06, + "loss": 0.8347, + "step": 12307 + }, + { + "epoch": 1.54, + "grad_norm": 10.40028190612793, + "learning_rate": 9.702547797347615e-06, + "loss": 0.6573, + "step": 12308 + }, + { + "epoch": 1.54, + "grad_norm": 12.468551635742188, + "learning_rate": 9.701711082290927e-06, + "loss": 1.6914, + "step": 12309 + }, + { + "epoch": 1.54, + "grad_norm": 45.54451370239258, + "learning_rate": 9.700874367234239e-06, + "loss": 1.5934, + "step": 12310 + }, + { + "epoch": 1.55, + "grad_norm": 2.441662311553955, + "learning_rate": 9.700037652177553e-06, + "loss": 0.1754, + "step": 12311 + }, + { + "epoch": 1.55, + "grad_norm": 8.537643432617188, + "learning_rate": 9.699200937120865e-06, + "loss": 0.2869, + "step": 12312 + }, + { + "epoch": 1.55, + "grad_norm": 17.57350730895996, + "learning_rate": 9.698364222064177e-06, + "loss": 2.5289, + "step": 12313 + }, + { + "epoch": 1.55, + "grad_norm": 61.0333366394043, + "learning_rate": 9.697527507007488e-06, + "loss": 1.6432, + "step": 12314 + }, + { + "epoch": 1.55, + "grad_norm": 9.310492515563965, + "learning_rate": 9.696690791950802e-06, + "loss": 0.2658, + "step": 12315 + }, + { + "epoch": 1.55, + "grad_norm": 28.702024459838867, + "learning_rate": 9.695854076894114e-06, + "loss": 1.0645, + "step": 12316 + }, + { + "epoch": 1.55, + "grad_norm": 15.340718269348145, + "learning_rate": 9.695017361837426e-06, + "loss": 1.0408, + "step": 12317 + }, + { + "epoch": 1.55, + "grad_norm": 16.828536987304688, + "learning_rate": 9.69418064678074e-06, + "loss": 2.4937, + "step": 12318 + }, + { + "epoch": 1.55, + "grad_norm": 20.91560935974121, + "learning_rate": 9.693343931724052e-06, + "loss": 1.9895, + "step": 12319 + }, + { + "epoch": 1.55, + "grad_norm": 10.444477081298828, + "learning_rate": 9.692507216667365e-06, + "loss": 0.3833, + "step": 12320 + }, + { + "epoch": 1.55, + "grad_norm": 12.016524314880371, + "learning_rate": 9.691670501610677e-06, + "loss": 1.4067, + "step": 12321 + }, + { + "epoch": 1.55, + "grad_norm": 9.002107620239258, + "learning_rate": 9.690833786553991e-06, + "loss": 0.2779, + "step": 12322 + }, + { + "epoch": 1.55, + "grad_norm": 21.251346588134766, + "learning_rate": 9.689997071497303e-06, + "loss": 0.6225, + "step": 12323 + }, + { + "epoch": 1.55, + "grad_norm": 17.89685821533203, + "learning_rate": 9.689160356440615e-06, + "loss": 2.476, + "step": 12324 + }, + { + "epoch": 1.55, + "grad_norm": 15.73957633972168, + "learning_rate": 9.688323641383928e-06, + "loss": 1.623, + "step": 12325 + }, + { + "epoch": 1.55, + "grad_norm": 5.568100929260254, + "learning_rate": 9.68748692632724e-06, + "loss": 0.6575, + "step": 12326 + }, + { + "epoch": 1.55, + "grad_norm": 10.989686012268066, + "learning_rate": 9.686650211270552e-06, + "loss": 3.0087, + "step": 12327 + }, + { + "epoch": 1.55, + "grad_norm": 17.007863998413086, + "learning_rate": 9.685813496213864e-06, + "loss": 1.5266, + "step": 12328 + }, + { + "epoch": 1.55, + "grad_norm": 18.261709213256836, + "learning_rate": 9.684976781157178e-06, + "loss": 1.0027, + "step": 12329 + }, + { + "epoch": 1.55, + "grad_norm": 12.94047737121582, + "learning_rate": 9.68414006610049e-06, + "loss": 0.7465, + "step": 12330 + }, + { + "epoch": 1.55, + "grad_norm": 16.564481735229492, + "learning_rate": 9.683303351043802e-06, + "loss": 1.8334, + "step": 12331 + }, + { + "epoch": 1.55, + "grad_norm": 65.0897216796875, + "learning_rate": 9.682466635987116e-06, + "loss": 0.7476, + "step": 12332 + }, + { + "epoch": 1.55, + "grad_norm": 9.423556327819824, + "learning_rate": 9.681629920930427e-06, + "loss": 0.4993, + "step": 12333 + }, + { + "epoch": 1.55, + "grad_norm": 22.15186309814453, + "learning_rate": 9.680793205873741e-06, + "loss": 1.0165, + "step": 12334 + }, + { + "epoch": 1.55, + "grad_norm": 28.680299758911133, + "learning_rate": 9.679956490817053e-06, + "loss": 2.543, + "step": 12335 + }, + { + "epoch": 1.55, + "grad_norm": 12.244583129882812, + "learning_rate": 9.679119775760367e-06, + "loss": 2.632, + "step": 12336 + }, + { + "epoch": 1.55, + "grad_norm": 8.169486999511719, + "learning_rate": 9.678283060703679e-06, + "loss": 0.3672, + "step": 12337 + }, + { + "epoch": 1.55, + "grad_norm": 3.6350631713867188, + "learning_rate": 9.67744634564699e-06, + "loss": 0.2428, + "step": 12338 + }, + { + "epoch": 1.55, + "grad_norm": 169.20904541015625, + "learning_rate": 9.676609630590304e-06, + "loss": 2.6483, + "step": 12339 + }, + { + "epoch": 1.55, + "grad_norm": 12.080873489379883, + "learning_rate": 9.675772915533616e-06, + "loss": 0.572, + "step": 12340 + }, + { + "epoch": 1.55, + "grad_norm": 44.13920974731445, + "learning_rate": 9.674936200476928e-06, + "loss": 0.9094, + "step": 12341 + }, + { + "epoch": 1.55, + "grad_norm": 21.638214111328125, + "learning_rate": 9.67409948542024e-06, + "loss": 1.5159, + "step": 12342 + }, + { + "epoch": 1.55, + "grad_norm": 10.509679794311523, + "learning_rate": 9.673262770363554e-06, + "loss": 2.5113, + "step": 12343 + }, + { + "epoch": 1.55, + "grad_norm": 27.861263275146484, + "learning_rate": 9.672426055306866e-06, + "loss": 1.6951, + "step": 12344 + }, + { + "epoch": 1.55, + "grad_norm": 10.870193481445312, + "learning_rate": 9.671589340250178e-06, + "loss": 0.6811, + "step": 12345 + }, + { + "epoch": 1.55, + "grad_norm": 10.583845138549805, + "learning_rate": 9.670752625193491e-06, + "loss": 0.6466, + "step": 12346 + }, + { + "epoch": 1.55, + "grad_norm": 8.40594482421875, + "learning_rate": 9.669915910136803e-06, + "loss": 0.6483, + "step": 12347 + }, + { + "epoch": 1.55, + "grad_norm": 6.930304527282715, + "learning_rate": 9.669079195080117e-06, + "loss": 1.7079, + "step": 12348 + }, + { + "epoch": 1.55, + "grad_norm": 9.957571029663086, + "learning_rate": 9.668242480023429e-06, + "loss": 0.6104, + "step": 12349 + }, + { + "epoch": 1.55, + "grad_norm": 8.939764976501465, + "learning_rate": 9.667405764966743e-06, + "loss": 1.1139, + "step": 12350 + }, + { + "epoch": 1.55, + "grad_norm": 7.519237041473389, + "learning_rate": 9.666569049910055e-06, + "loss": 0.6042, + "step": 12351 + }, + { + "epoch": 1.55, + "grad_norm": 15.627189636230469, + "learning_rate": 9.665732334853366e-06, + "loss": 1.5947, + "step": 12352 + }, + { + "epoch": 1.55, + "grad_norm": 7.847267150878906, + "learning_rate": 9.664895619796678e-06, + "loss": 1.2697, + "step": 12353 + }, + { + "epoch": 1.55, + "grad_norm": 7.785482883453369, + "learning_rate": 9.664058904739992e-06, + "loss": 0.6465, + "step": 12354 + }, + { + "epoch": 1.55, + "grad_norm": 20.836950302124023, + "learning_rate": 9.663222189683304e-06, + "loss": 1.9692, + "step": 12355 + }, + { + "epoch": 1.55, + "grad_norm": 10.960698127746582, + "learning_rate": 9.662385474626616e-06, + "loss": 1.3315, + "step": 12356 + }, + { + "epoch": 1.55, + "grad_norm": 17.833974838256836, + "learning_rate": 9.66154875956993e-06, + "loss": 1.0398, + "step": 12357 + }, + { + "epoch": 1.55, + "grad_norm": 24.96609115600586, + "learning_rate": 9.660712044513242e-06, + "loss": 1.0752, + "step": 12358 + }, + { + "epoch": 1.55, + "grad_norm": 13.910917282104492, + "learning_rate": 9.659875329456554e-06, + "loss": 1.4323, + "step": 12359 + }, + { + "epoch": 1.55, + "grad_norm": 19.315662384033203, + "learning_rate": 9.659038614399867e-06, + "loss": 0.7993, + "step": 12360 + }, + { + "epoch": 1.55, + "grad_norm": 8.594182968139648, + "learning_rate": 9.658201899343179e-06, + "loss": 1.0659, + "step": 12361 + }, + { + "epoch": 1.55, + "grad_norm": 15.608436584472656, + "learning_rate": 9.657365184286493e-06, + "loss": 0.6294, + "step": 12362 + }, + { + "epoch": 1.55, + "grad_norm": 88.83906555175781, + "learning_rate": 9.656528469229805e-06, + "loss": 1.8261, + "step": 12363 + }, + { + "epoch": 1.55, + "grad_norm": 80.29641723632812, + "learning_rate": 9.655691754173118e-06, + "loss": 1.2879, + "step": 12364 + }, + { + "epoch": 1.55, + "grad_norm": 14.70491886138916, + "learning_rate": 9.65485503911643e-06, + "loss": 1.9687, + "step": 12365 + }, + { + "epoch": 1.55, + "grad_norm": 15.118184089660645, + "learning_rate": 9.654018324059742e-06, + "loss": 0.9066, + "step": 12366 + }, + { + "epoch": 1.55, + "grad_norm": 5.322723388671875, + "learning_rate": 9.653181609003054e-06, + "loss": 0.3755, + "step": 12367 + }, + { + "epoch": 1.55, + "grad_norm": 26.06890869140625, + "learning_rate": 9.652344893946368e-06, + "loss": 1.4103, + "step": 12368 + }, + { + "epoch": 1.55, + "grad_norm": 22.888280868530273, + "learning_rate": 9.65150817888968e-06, + "loss": 2.17, + "step": 12369 + }, + { + "epoch": 1.55, + "grad_norm": 20.88540267944336, + "learning_rate": 9.650671463832992e-06, + "loss": 1.7313, + "step": 12370 + }, + { + "epoch": 1.55, + "grad_norm": 8.6563138961792, + "learning_rate": 9.649834748776305e-06, + "loss": 0.4957, + "step": 12371 + }, + { + "epoch": 1.55, + "grad_norm": 44.91715621948242, + "learning_rate": 9.648998033719617e-06, + "loss": 1.5765, + "step": 12372 + }, + { + "epoch": 1.55, + "grad_norm": 127.05415344238281, + "learning_rate": 9.64816131866293e-06, + "loss": 3.4356, + "step": 12373 + }, + { + "epoch": 1.55, + "grad_norm": 13.144569396972656, + "learning_rate": 9.647324603606243e-06, + "loss": 2.4144, + "step": 12374 + }, + { + "epoch": 1.55, + "grad_norm": 16.216175079345703, + "learning_rate": 9.646487888549555e-06, + "loss": 1.3727, + "step": 12375 + }, + { + "epoch": 1.55, + "grad_norm": 15.892380714416504, + "learning_rate": 9.645651173492869e-06, + "loss": 0.7439, + "step": 12376 + }, + { + "epoch": 1.55, + "grad_norm": 11.031373977661133, + "learning_rate": 9.64481445843618e-06, + "loss": 2.0348, + "step": 12377 + }, + { + "epoch": 1.55, + "grad_norm": 7.820834159851074, + "learning_rate": 9.643977743379494e-06, + "loss": 0.2667, + "step": 12378 + }, + { + "epoch": 1.55, + "grad_norm": 7.736286163330078, + "learning_rate": 9.643141028322806e-06, + "loss": 1.1379, + "step": 12379 + }, + { + "epoch": 1.55, + "grad_norm": 90.63997650146484, + "learning_rate": 9.642304313266118e-06, + "loss": 1.5963, + "step": 12380 + }, + { + "epoch": 1.55, + "grad_norm": 8.048603057861328, + "learning_rate": 9.64146759820943e-06, + "loss": 0.8324, + "step": 12381 + }, + { + "epoch": 1.55, + "grad_norm": 15.25834846496582, + "learning_rate": 9.640630883152742e-06, + "loss": 1.8656, + "step": 12382 + }, + { + "epoch": 1.55, + "grad_norm": 4.8124003410339355, + "learning_rate": 9.639794168096056e-06, + "loss": 1.7805, + "step": 12383 + }, + { + "epoch": 1.55, + "grad_norm": 12.141240119934082, + "learning_rate": 9.638957453039368e-06, + "loss": 1.5664, + "step": 12384 + }, + { + "epoch": 1.55, + "grad_norm": 12.14017391204834, + "learning_rate": 9.638120737982681e-06, + "loss": 2.1406, + "step": 12385 + }, + { + "epoch": 1.55, + "grad_norm": 7.157753944396973, + "learning_rate": 9.637284022925993e-06, + "loss": 0.9424, + "step": 12386 + }, + { + "epoch": 1.55, + "grad_norm": 20.844449996948242, + "learning_rate": 9.636447307869305e-06, + "loss": 2.1457, + "step": 12387 + }, + { + "epoch": 1.55, + "grad_norm": 5.263124465942383, + "learning_rate": 9.635610592812619e-06, + "loss": 0.2392, + "step": 12388 + }, + { + "epoch": 1.55, + "grad_norm": 15.502119064331055, + "learning_rate": 9.63477387775593e-06, + "loss": 0.6059, + "step": 12389 + }, + { + "epoch": 1.55, + "grad_norm": 5.6785054206848145, + "learning_rate": 9.633937162699244e-06, + "loss": 1.3948, + "step": 12390 + }, + { + "epoch": 1.56, + "grad_norm": 11.73088264465332, + "learning_rate": 9.633100447642556e-06, + "loss": 1.4068, + "step": 12391 + }, + { + "epoch": 1.56, + "grad_norm": 13.06839656829834, + "learning_rate": 9.63226373258587e-06, + "loss": 1.001, + "step": 12392 + }, + { + "epoch": 1.56, + "grad_norm": 8.036245346069336, + "learning_rate": 9.631427017529182e-06, + "loss": 2.2736, + "step": 12393 + }, + { + "epoch": 1.56, + "grad_norm": 15.826783180236816, + "learning_rate": 9.630590302472494e-06, + "loss": 2.1359, + "step": 12394 + }, + { + "epoch": 1.56, + "grad_norm": 19.438514709472656, + "learning_rate": 9.629753587415806e-06, + "loss": 1.0111, + "step": 12395 + }, + { + "epoch": 1.56, + "grad_norm": 14.289900779724121, + "learning_rate": 9.628916872359118e-06, + "loss": 1.2293, + "step": 12396 + }, + { + "epoch": 1.56, + "grad_norm": 72.85143280029297, + "learning_rate": 9.628080157302432e-06, + "loss": 1.9941, + "step": 12397 + }, + { + "epoch": 1.56, + "grad_norm": 12.056010246276855, + "learning_rate": 9.627243442245743e-06, + "loss": 0.7306, + "step": 12398 + }, + { + "epoch": 1.56, + "grad_norm": 12.72350788116455, + "learning_rate": 9.626406727189055e-06, + "loss": 1.0371, + "step": 12399 + }, + { + "epoch": 1.56, + "grad_norm": 11.367460250854492, + "learning_rate": 9.625570012132369e-06, + "loss": 0.6634, + "step": 12400 + }, + { + "epoch": 1.56, + "eval_loss": 0.09418872743844986, + "eval_runtime": 96.357, + "eval_samples_per_second": 36.759, + "eval_steps_per_second": 36.759, + "step": 12400 + }, + { + "epoch": 1.56, + "grad_norm": 10.752225875854492, + "learning_rate": 9.624733297075681e-06, + "loss": 0.6213, + "step": 12401 + }, + { + "epoch": 1.56, + "grad_norm": 12.529314041137695, + "learning_rate": 9.623896582018995e-06, + "loss": 0.6571, + "step": 12402 + }, + { + "epoch": 1.56, + "grad_norm": 27.533313751220703, + "learning_rate": 9.623059866962307e-06, + "loss": 1.3716, + "step": 12403 + }, + { + "epoch": 1.56, + "grad_norm": 6.712658405303955, + "learning_rate": 9.62222315190562e-06, + "loss": 0.6971, + "step": 12404 + }, + { + "epoch": 1.56, + "grad_norm": 10.645259857177734, + "learning_rate": 9.621386436848932e-06, + "loss": 0.7859, + "step": 12405 + }, + { + "epoch": 1.56, + "grad_norm": 17.8518123626709, + "learning_rate": 9.620549721792244e-06, + "loss": 0.8945, + "step": 12406 + }, + { + "epoch": 1.56, + "grad_norm": 9.712150573730469, + "learning_rate": 9.619713006735558e-06, + "loss": 0.5132, + "step": 12407 + }, + { + "epoch": 1.56, + "grad_norm": 25.49022674560547, + "learning_rate": 9.61887629167887e-06, + "loss": 1.8113, + "step": 12408 + }, + { + "epoch": 1.56, + "grad_norm": 30.441619873046875, + "learning_rate": 9.618039576622182e-06, + "loss": 1.022, + "step": 12409 + }, + { + "epoch": 1.56, + "grad_norm": 13.423223495483398, + "learning_rate": 9.617202861565494e-06, + "loss": 1.046, + "step": 12410 + }, + { + "epoch": 1.56, + "grad_norm": 36.31425476074219, + "learning_rate": 9.616366146508807e-06, + "loss": 2.3593, + "step": 12411 + }, + { + "epoch": 1.56, + "grad_norm": 23.63006591796875, + "learning_rate": 9.61552943145212e-06, + "loss": 0.8683, + "step": 12412 + }, + { + "epoch": 1.56, + "grad_norm": 20.383546829223633, + "learning_rate": 9.614692716395431e-06, + "loss": 2.1879, + "step": 12413 + }, + { + "epoch": 1.56, + "grad_norm": 10.273029327392578, + "learning_rate": 9.613856001338745e-06, + "loss": 0.3497, + "step": 12414 + }, + { + "epoch": 1.56, + "grad_norm": 15.434338569641113, + "learning_rate": 9.613019286282057e-06, + "loss": 0.7268, + "step": 12415 + }, + { + "epoch": 1.56, + "grad_norm": 14.430562973022461, + "learning_rate": 9.61218257122537e-06, + "loss": 1.358, + "step": 12416 + }, + { + "epoch": 1.56, + "grad_norm": 7.518756866455078, + "learning_rate": 9.611345856168682e-06, + "loss": 0.7091, + "step": 12417 + }, + { + "epoch": 1.56, + "grad_norm": 58.18947982788086, + "learning_rate": 9.610509141111996e-06, + "loss": 1.5999, + "step": 12418 + }, + { + "epoch": 1.56, + "grad_norm": 16.584684371948242, + "learning_rate": 9.609672426055308e-06, + "loss": 0.9131, + "step": 12419 + }, + { + "epoch": 1.56, + "grad_norm": 24.425582885742188, + "learning_rate": 9.60883571099862e-06, + "loss": 1.1493, + "step": 12420 + }, + { + "epoch": 1.56, + "grad_norm": 112.81609344482422, + "learning_rate": 9.607998995941934e-06, + "loss": 2.2978, + "step": 12421 + }, + { + "epoch": 1.56, + "grad_norm": 34.88551712036133, + "learning_rate": 9.607162280885246e-06, + "loss": 1.3919, + "step": 12422 + }, + { + "epoch": 1.56, + "grad_norm": 9.413063049316406, + "learning_rate": 9.606325565828558e-06, + "loss": 0.5265, + "step": 12423 + }, + { + "epoch": 1.56, + "grad_norm": 10.930394172668457, + "learning_rate": 9.60548885077187e-06, + "loss": 0.9577, + "step": 12424 + }, + { + "epoch": 1.56, + "grad_norm": 22.859756469726562, + "learning_rate": 9.604652135715183e-06, + "loss": 1.2404, + "step": 12425 + }, + { + "epoch": 1.56, + "grad_norm": 11.711565017700195, + "learning_rate": 9.603815420658495e-06, + "loss": 0.7393, + "step": 12426 + }, + { + "epoch": 1.56, + "grad_norm": 27.851823806762695, + "learning_rate": 9.602978705601807e-06, + "loss": 1.8269, + "step": 12427 + }, + { + "epoch": 1.56, + "grad_norm": 35.024620056152344, + "learning_rate": 9.60214199054512e-06, + "loss": 2.4503, + "step": 12428 + }, + { + "epoch": 1.56, + "grad_norm": 11.001527786254883, + "learning_rate": 9.601305275488433e-06, + "loss": 2.1216, + "step": 12429 + }, + { + "epoch": 1.56, + "grad_norm": 14.854911804199219, + "learning_rate": 9.600468560431746e-06, + "loss": 0.6204, + "step": 12430 + }, + { + "epoch": 1.56, + "grad_norm": 15.91850471496582, + "learning_rate": 9.599631845375058e-06, + "loss": 1.0912, + "step": 12431 + }, + { + "epoch": 1.56, + "grad_norm": 5.914279460906982, + "learning_rate": 9.598795130318372e-06, + "loss": 0.6524, + "step": 12432 + }, + { + "epoch": 1.56, + "grad_norm": 15.950666427612305, + "learning_rate": 9.597958415261684e-06, + "loss": 1.0621, + "step": 12433 + }, + { + "epoch": 1.56, + "grad_norm": 16.08356285095215, + "learning_rate": 9.597121700204996e-06, + "loss": 1.4678, + "step": 12434 + }, + { + "epoch": 1.56, + "grad_norm": 4.259883880615234, + "learning_rate": 9.596284985148308e-06, + "loss": 0.2889, + "step": 12435 + }, + { + "epoch": 1.56, + "grad_norm": 19.73095703125, + "learning_rate": 9.595448270091621e-06, + "loss": 1.0126, + "step": 12436 + }, + { + "epoch": 1.56, + "grad_norm": 12.76375675201416, + "learning_rate": 9.594611555034933e-06, + "loss": 2.8601, + "step": 12437 + }, + { + "epoch": 1.56, + "grad_norm": 17.41053009033203, + "learning_rate": 9.593774839978245e-06, + "loss": 0.9852, + "step": 12438 + }, + { + "epoch": 1.56, + "grad_norm": 26.395841598510742, + "learning_rate": 9.592938124921559e-06, + "loss": 1.3814, + "step": 12439 + }, + { + "epoch": 1.56, + "grad_norm": 17.374435424804688, + "learning_rate": 9.592101409864871e-06, + "loss": 2.6592, + "step": 12440 + }, + { + "epoch": 1.56, + "grad_norm": 24.25494384765625, + "learning_rate": 9.591264694808183e-06, + "loss": 1.4085, + "step": 12441 + }, + { + "epoch": 1.56, + "grad_norm": 14.291749954223633, + "learning_rate": 9.590427979751497e-06, + "loss": 1.3867, + "step": 12442 + }, + { + "epoch": 1.56, + "grad_norm": 43.80595779418945, + "learning_rate": 9.589591264694809e-06, + "loss": 1.5136, + "step": 12443 + }, + { + "epoch": 1.56, + "grad_norm": 25.88615608215332, + "learning_rate": 9.588754549638122e-06, + "loss": 1.3238, + "step": 12444 + }, + { + "epoch": 1.56, + "grad_norm": 9.90751838684082, + "learning_rate": 9.587917834581434e-06, + "loss": 0.4805, + "step": 12445 + }, + { + "epoch": 1.56, + "grad_norm": 25.095247268676758, + "learning_rate": 9.587081119524748e-06, + "loss": 1.7575, + "step": 12446 + }, + { + "epoch": 1.56, + "grad_norm": 4.303838729858398, + "learning_rate": 9.58624440446806e-06, + "loss": 0.3622, + "step": 12447 + }, + { + "epoch": 1.56, + "grad_norm": 11.002155303955078, + "learning_rate": 9.585407689411372e-06, + "loss": 0.9787, + "step": 12448 + }, + { + "epoch": 1.56, + "grad_norm": 24.475616455078125, + "learning_rate": 9.584570974354684e-06, + "loss": 1.2444, + "step": 12449 + }, + { + "epoch": 1.56, + "grad_norm": 27.28084373474121, + "learning_rate": 9.583734259297997e-06, + "loss": 1.5608, + "step": 12450 + }, + { + "epoch": 1.56, + "grad_norm": 18.78505516052246, + "learning_rate": 9.58289754424131e-06, + "loss": 1.3009, + "step": 12451 + }, + { + "epoch": 1.56, + "grad_norm": 19.323734283447266, + "learning_rate": 9.582060829184621e-06, + "loss": 0.9721, + "step": 12452 + }, + { + "epoch": 1.56, + "grad_norm": 4.087686061859131, + "learning_rate": 9.581224114127935e-06, + "loss": 0.2535, + "step": 12453 + }, + { + "epoch": 1.56, + "grad_norm": 19.48501205444336, + "learning_rate": 9.580387399071247e-06, + "loss": 0.4955, + "step": 12454 + }, + { + "epoch": 1.56, + "grad_norm": 13.740678787231445, + "learning_rate": 9.579550684014559e-06, + "loss": 0.8021, + "step": 12455 + }, + { + "epoch": 1.56, + "grad_norm": 4.655831813812256, + "learning_rate": 9.578713968957872e-06, + "loss": 0.6098, + "step": 12456 + }, + { + "epoch": 1.56, + "grad_norm": 15.411494255065918, + "learning_rate": 9.577877253901184e-06, + "loss": 1.0082, + "step": 12457 + }, + { + "epoch": 1.56, + "grad_norm": 4.377325534820557, + "learning_rate": 9.577040538844498e-06, + "loss": 0.2156, + "step": 12458 + }, + { + "epoch": 1.56, + "grad_norm": 11.858933448791504, + "learning_rate": 9.57620382378781e-06, + "loss": 2.2354, + "step": 12459 + }, + { + "epoch": 1.56, + "grad_norm": 6.7858195304870605, + "learning_rate": 9.575367108731124e-06, + "loss": 0.2565, + "step": 12460 + }, + { + "epoch": 1.56, + "grad_norm": 6.283478260040283, + "learning_rate": 9.574530393674436e-06, + "loss": 0.6546, + "step": 12461 + }, + { + "epoch": 1.56, + "grad_norm": 8.30803394317627, + "learning_rate": 9.573693678617748e-06, + "loss": 0.3907, + "step": 12462 + }, + { + "epoch": 1.56, + "grad_norm": 178.96331787109375, + "learning_rate": 9.57285696356106e-06, + "loss": 2.536, + "step": 12463 + }, + { + "epoch": 1.56, + "grad_norm": 17.98655128479004, + "learning_rate": 9.572020248504371e-06, + "loss": 2.4661, + "step": 12464 + }, + { + "epoch": 1.56, + "grad_norm": 6.036463737487793, + "learning_rate": 9.571183533447685e-06, + "loss": 0.405, + "step": 12465 + }, + { + "epoch": 1.56, + "grad_norm": 10.466330528259277, + "learning_rate": 9.570346818390997e-06, + "loss": 0.5749, + "step": 12466 + }, + { + "epoch": 1.56, + "grad_norm": 25.326618194580078, + "learning_rate": 9.56951010333431e-06, + "loss": 1.4268, + "step": 12467 + }, + { + "epoch": 1.56, + "grad_norm": 10.319133758544922, + "learning_rate": 9.568673388277623e-06, + "loss": 2.2784, + "step": 12468 + }, + { + "epoch": 1.56, + "grad_norm": 11.29590892791748, + "learning_rate": 9.567836673220935e-06, + "loss": 0.749, + "step": 12469 + }, + { + "epoch": 1.56, + "grad_norm": 53.39611053466797, + "learning_rate": 9.566999958164248e-06, + "loss": 1.4989, + "step": 12470 + }, + { + "epoch": 1.57, + "grad_norm": 12.615883827209473, + "learning_rate": 9.56616324310756e-06, + "loss": 1.1332, + "step": 12471 + }, + { + "epoch": 1.57, + "grad_norm": 8.729395866394043, + "learning_rate": 9.565326528050874e-06, + "loss": 1.5362, + "step": 12472 + }, + { + "epoch": 1.57, + "grad_norm": 8.843531608581543, + "learning_rate": 9.564489812994186e-06, + "loss": 0.8666, + "step": 12473 + }, + { + "epoch": 1.57, + "grad_norm": 15.752050399780273, + "learning_rate": 9.5636530979375e-06, + "loss": 3.0116, + "step": 12474 + }, + { + "epoch": 1.57, + "grad_norm": 9.753077507019043, + "learning_rate": 9.562816382880811e-06, + "loss": 0.9885, + "step": 12475 + }, + { + "epoch": 1.57, + "grad_norm": 12.625199317932129, + "learning_rate": 9.561979667824123e-06, + "loss": 1.6332, + "step": 12476 + }, + { + "epoch": 1.57, + "grad_norm": 12.897058486938477, + "learning_rate": 9.561142952767435e-06, + "loss": 0.9332, + "step": 12477 + }, + { + "epoch": 1.57, + "grad_norm": 23.377389907836914, + "learning_rate": 9.560306237710747e-06, + "loss": 1.4842, + "step": 12478 + }, + { + "epoch": 1.57, + "grad_norm": 18.772571563720703, + "learning_rate": 9.559469522654061e-06, + "loss": 0.9665, + "step": 12479 + }, + { + "epoch": 1.57, + "grad_norm": 15.308571815490723, + "learning_rate": 9.558632807597373e-06, + "loss": 1.1477, + "step": 12480 + }, + { + "epoch": 1.57, + "grad_norm": 15.69286060333252, + "learning_rate": 9.557796092540687e-06, + "loss": 0.5971, + "step": 12481 + }, + { + "epoch": 1.57, + "grad_norm": 4.5303850173950195, + "learning_rate": 9.556959377483999e-06, + "loss": 0.506, + "step": 12482 + }, + { + "epoch": 1.57, + "grad_norm": 22.065988540649414, + "learning_rate": 9.55612266242731e-06, + "loss": 1.6268, + "step": 12483 + }, + { + "epoch": 1.57, + "grad_norm": 68.5661392211914, + "learning_rate": 9.555285947370624e-06, + "loss": 1.652, + "step": 12484 + }, + { + "epoch": 1.57, + "grad_norm": 10.060134887695312, + "learning_rate": 9.554449232313936e-06, + "loss": 0.5053, + "step": 12485 + }, + { + "epoch": 1.57, + "grad_norm": 27.67550277709961, + "learning_rate": 9.55361251725725e-06, + "loss": 2.0167, + "step": 12486 + }, + { + "epoch": 1.57, + "grad_norm": 11.384492874145508, + "learning_rate": 9.552775802200562e-06, + "loss": 1.5019, + "step": 12487 + }, + { + "epoch": 1.57, + "grad_norm": 6.32214879989624, + "learning_rate": 9.551939087143874e-06, + "loss": 0.5649, + "step": 12488 + }, + { + "epoch": 1.57, + "grad_norm": 7.0320892333984375, + "learning_rate": 9.551102372087187e-06, + "loss": 0.3363, + "step": 12489 + }, + { + "epoch": 1.57, + "grad_norm": 4.202093601226807, + "learning_rate": 9.5502656570305e-06, + "loss": 0.1351, + "step": 12490 + }, + { + "epoch": 1.57, + "grad_norm": 13.17863941192627, + "learning_rate": 9.549428941973811e-06, + "loss": 1.2579, + "step": 12491 + }, + { + "epoch": 1.57, + "grad_norm": 10.442083358764648, + "learning_rate": 9.548592226917123e-06, + "loss": 1.7067, + "step": 12492 + }, + { + "epoch": 1.57, + "grad_norm": 8.665885925292969, + "learning_rate": 9.547755511860437e-06, + "loss": 1.1546, + "step": 12493 + }, + { + "epoch": 1.57, + "grad_norm": 6.95634651184082, + "learning_rate": 9.546918796803749e-06, + "loss": 0.2485, + "step": 12494 + }, + { + "epoch": 1.57, + "grad_norm": 26.555082321166992, + "learning_rate": 9.546082081747062e-06, + "loss": 1.2272, + "step": 12495 + }, + { + "epoch": 1.57, + "grad_norm": 12.900822639465332, + "learning_rate": 9.545245366690374e-06, + "loss": 1.0477, + "step": 12496 + }, + { + "epoch": 1.57, + "grad_norm": 8.701725959777832, + "learning_rate": 9.544408651633686e-06, + "loss": 0.8932, + "step": 12497 + }, + { + "epoch": 1.57, + "grad_norm": 35.35837173461914, + "learning_rate": 9.543571936577e-06, + "loss": 2.5717, + "step": 12498 + }, + { + "epoch": 1.57, + "grad_norm": 589.602783203125, + "learning_rate": 9.542735221520312e-06, + "loss": 1.1103, + "step": 12499 + }, + { + "epoch": 1.57, + "grad_norm": 9.06594467163086, + "learning_rate": 9.541898506463626e-06, + "loss": 0.4326, + "step": 12500 + }, + { + "epoch": 1.57, + "grad_norm": 117.72836303710938, + "learning_rate": 9.541061791406938e-06, + "loss": 0.6035, + "step": 12501 + }, + { + "epoch": 1.57, + "grad_norm": 83.0579833984375, + "learning_rate": 9.54022507635025e-06, + "loss": 0.5596, + "step": 12502 + }, + { + "epoch": 1.57, + "grad_norm": 13.687908172607422, + "learning_rate": 9.539388361293563e-06, + "loss": 1.0665, + "step": 12503 + }, + { + "epoch": 1.57, + "grad_norm": 8.826162338256836, + "learning_rate": 9.538551646236875e-06, + "loss": 1.4933, + "step": 12504 + }, + { + "epoch": 1.57, + "grad_norm": 56.926944732666016, + "learning_rate": 9.537714931180187e-06, + "loss": 0.7264, + "step": 12505 + }, + { + "epoch": 1.57, + "grad_norm": 19.74498176574707, + "learning_rate": 9.536878216123499e-06, + "loss": 2.171, + "step": 12506 + }, + { + "epoch": 1.57, + "grad_norm": 75.2907943725586, + "learning_rate": 9.536041501066813e-06, + "loss": 0.723, + "step": 12507 + }, + { + "epoch": 1.57, + "grad_norm": 7.58877420425415, + "learning_rate": 9.535204786010125e-06, + "loss": 1.0957, + "step": 12508 + }, + { + "epoch": 1.57, + "grad_norm": 12.512656211853027, + "learning_rate": 9.534368070953438e-06, + "loss": 1.226, + "step": 12509 + }, + { + "epoch": 1.57, + "grad_norm": 9.59494400024414, + "learning_rate": 9.53353135589675e-06, + "loss": 0.4392, + "step": 12510 + }, + { + "epoch": 1.57, + "grad_norm": 16.1082820892334, + "learning_rate": 9.532694640840062e-06, + "loss": 2.6572, + "step": 12511 + }, + { + "epoch": 1.57, + "grad_norm": 11.349607467651367, + "learning_rate": 9.531857925783376e-06, + "loss": 2.1882, + "step": 12512 + }, + { + "epoch": 1.57, + "grad_norm": 112.89188385009766, + "learning_rate": 9.531021210726688e-06, + "loss": 1.2279, + "step": 12513 + }, + { + "epoch": 1.57, + "grad_norm": 17.9888973236084, + "learning_rate": 9.530184495670001e-06, + "loss": 2.8782, + "step": 12514 + }, + { + "epoch": 1.57, + "grad_norm": 21.969684600830078, + "learning_rate": 9.529347780613313e-06, + "loss": 1.5233, + "step": 12515 + }, + { + "epoch": 1.57, + "grad_norm": 33.1344108581543, + "learning_rate": 9.528511065556625e-06, + "loss": 3.0303, + "step": 12516 + }, + { + "epoch": 1.57, + "grad_norm": 4.86375093460083, + "learning_rate": 9.527674350499937e-06, + "loss": 0.4456, + "step": 12517 + }, + { + "epoch": 1.57, + "grad_norm": 11.052216529846191, + "learning_rate": 9.526837635443251e-06, + "loss": 1.0125, + "step": 12518 + }, + { + "epoch": 1.57, + "grad_norm": 9.90180778503418, + "learning_rate": 9.526000920386563e-06, + "loss": 1.4413, + "step": 12519 + }, + { + "epoch": 1.57, + "grad_norm": 16.92300796508789, + "learning_rate": 9.525164205329875e-06, + "loss": 2.4217, + "step": 12520 + }, + { + "epoch": 1.57, + "grad_norm": 10.172425270080566, + "learning_rate": 9.524327490273188e-06, + "loss": 0.5772, + "step": 12521 + }, + { + "epoch": 1.57, + "grad_norm": 39.183902740478516, + "learning_rate": 9.5234907752165e-06, + "loss": 5.5313, + "step": 12522 + }, + { + "epoch": 1.57, + "grad_norm": 10.03840160369873, + "learning_rate": 9.522654060159814e-06, + "loss": 1.5756, + "step": 12523 + }, + { + "epoch": 1.57, + "grad_norm": 11.910656929016113, + "learning_rate": 9.521817345103126e-06, + "loss": 0.5367, + "step": 12524 + }, + { + "epoch": 1.57, + "grad_norm": 60.089820861816406, + "learning_rate": 9.520980630046438e-06, + "loss": 2.7753, + "step": 12525 + }, + { + "epoch": 1.57, + "grad_norm": 12.336292266845703, + "learning_rate": 9.520143914989752e-06, + "loss": 1.1547, + "step": 12526 + }, + { + "epoch": 1.57, + "grad_norm": 2.9278714656829834, + "learning_rate": 9.519307199933064e-06, + "loss": 0.065, + "step": 12527 + }, + { + "epoch": 1.57, + "grad_norm": 10.427318572998047, + "learning_rate": 9.518470484876377e-06, + "loss": 1.6682, + "step": 12528 + }, + { + "epoch": 1.57, + "grad_norm": 260.3096008300781, + "learning_rate": 9.51763376981969e-06, + "loss": 0.5499, + "step": 12529 + }, + { + "epoch": 1.57, + "grad_norm": 6.432458400726318, + "learning_rate": 9.516797054763001e-06, + "loss": 0.1183, + "step": 12530 + }, + { + "epoch": 1.57, + "grad_norm": 19.33864974975586, + "learning_rate": 9.515960339706313e-06, + "loss": 1.3014, + "step": 12531 + }, + { + "epoch": 1.57, + "grad_norm": 8.947526931762695, + "learning_rate": 9.515123624649627e-06, + "loss": 1.0892, + "step": 12532 + }, + { + "epoch": 1.57, + "grad_norm": 27.10247802734375, + "learning_rate": 9.514286909592939e-06, + "loss": 1.4986, + "step": 12533 + }, + { + "epoch": 1.57, + "grad_norm": 25.001182556152344, + "learning_rate": 9.51345019453625e-06, + "loss": 2.5495, + "step": 12534 + }, + { + "epoch": 1.57, + "grad_norm": 10.446853637695312, + "learning_rate": 9.512613479479564e-06, + "loss": 0.5085, + "step": 12535 + }, + { + "epoch": 1.57, + "grad_norm": 6.622890949249268, + "learning_rate": 9.511776764422876e-06, + "loss": 0.3592, + "step": 12536 + }, + { + "epoch": 1.57, + "grad_norm": 4.06160831451416, + "learning_rate": 9.51094004936619e-06, + "loss": 0.5385, + "step": 12537 + }, + { + "epoch": 1.57, + "grad_norm": 7.546213626861572, + "learning_rate": 9.510103334309502e-06, + "loss": 0.1624, + "step": 12538 + }, + { + "epoch": 1.57, + "grad_norm": 17.955360412597656, + "learning_rate": 9.509266619252814e-06, + "loss": 1.0174, + "step": 12539 + }, + { + "epoch": 1.57, + "grad_norm": 43.812557220458984, + "learning_rate": 9.508429904196127e-06, + "loss": 2.9263, + "step": 12540 + }, + { + "epoch": 1.57, + "grad_norm": 14.057463645935059, + "learning_rate": 9.50759318913944e-06, + "loss": 1.0632, + "step": 12541 + }, + { + "epoch": 1.57, + "grad_norm": 8.944059371948242, + "learning_rate": 9.506756474082753e-06, + "loss": 0.5991, + "step": 12542 + }, + { + "epoch": 1.57, + "grad_norm": 12.784570693969727, + "learning_rate": 9.505919759026065e-06, + "loss": 1.1579, + "step": 12543 + }, + { + "epoch": 1.57, + "grad_norm": 5.623643398284912, + "learning_rate": 9.505083043969377e-06, + "loss": 0.363, + "step": 12544 + }, + { + "epoch": 1.57, + "grad_norm": 7.4926652908325195, + "learning_rate": 9.504246328912689e-06, + "loss": 0.6149, + "step": 12545 + }, + { + "epoch": 1.57, + "grad_norm": 43.37441635131836, + "learning_rate": 9.503409613856001e-06, + "loss": 1.8677, + "step": 12546 + }, + { + "epoch": 1.57, + "grad_norm": 7.164695739746094, + "learning_rate": 9.502572898799315e-06, + "loss": 0.3867, + "step": 12547 + }, + { + "epoch": 1.57, + "grad_norm": 8.881974220275879, + "learning_rate": 9.501736183742626e-06, + "loss": 1.7908, + "step": 12548 + }, + { + "epoch": 1.57, + "grad_norm": 11.763453483581543, + "learning_rate": 9.50089946868594e-06, + "loss": 1.5111, + "step": 12549 + }, + { + "epoch": 1.58, + "grad_norm": 16.42769432067871, + "learning_rate": 9.500062753629252e-06, + "loss": 0.8511, + "step": 12550 + }, + { + "epoch": 1.58, + "grad_norm": 4.932626724243164, + "learning_rate": 9.499226038572566e-06, + "loss": 0.4485, + "step": 12551 + }, + { + "epoch": 1.58, + "grad_norm": 12.884479522705078, + "learning_rate": 9.498389323515878e-06, + "loss": 1.6867, + "step": 12552 + }, + { + "epoch": 1.58, + "grad_norm": 28.30213165283203, + "learning_rate": 9.49755260845919e-06, + "loss": 2.3976, + "step": 12553 + }, + { + "epoch": 1.58, + "grad_norm": 12.66464900970459, + "learning_rate": 9.496715893402503e-06, + "loss": 1.8408, + "step": 12554 + }, + { + "epoch": 1.58, + "grad_norm": 14.149888038635254, + "learning_rate": 9.495879178345815e-06, + "loss": 0.6018, + "step": 12555 + }, + { + "epoch": 1.58, + "grad_norm": 3.6592276096343994, + "learning_rate": 9.495042463289127e-06, + "loss": 0.1692, + "step": 12556 + }, + { + "epoch": 1.58, + "grad_norm": 130.72134399414062, + "learning_rate": 9.494205748232441e-06, + "loss": 2.7858, + "step": 12557 + }, + { + "epoch": 1.58, + "grad_norm": 19.557369232177734, + "learning_rate": 9.493369033175753e-06, + "loss": 0.714, + "step": 12558 + }, + { + "epoch": 1.58, + "grad_norm": 27.16683006286621, + "learning_rate": 9.492532318119065e-06, + "loss": 1.6699, + "step": 12559 + }, + { + "epoch": 1.58, + "grad_norm": 8.552977561950684, + "learning_rate": 9.491695603062377e-06, + "loss": 1.3134, + "step": 12560 + }, + { + "epoch": 1.58, + "grad_norm": 32.981197357177734, + "learning_rate": 9.49085888800569e-06, + "loss": 2.8792, + "step": 12561 + }, + { + "epoch": 1.58, + "grad_norm": 24.212053298950195, + "learning_rate": 9.490022172949002e-06, + "loss": 1.1681, + "step": 12562 + }, + { + "epoch": 1.58, + "grad_norm": 5.9965362548828125, + "learning_rate": 9.489185457892316e-06, + "loss": 0.2369, + "step": 12563 + }, + { + "epoch": 1.58, + "grad_norm": 8.74487590789795, + "learning_rate": 9.488348742835628e-06, + "loss": 0.6477, + "step": 12564 + }, + { + "epoch": 1.58, + "grad_norm": 7.746337413787842, + "learning_rate": 9.487512027778942e-06, + "loss": 1.4842, + "step": 12565 + }, + { + "epoch": 1.58, + "grad_norm": 6.999999046325684, + "learning_rate": 9.486675312722254e-06, + "loss": 2.7009, + "step": 12566 + }, + { + "epoch": 1.58, + "grad_norm": 3.5180628299713135, + "learning_rate": 9.485838597665565e-06, + "loss": 0.1262, + "step": 12567 + }, + { + "epoch": 1.58, + "grad_norm": 14.253186225891113, + "learning_rate": 9.485001882608879e-06, + "loss": 1.9534, + "step": 12568 + }, + { + "epoch": 1.58, + "grad_norm": 14.465633392333984, + "learning_rate": 9.484165167552191e-06, + "loss": 0.9027, + "step": 12569 + }, + { + "epoch": 1.58, + "grad_norm": 10.833093643188477, + "learning_rate": 9.483328452495503e-06, + "loss": 1.2568, + "step": 12570 + }, + { + "epoch": 1.58, + "grad_norm": 14.690328598022461, + "learning_rate": 9.482491737438817e-06, + "loss": 1.6662, + "step": 12571 + }, + { + "epoch": 1.58, + "grad_norm": 12.066746711730957, + "learning_rate": 9.481655022382129e-06, + "loss": 1.4585, + "step": 12572 + }, + { + "epoch": 1.58, + "grad_norm": 19.365251541137695, + "learning_rate": 9.48081830732544e-06, + "loss": 1.6843, + "step": 12573 + }, + { + "epoch": 1.58, + "grad_norm": 18.7397518157959, + "learning_rate": 9.479981592268753e-06, + "loss": 1.2523, + "step": 12574 + }, + { + "epoch": 1.58, + "grad_norm": 13.922237396240234, + "learning_rate": 9.479144877212066e-06, + "loss": 1.4934, + "step": 12575 + }, + { + "epoch": 1.58, + "grad_norm": 29.804685592651367, + "learning_rate": 9.478308162155378e-06, + "loss": 1.5592, + "step": 12576 + }, + { + "epoch": 1.58, + "grad_norm": 19.848363876342773, + "learning_rate": 9.477471447098692e-06, + "loss": 1.0612, + "step": 12577 + }, + { + "epoch": 1.58, + "grad_norm": 18.877689361572266, + "learning_rate": 9.476634732042004e-06, + "loss": 2.5891, + "step": 12578 + }, + { + "epoch": 1.58, + "grad_norm": 18.858428955078125, + "learning_rate": 9.475798016985316e-06, + "loss": 3.1228, + "step": 12579 + }, + { + "epoch": 1.58, + "grad_norm": 24.79368782043457, + "learning_rate": 9.47496130192863e-06, + "loss": 1.2129, + "step": 12580 + }, + { + "epoch": 1.58, + "grad_norm": 12.999039649963379, + "learning_rate": 9.474124586871941e-06, + "loss": 1.8608, + "step": 12581 + }, + { + "epoch": 1.58, + "grad_norm": 7.107621669769287, + "learning_rate": 9.473287871815255e-06, + "loss": 0.7415, + "step": 12582 + }, + { + "epoch": 1.58, + "grad_norm": 5.948925971984863, + "learning_rate": 9.472451156758567e-06, + "loss": 0.3267, + "step": 12583 + }, + { + "epoch": 1.58, + "grad_norm": 28.19964027404785, + "learning_rate": 9.471614441701879e-06, + "loss": 3.2042, + "step": 12584 + }, + { + "epoch": 1.58, + "grad_norm": 9.318622589111328, + "learning_rate": 9.470777726645193e-06, + "loss": 2.0595, + "step": 12585 + }, + { + "epoch": 1.58, + "grad_norm": 17.369403839111328, + "learning_rate": 9.469941011588504e-06, + "loss": 1.1902, + "step": 12586 + }, + { + "epoch": 1.58, + "grad_norm": 12.831024169921875, + "learning_rate": 9.469104296531816e-06, + "loss": 0.8384, + "step": 12587 + }, + { + "epoch": 1.58, + "grad_norm": 10.816106796264648, + "learning_rate": 9.468267581475128e-06, + "loss": 1.1297, + "step": 12588 + }, + { + "epoch": 1.58, + "grad_norm": 393.5542297363281, + "learning_rate": 9.467430866418442e-06, + "loss": 1.3763, + "step": 12589 + }, + { + "epoch": 1.58, + "grad_norm": 13.01297664642334, + "learning_rate": 9.466594151361754e-06, + "loss": 0.5997, + "step": 12590 + }, + { + "epoch": 1.58, + "grad_norm": 12.69599437713623, + "learning_rate": 9.465757436305068e-06, + "loss": 0.7092, + "step": 12591 + }, + { + "epoch": 1.58, + "grad_norm": 9.60859489440918, + "learning_rate": 9.46492072124838e-06, + "loss": 1.2716, + "step": 12592 + }, + { + "epoch": 1.58, + "grad_norm": 10.400321006774902, + "learning_rate": 9.464084006191692e-06, + "loss": 0.568, + "step": 12593 + }, + { + "epoch": 1.58, + "grad_norm": 140.9966583251953, + "learning_rate": 9.463247291135005e-06, + "loss": 1.9688, + "step": 12594 + }, + { + "epoch": 1.58, + "grad_norm": 6.2600016593933105, + "learning_rate": 9.462410576078317e-06, + "loss": 1.351, + "step": 12595 + }, + { + "epoch": 1.58, + "grad_norm": 20.106847763061523, + "learning_rate": 9.46157386102163e-06, + "loss": 0.7045, + "step": 12596 + }, + { + "epoch": 1.58, + "grad_norm": 5.930684566497803, + "learning_rate": 9.460737145964943e-06, + "loss": 0.3359, + "step": 12597 + }, + { + "epoch": 1.58, + "grad_norm": 28.652748107910156, + "learning_rate": 9.459900430908255e-06, + "loss": 1.2072, + "step": 12598 + }, + { + "epoch": 1.58, + "grad_norm": 5.484364986419678, + "learning_rate": 9.459063715851567e-06, + "loss": 0.5294, + "step": 12599 + }, + { + "epoch": 1.58, + "grad_norm": 24.903980255126953, + "learning_rate": 9.45822700079488e-06, + "loss": 0.5383, + "step": 12600 + }, + { + "epoch": 1.58, + "grad_norm": 9.68040943145752, + "learning_rate": 9.457390285738192e-06, + "loss": 2.5263, + "step": 12601 + }, + { + "epoch": 1.58, + "grad_norm": 12.230031967163086, + "learning_rate": 9.456553570681504e-06, + "loss": 0.6999, + "step": 12602 + }, + { + "epoch": 1.58, + "grad_norm": 6.316253662109375, + "learning_rate": 9.455716855624818e-06, + "loss": 1.0758, + "step": 12603 + }, + { + "epoch": 1.58, + "grad_norm": 3.931807041168213, + "learning_rate": 9.45488014056813e-06, + "loss": 0.1845, + "step": 12604 + }, + { + "epoch": 1.58, + "grad_norm": 6.719529151916504, + "learning_rate": 9.454043425511443e-06, + "loss": 0.3294, + "step": 12605 + }, + { + "epoch": 1.58, + "grad_norm": 23.872106552124023, + "learning_rate": 9.453206710454755e-06, + "loss": 0.8509, + "step": 12606 + }, + { + "epoch": 1.58, + "grad_norm": 56.260189056396484, + "learning_rate": 9.452369995398067e-06, + "loss": 2.9153, + "step": 12607 + }, + { + "epoch": 1.58, + "grad_norm": 29.488616943359375, + "learning_rate": 9.451533280341381e-06, + "loss": 1.9577, + "step": 12608 + }, + { + "epoch": 1.58, + "grad_norm": 8.279169082641602, + "learning_rate": 9.450696565284693e-06, + "loss": 0.3277, + "step": 12609 + }, + { + "epoch": 1.58, + "grad_norm": 56.83429718017578, + "learning_rate": 9.449859850228007e-06, + "loss": 5.3126, + "step": 12610 + }, + { + "epoch": 1.58, + "grad_norm": 49.203041076660156, + "learning_rate": 9.449023135171319e-06, + "loss": 1.9299, + "step": 12611 + }, + { + "epoch": 1.58, + "grad_norm": 38.26557159423828, + "learning_rate": 9.44818642011463e-06, + "loss": 2.201, + "step": 12612 + }, + { + "epoch": 1.58, + "grad_norm": 6.391701698303223, + "learning_rate": 9.447349705057943e-06, + "loss": 0.5208, + "step": 12613 + }, + { + "epoch": 1.58, + "grad_norm": 26.8066349029541, + "learning_rate": 9.446512990001256e-06, + "loss": 1.7179, + "step": 12614 + }, + { + "epoch": 1.58, + "grad_norm": 20.002933502197266, + "learning_rate": 9.445676274944568e-06, + "loss": 1.5349, + "step": 12615 + }, + { + "epoch": 1.58, + "grad_norm": 20.30712890625, + "learning_rate": 9.44483955988788e-06, + "loss": 1.9082, + "step": 12616 + }, + { + "epoch": 1.58, + "grad_norm": 14.136492729187012, + "learning_rate": 9.444002844831194e-06, + "loss": 0.9949, + "step": 12617 + }, + { + "epoch": 1.58, + "grad_norm": 22.940683364868164, + "learning_rate": 9.443166129774506e-06, + "loss": 1.5033, + "step": 12618 + }, + { + "epoch": 1.58, + "grad_norm": 17.115327835083008, + "learning_rate": 9.44232941471782e-06, + "loss": 1.4698, + "step": 12619 + }, + { + "epoch": 1.58, + "grad_norm": 33.50325012207031, + "learning_rate": 9.441492699661131e-06, + "loss": 1.637, + "step": 12620 + }, + { + "epoch": 1.58, + "grad_norm": 38.30734634399414, + "learning_rate": 9.440655984604443e-06, + "loss": 1.4949, + "step": 12621 + }, + { + "epoch": 1.58, + "grad_norm": 5.381844520568848, + "learning_rate": 9.439819269547757e-06, + "loss": 0.3825, + "step": 12622 + }, + { + "epoch": 1.58, + "grad_norm": 8.267170906066895, + "learning_rate": 9.438982554491069e-06, + "loss": 0.4952, + "step": 12623 + }, + { + "epoch": 1.58, + "grad_norm": 25.081682205200195, + "learning_rate": 9.438145839434382e-06, + "loss": 1.7709, + "step": 12624 + }, + { + "epoch": 1.58, + "grad_norm": 3.4282147884368896, + "learning_rate": 9.437309124377694e-06, + "loss": 0.2464, + "step": 12625 + }, + { + "epoch": 1.58, + "grad_norm": 3.6459097862243652, + "learning_rate": 9.436472409321006e-06, + "loss": 0.2936, + "step": 12626 + }, + { + "epoch": 1.58, + "grad_norm": 25.131540298461914, + "learning_rate": 9.435635694264318e-06, + "loss": 0.7742, + "step": 12627 + }, + { + "epoch": 1.58, + "grad_norm": 13.032624244689941, + "learning_rate": 9.43479897920763e-06, + "loss": 1.2999, + "step": 12628 + }, + { + "epoch": 1.58, + "grad_norm": 13.023077011108398, + "learning_rate": 9.433962264150944e-06, + "loss": 1.355, + "step": 12629 + }, + { + "epoch": 1.59, + "grad_norm": 9.388630867004395, + "learning_rate": 9.433125549094256e-06, + "loss": 1.8461, + "step": 12630 + }, + { + "epoch": 1.59, + "grad_norm": 8.404267311096191, + "learning_rate": 9.43228883403757e-06, + "loss": 0.9715, + "step": 12631 + }, + { + "epoch": 1.59, + "grad_norm": 38.04804992675781, + "learning_rate": 9.431452118980882e-06, + "loss": 1.4528, + "step": 12632 + }, + { + "epoch": 1.59, + "grad_norm": 20.062524795532227, + "learning_rate": 9.430615403924195e-06, + "loss": 1.169, + "step": 12633 + }, + { + "epoch": 1.59, + "grad_norm": 186.04722595214844, + "learning_rate": 9.429778688867507e-06, + "loss": 1.8812, + "step": 12634 + }, + { + "epoch": 1.59, + "grad_norm": 8.48676586151123, + "learning_rate": 9.428941973810819e-06, + "loss": 1.0044, + "step": 12635 + }, + { + "epoch": 1.59, + "grad_norm": 4.133785724639893, + "learning_rate": 9.428105258754133e-06, + "loss": 1.0135, + "step": 12636 + }, + { + "epoch": 1.59, + "grad_norm": 19.21123504638672, + "learning_rate": 9.427268543697445e-06, + "loss": 0.8992, + "step": 12637 + }, + { + "epoch": 1.59, + "grad_norm": 7.901590824127197, + "learning_rate": 9.426431828640757e-06, + "loss": 0.7217, + "step": 12638 + }, + { + "epoch": 1.59, + "grad_norm": 21.584949493408203, + "learning_rate": 9.42559511358407e-06, + "loss": 1.4179, + "step": 12639 + }, + { + "epoch": 1.59, + "grad_norm": 10.24616813659668, + "learning_rate": 9.424758398527382e-06, + "loss": 0.419, + "step": 12640 + }, + { + "epoch": 1.59, + "grad_norm": 51.427574157714844, + "learning_rate": 9.423921683470694e-06, + "loss": 1.1515, + "step": 12641 + }, + { + "epoch": 1.59, + "grad_norm": 26.152320861816406, + "learning_rate": 9.423084968414006e-06, + "loss": 1.4699, + "step": 12642 + }, + { + "epoch": 1.59, + "grad_norm": 30.71882438659668, + "learning_rate": 9.42224825335732e-06, + "loss": 1.3176, + "step": 12643 + }, + { + "epoch": 1.59, + "grad_norm": 17.129243850708008, + "learning_rate": 9.421411538300632e-06, + "loss": 1.9202, + "step": 12644 + }, + { + "epoch": 1.59, + "grad_norm": 5.461491107940674, + "learning_rate": 9.420574823243945e-06, + "loss": 0.377, + "step": 12645 + }, + { + "epoch": 1.59, + "grad_norm": 90.69756317138672, + "learning_rate": 9.419738108187257e-06, + "loss": 1.5946, + "step": 12646 + }, + { + "epoch": 1.59, + "grad_norm": 11.628805160522461, + "learning_rate": 9.418901393130571e-06, + "loss": 1.3091, + "step": 12647 + }, + { + "epoch": 1.59, + "grad_norm": 17.364070892333984, + "learning_rate": 9.418064678073883e-06, + "loss": 1.3447, + "step": 12648 + }, + { + "epoch": 1.59, + "grad_norm": 14.625235557556152, + "learning_rate": 9.417227963017195e-06, + "loss": 1.1246, + "step": 12649 + }, + { + "epoch": 1.59, + "grad_norm": 22.807085037231445, + "learning_rate": 9.416391247960509e-06, + "loss": 0.6853, + "step": 12650 + }, + { + "epoch": 1.59, + "grad_norm": 3.4212002754211426, + "learning_rate": 9.41555453290382e-06, + "loss": 0.1706, + "step": 12651 + }, + { + "epoch": 1.59, + "grad_norm": 3.9466753005981445, + "learning_rate": 9.414717817847132e-06, + "loss": 0.3402, + "step": 12652 + }, + { + "epoch": 1.59, + "grad_norm": 7.102753162384033, + "learning_rate": 9.413881102790446e-06, + "loss": 0.689, + "step": 12653 + }, + { + "epoch": 1.59, + "grad_norm": 12.06006908416748, + "learning_rate": 9.413044387733758e-06, + "loss": 0.6928, + "step": 12654 + }, + { + "epoch": 1.59, + "grad_norm": 10.17564582824707, + "learning_rate": 9.41220767267707e-06, + "loss": 0.8857, + "step": 12655 + }, + { + "epoch": 1.59, + "grad_norm": 5.59531831741333, + "learning_rate": 9.411370957620382e-06, + "loss": 1.2905, + "step": 12656 + }, + { + "epoch": 1.59, + "grad_norm": 4.778902053833008, + "learning_rate": 9.410534242563696e-06, + "loss": 0.2996, + "step": 12657 + }, + { + "epoch": 1.59, + "grad_norm": 43.223758697509766, + "learning_rate": 9.409697527507008e-06, + "loss": 1.7344, + "step": 12658 + }, + { + "epoch": 1.59, + "grad_norm": 14.367379188537598, + "learning_rate": 9.408860812450321e-06, + "loss": 0.8148, + "step": 12659 + }, + { + "epoch": 1.59, + "grad_norm": 12.753751754760742, + "learning_rate": 9.408024097393633e-06, + "loss": 1.7903, + "step": 12660 + }, + { + "epoch": 1.59, + "grad_norm": 6.503912448883057, + "learning_rate": 9.407187382336947e-06, + "loss": 0.5994, + "step": 12661 + }, + { + "epoch": 1.59, + "grad_norm": 2.3146650791168213, + "learning_rate": 9.406350667280259e-06, + "loss": 0.0388, + "step": 12662 + }, + { + "epoch": 1.59, + "grad_norm": 12.779787063598633, + "learning_rate": 9.40551395222357e-06, + "loss": 1.737, + "step": 12663 + }, + { + "epoch": 1.59, + "grad_norm": 21.04336166381836, + "learning_rate": 9.404677237166884e-06, + "loss": 1.2667, + "step": 12664 + }, + { + "epoch": 1.59, + "grad_norm": 12.546812057495117, + "learning_rate": 9.403840522110196e-06, + "loss": 0.9729, + "step": 12665 + }, + { + "epoch": 1.59, + "grad_norm": 9.291451454162598, + "learning_rate": 9.403003807053508e-06, + "loss": 0.378, + "step": 12666 + }, + { + "epoch": 1.59, + "grad_norm": 2.457207441329956, + "learning_rate": 9.40216709199682e-06, + "loss": 0.0792, + "step": 12667 + }, + { + "epoch": 1.59, + "grad_norm": 13.121223449707031, + "learning_rate": 9.401330376940134e-06, + "loss": 0.7211, + "step": 12668 + }, + { + "epoch": 1.59, + "grad_norm": 63.11892318725586, + "learning_rate": 9.400493661883446e-06, + "loss": 1.0022, + "step": 12669 + }, + { + "epoch": 1.59, + "grad_norm": 6.179528713226318, + "learning_rate": 9.399656946826758e-06, + "loss": 0.3718, + "step": 12670 + }, + { + "epoch": 1.59, + "grad_norm": 83.38644409179688, + "learning_rate": 9.398820231770071e-06, + "loss": 2.4827, + "step": 12671 + }, + { + "epoch": 1.59, + "grad_norm": 50.963565826416016, + "learning_rate": 9.397983516713383e-06, + "loss": 3.134, + "step": 12672 + }, + { + "epoch": 1.59, + "grad_norm": 36.63359832763672, + "learning_rate": 9.397146801656697e-06, + "loss": 1.5886, + "step": 12673 + }, + { + "epoch": 1.59, + "grad_norm": 30.252046585083008, + "learning_rate": 9.396310086600009e-06, + "loss": 3.0493, + "step": 12674 + }, + { + "epoch": 1.59, + "grad_norm": 9.03751277923584, + "learning_rate": 9.395473371543323e-06, + "loss": 1.2414, + "step": 12675 + }, + { + "epoch": 1.59, + "grad_norm": 14.713725090026855, + "learning_rate": 9.394636656486635e-06, + "loss": 2.2616, + "step": 12676 + }, + { + "epoch": 1.59, + "grad_norm": 8.644331932067871, + "learning_rate": 9.393799941429947e-06, + "loss": 1.9946, + "step": 12677 + }, + { + "epoch": 1.59, + "grad_norm": 24.73609161376953, + "learning_rate": 9.39296322637326e-06, + "loss": 1.852, + "step": 12678 + }, + { + "epoch": 1.59, + "grad_norm": 23.22011375427246, + "learning_rate": 9.392126511316572e-06, + "loss": 1.1773, + "step": 12679 + }, + { + "epoch": 1.59, + "grad_norm": 7.167316913604736, + "learning_rate": 9.391289796259884e-06, + "loss": 0.0825, + "step": 12680 + }, + { + "epoch": 1.59, + "grad_norm": 20.045015335083008, + "learning_rate": 9.390453081203196e-06, + "loss": 2.0277, + "step": 12681 + }, + { + "epoch": 1.59, + "grad_norm": 50.51588439941406, + "learning_rate": 9.38961636614651e-06, + "loss": 1.8516, + "step": 12682 + }, + { + "epoch": 1.59, + "grad_norm": 41.1562614440918, + "learning_rate": 9.388779651089822e-06, + "loss": 0.3866, + "step": 12683 + }, + { + "epoch": 1.59, + "grad_norm": 13.490531921386719, + "learning_rate": 9.387942936033134e-06, + "loss": 1.4136, + "step": 12684 + }, + { + "epoch": 1.59, + "grad_norm": 15.883825302124023, + "learning_rate": 9.387106220976447e-06, + "loss": 1.1284, + "step": 12685 + }, + { + "epoch": 1.59, + "grad_norm": 11.150424003601074, + "learning_rate": 9.38626950591976e-06, + "loss": 0.5869, + "step": 12686 + }, + { + "epoch": 1.59, + "grad_norm": 10.384401321411133, + "learning_rate": 9.385432790863073e-06, + "loss": 2.0767, + "step": 12687 + }, + { + "epoch": 1.59, + "grad_norm": 52.81633377075195, + "learning_rate": 9.384596075806385e-06, + "loss": 1.0207, + "step": 12688 + }, + { + "epoch": 1.59, + "grad_norm": 10.21898078918457, + "learning_rate": 9.383759360749698e-06, + "loss": 1.3715, + "step": 12689 + }, + { + "epoch": 1.59, + "grad_norm": 105.48194122314453, + "learning_rate": 9.38292264569301e-06, + "loss": 1.9724, + "step": 12690 + }, + { + "epoch": 1.59, + "grad_norm": 19.446487426757812, + "learning_rate": 9.382085930636322e-06, + "loss": 1.4847, + "step": 12691 + }, + { + "epoch": 1.59, + "grad_norm": 12.214441299438477, + "learning_rate": 9.381249215579636e-06, + "loss": 0.5879, + "step": 12692 + }, + { + "epoch": 1.59, + "grad_norm": 410.44830322265625, + "learning_rate": 9.380412500522948e-06, + "loss": 1.2085, + "step": 12693 + }, + { + "epoch": 1.59, + "grad_norm": 32.169864654541016, + "learning_rate": 9.37957578546626e-06, + "loss": 1.2658, + "step": 12694 + }, + { + "epoch": 1.59, + "grad_norm": 18.428232192993164, + "learning_rate": 9.378739070409572e-06, + "loss": 1.5905, + "step": 12695 + }, + { + "epoch": 1.59, + "grad_norm": 45.33714294433594, + "learning_rate": 9.377902355352886e-06, + "loss": 1.3951, + "step": 12696 + }, + { + "epoch": 1.59, + "grad_norm": 17.632648468017578, + "learning_rate": 9.377065640296198e-06, + "loss": 1.3787, + "step": 12697 + }, + { + "epoch": 1.59, + "grad_norm": 7.390748500823975, + "learning_rate": 9.37622892523951e-06, + "loss": 0.3028, + "step": 12698 + }, + { + "epoch": 1.59, + "grad_norm": 60.25581741333008, + "learning_rate": 9.375392210182823e-06, + "loss": 2.4936, + "step": 12699 + }, + { + "epoch": 1.59, + "grad_norm": 43.981231689453125, + "learning_rate": 9.374555495126135e-06, + "loss": 0.7651, + "step": 12700 + }, + { + "epoch": 1.59, + "grad_norm": 11.389113426208496, + "learning_rate": 9.373718780069449e-06, + "loss": 0.4666, + "step": 12701 + }, + { + "epoch": 1.59, + "grad_norm": 6.8582377433776855, + "learning_rate": 9.37288206501276e-06, + "loss": 0.7208, + "step": 12702 + }, + { + "epoch": 1.59, + "grad_norm": 24.401554107666016, + "learning_rate": 9.372045349956074e-06, + "loss": 1.1998, + "step": 12703 + }, + { + "epoch": 1.59, + "grad_norm": 2.09210467338562, + "learning_rate": 9.371208634899386e-06, + "loss": 0.0478, + "step": 12704 + }, + { + "epoch": 1.59, + "grad_norm": 7.204529285430908, + "learning_rate": 9.370371919842698e-06, + "loss": 1.1222, + "step": 12705 + }, + { + "epoch": 1.59, + "grad_norm": 16.0089111328125, + "learning_rate": 9.369535204786012e-06, + "loss": 1.8902, + "step": 12706 + }, + { + "epoch": 1.59, + "grad_norm": 9.174287796020508, + "learning_rate": 9.368698489729324e-06, + "loss": 0.6599, + "step": 12707 + }, + { + "epoch": 1.59, + "grad_norm": 11.618083953857422, + "learning_rate": 9.367861774672636e-06, + "loss": 1.0567, + "step": 12708 + }, + { + "epoch": 1.59, + "grad_norm": 13.289433479309082, + "learning_rate": 9.367025059615948e-06, + "loss": 2.0885, + "step": 12709 + }, + { + "epoch": 1.6, + "grad_norm": 2.8225820064544678, + "learning_rate": 9.366188344559261e-06, + "loss": 0.8951, + "step": 12710 + }, + { + "epoch": 1.6, + "grad_norm": 6.53161096572876, + "learning_rate": 9.365351629502573e-06, + "loss": 0.2615, + "step": 12711 + }, + { + "epoch": 1.6, + "grad_norm": 14.11289119720459, + "learning_rate": 9.364514914445885e-06, + "loss": 0.9496, + "step": 12712 + }, + { + "epoch": 1.6, + "grad_norm": 89.39127349853516, + "learning_rate": 9.363678199389199e-06, + "loss": 0.3615, + "step": 12713 + }, + { + "epoch": 1.6, + "grad_norm": 12.24676513671875, + "learning_rate": 9.362841484332511e-06, + "loss": 0.607, + "step": 12714 + }, + { + "epoch": 1.6, + "grad_norm": 22.23177719116211, + "learning_rate": 9.362004769275825e-06, + "loss": 1.5199, + "step": 12715 + }, + { + "epoch": 1.6, + "grad_norm": 105.7651138305664, + "learning_rate": 9.361168054219137e-06, + "loss": 1.6218, + "step": 12716 + }, + { + "epoch": 1.6, + "grad_norm": 12.95954704284668, + "learning_rate": 9.36033133916245e-06, + "loss": 1.3081, + "step": 12717 + }, + { + "epoch": 1.6, + "grad_norm": 7.558269500732422, + "learning_rate": 9.359494624105762e-06, + "loss": 1.1606, + "step": 12718 + }, + { + "epoch": 1.6, + "grad_norm": 28.80192756652832, + "learning_rate": 9.358657909049074e-06, + "loss": 1.4126, + "step": 12719 + }, + { + "epoch": 1.6, + "grad_norm": 14.604012489318848, + "learning_rate": 9.357821193992386e-06, + "loss": 0.9703, + "step": 12720 + }, + { + "epoch": 1.6, + "grad_norm": 7.633918762207031, + "learning_rate": 9.3569844789357e-06, + "loss": 0.6268, + "step": 12721 + }, + { + "epoch": 1.6, + "grad_norm": 9.201777458190918, + "learning_rate": 9.356147763879012e-06, + "loss": 0.4496, + "step": 12722 + }, + { + "epoch": 1.6, + "grad_norm": 101.30696105957031, + "learning_rate": 9.355311048822324e-06, + "loss": 1.3412, + "step": 12723 + }, + { + "epoch": 1.6, + "grad_norm": 6.9050068855285645, + "learning_rate": 9.354474333765637e-06, + "loss": 0.7253, + "step": 12724 + }, + { + "epoch": 1.6, + "grad_norm": 14.176651954650879, + "learning_rate": 9.35363761870895e-06, + "loss": 2.8281, + "step": 12725 + }, + { + "epoch": 1.6, + "grad_norm": 20.2751522064209, + "learning_rate": 9.352800903652261e-06, + "loss": 0.6233, + "step": 12726 + }, + { + "epoch": 1.6, + "grad_norm": 6.783174991607666, + "learning_rate": 9.351964188595575e-06, + "loss": 0.1949, + "step": 12727 + }, + { + "epoch": 1.6, + "grad_norm": 21.548534393310547, + "learning_rate": 9.351127473538887e-06, + "loss": 1.0327, + "step": 12728 + }, + { + "epoch": 1.6, + "grad_norm": 18.70937728881836, + "learning_rate": 9.3502907584822e-06, + "loss": 1.1648, + "step": 12729 + }, + { + "epoch": 1.6, + "grad_norm": 10.633247375488281, + "learning_rate": 9.349454043425512e-06, + "loss": 2.1667, + "step": 12730 + }, + { + "epoch": 1.6, + "grad_norm": 42.88833236694336, + "learning_rate": 9.348617328368826e-06, + "loss": 1.4255, + "step": 12731 + }, + { + "epoch": 1.6, + "grad_norm": 15.978346824645996, + "learning_rate": 9.347780613312138e-06, + "loss": 0.6872, + "step": 12732 + }, + { + "epoch": 1.6, + "grad_norm": 9.080323219299316, + "learning_rate": 9.34694389825545e-06, + "loss": 0.5269, + "step": 12733 + }, + { + "epoch": 1.6, + "grad_norm": 10.514498710632324, + "learning_rate": 9.346107183198762e-06, + "loss": 1.2947, + "step": 12734 + }, + { + "epoch": 1.6, + "grad_norm": 7.079040050506592, + "learning_rate": 9.345270468142076e-06, + "loss": 0.8225, + "step": 12735 + }, + { + "epoch": 1.6, + "grad_norm": 11.065547943115234, + "learning_rate": 9.344433753085387e-06, + "loss": 1.3093, + "step": 12736 + }, + { + "epoch": 1.6, + "grad_norm": 14.474076271057129, + "learning_rate": 9.3435970380287e-06, + "loss": 0.6239, + "step": 12737 + }, + { + "epoch": 1.6, + "grad_norm": 25.227272033691406, + "learning_rate": 9.342760322972013e-06, + "loss": 1.5511, + "step": 12738 + }, + { + "epoch": 1.6, + "grad_norm": 36.528316497802734, + "learning_rate": 9.341923607915325e-06, + "loss": 0.987, + "step": 12739 + }, + { + "epoch": 1.6, + "grad_norm": 86.75886535644531, + "learning_rate": 9.341086892858637e-06, + "loss": 0.8112, + "step": 12740 + }, + { + "epoch": 1.6, + "grad_norm": 10.102453231811523, + "learning_rate": 9.34025017780195e-06, + "loss": 1.2799, + "step": 12741 + }, + { + "epoch": 1.6, + "grad_norm": 21.57074737548828, + "learning_rate": 9.339413462745263e-06, + "loss": 1.8643, + "step": 12742 + }, + { + "epoch": 1.6, + "grad_norm": 16.9290828704834, + "learning_rate": 9.338576747688576e-06, + "loss": 0.3976, + "step": 12743 + }, + { + "epoch": 1.6, + "grad_norm": 25.484235763549805, + "learning_rate": 9.337740032631888e-06, + "loss": 0.6294, + "step": 12744 + }, + { + "epoch": 1.6, + "grad_norm": 28.827713012695312, + "learning_rate": 9.336903317575202e-06, + "loss": 1.6037, + "step": 12745 + }, + { + "epoch": 1.6, + "grad_norm": 11.305099487304688, + "learning_rate": 9.336066602518514e-06, + "loss": 0.8748, + "step": 12746 + }, + { + "epoch": 1.6, + "grad_norm": 17.736371994018555, + "learning_rate": 9.335229887461826e-06, + "loss": 1.1183, + "step": 12747 + }, + { + "epoch": 1.6, + "grad_norm": 13.301612854003906, + "learning_rate": 9.334393172405138e-06, + "loss": 1.3793, + "step": 12748 + }, + { + "epoch": 1.6, + "grad_norm": 14.686193466186523, + "learning_rate": 9.33355645734845e-06, + "loss": 3.1688, + "step": 12749 + }, + { + "epoch": 1.6, + "grad_norm": 15.956156730651855, + "learning_rate": 9.332719742291763e-06, + "loss": 0.8889, + "step": 12750 + }, + { + "epoch": 1.6, + "grad_norm": 16.69364356994629, + "learning_rate": 9.331883027235075e-06, + "loss": 1.9165, + "step": 12751 + }, + { + "epoch": 1.6, + "grad_norm": 12.45902156829834, + "learning_rate": 9.331046312178389e-06, + "loss": 0.7861, + "step": 12752 + }, + { + "epoch": 1.6, + "grad_norm": 17.342580795288086, + "learning_rate": 9.330209597121701e-06, + "loss": 1.3061, + "step": 12753 + }, + { + "epoch": 1.6, + "grad_norm": 11.511950492858887, + "learning_rate": 9.329372882065013e-06, + "loss": 1.1508, + "step": 12754 + }, + { + "epoch": 1.6, + "grad_norm": 8.777467727661133, + "learning_rate": 9.328536167008326e-06, + "loss": 0.3007, + "step": 12755 + }, + { + "epoch": 1.6, + "grad_norm": 17.43426513671875, + "learning_rate": 9.327699451951638e-06, + "loss": 1.1306, + "step": 12756 + }, + { + "epoch": 1.6, + "grad_norm": 6.546712875366211, + "learning_rate": 9.326862736894952e-06, + "loss": 0.7657, + "step": 12757 + }, + { + "epoch": 1.6, + "grad_norm": 15.232168197631836, + "learning_rate": 9.326026021838264e-06, + "loss": 1.9775, + "step": 12758 + }, + { + "epoch": 1.6, + "grad_norm": 10.149893760681152, + "learning_rate": 9.325189306781576e-06, + "loss": 0.7729, + "step": 12759 + }, + { + "epoch": 1.6, + "grad_norm": 6.922850608825684, + "learning_rate": 9.32435259172489e-06, + "loss": 0.3599, + "step": 12760 + }, + { + "epoch": 1.6, + "grad_norm": 12.676969528198242, + "learning_rate": 9.323515876668202e-06, + "loss": 0.6969, + "step": 12761 + }, + { + "epoch": 1.6, + "grad_norm": 7.15059757232666, + "learning_rate": 9.322679161611514e-06, + "loss": 0.2292, + "step": 12762 + }, + { + "epoch": 1.6, + "grad_norm": 10.097254753112793, + "learning_rate": 9.321842446554826e-06, + "loss": 1.7266, + "step": 12763 + }, + { + "epoch": 1.6, + "grad_norm": 38.53457260131836, + "learning_rate": 9.321005731498139e-06, + "loss": 1.8251, + "step": 12764 + }, + { + "epoch": 1.6, + "grad_norm": 19.701183319091797, + "learning_rate": 9.320169016441451e-06, + "loss": 1.5037, + "step": 12765 + }, + { + "epoch": 1.6, + "grad_norm": 14.709261894226074, + "learning_rate": 9.319332301384763e-06, + "loss": 0.9079, + "step": 12766 + }, + { + "epoch": 1.6, + "grad_norm": 8.67050838470459, + "learning_rate": 9.318495586328077e-06, + "loss": 0.7052, + "step": 12767 + }, + { + "epoch": 1.6, + "grad_norm": 10.067826271057129, + "learning_rate": 9.317658871271389e-06, + "loss": 0.3172, + "step": 12768 + }, + { + "epoch": 1.6, + "grad_norm": 13.909753799438477, + "learning_rate": 9.316822156214702e-06, + "loss": 0.6805, + "step": 12769 + }, + { + "epoch": 1.6, + "grad_norm": 15.233804702758789, + "learning_rate": 9.315985441158014e-06, + "loss": 1.4905, + "step": 12770 + }, + { + "epoch": 1.6, + "grad_norm": 16.987150192260742, + "learning_rate": 9.315148726101328e-06, + "loss": 1.4539, + "step": 12771 + }, + { + "epoch": 1.6, + "grad_norm": 4.675383567810059, + "learning_rate": 9.31431201104464e-06, + "loss": 0.47, + "step": 12772 + }, + { + "epoch": 1.6, + "grad_norm": 6.47659158706665, + "learning_rate": 9.313475295987952e-06, + "loss": 0.3059, + "step": 12773 + }, + { + "epoch": 1.6, + "grad_norm": 11.951998710632324, + "learning_rate": 9.312638580931265e-06, + "loss": 1.4986, + "step": 12774 + }, + { + "epoch": 1.6, + "grad_norm": 25.695837020874023, + "learning_rate": 9.311801865874577e-06, + "loss": 1.4772, + "step": 12775 + }, + { + "epoch": 1.6, + "grad_norm": 20.428926467895508, + "learning_rate": 9.31096515081789e-06, + "loss": 1.0381, + "step": 12776 + }, + { + "epoch": 1.6, + "grad_norm": 12.581475257873535, + "learning_rate": 9.310128435761201e-06, + "loss": 0.7638, + "step": 12777 + }, + { + "epoch": 1.6, + "grad_norm": 12.591764450073242, + "learning_rate": 9.309291720704515e-06, + "loss": 1.7015, + "step": 12778 + }, + { + "epoch": 1.6, + "grad_norm": 20.13874626159668, + "learning_rate": 9.308455005647827e-06, + "loss": 1.1436, + "step": 12779 + }, + { + "epoch": 1.6, + "grad_norm": 27.49405860900879, + "learning_rate": 9.307618290591139e-06, + "loss": 0.7916, + "step": 12780 + }, + { + "epoch": 1.6, + "grad_norm": 73.70000457763672, + "learning_rate": 9.306781575534453e-06, + "loss": 2.4081, + "step": 12781 + }, + { + "epoch": 1.6, + "grad_norm": 9.437817573547363, + "learning_rate": 9.305944860477765e-06, + "loss": 0.3377, + "step": 12782 + }, + { + "epoch": 1.6, + "grad_norm": 23.251123428344727, + "learning_rate": 9.305108145421078e-06, + "loss": 0.8318, + "step": 12783 + }, + { + "epoch": 1.6, + "grad_norm": 8.730528831481934, + "learning_rate": 9.30427143036439e-06, + "loss": 0.7024, + "step": 12784 + }, + { + "epoch": 1.6, + "grad_norm": 22.202592849731445, + "learning_rate": 9.303434715307704e-06, + "loss": 1.2179, + "step": 12785 + }, + { + "epoch": 1.6, + "grad_norm": 11.280617713928223, + "learning_rate": 9.302598000251016e-06, + "loss": 1.2216, + "step": 12786 + }, + { + "epoch": 1.6, + "grad_norm": 66.64436340332031, + "learning_rate": 9.301761285194328e-06, + "loss": 1.0517, + "step": 12787 + }, + { + "epoch": 1.6, + "grad_norm": 8.43831729888916, + "learning_rate": 9.300924570137641e-06, + "loss": 1.3776, + "step": 12788 + }, + { + "epoch": 1.6, + "grad_norm": 18.07183074951172, + "learning_rate": 9.300087855080953e-06, + "loss": 1.8422, + "step": 12789 + }, + { + "epoch": 1.61, + "grad_norm": 24.829612731933594, + "learning_rate": 9.299251140024265e-06, + "loss": 1.9916, + "step": 12790 + }, + { + "epoch": 1.61, + "grad_norm": 4.854472637176514, + "learning_rate": 9.298414424967577e-06, + "loss": 0.3375, + "step": 12791 + }, + { + "epoch": 1.61, + "grad_norm": 8.297489166259766, + "learning_rate": 9.29757770991089e-06, + "loss": 1.4392, + "step": 12792 + }, + { + "epoch": 1.61, + "grad_norm": 6.604464530944824, + "learning_rate": 9.296740994854203e-06, + "loss": 0.2846, + "step": 12793 + }, + { + "epoch": 1.61, + "grad_norm": 16.88629150390625, + "learning_rate": 9.295904279797515e-06, + "loss": 1.2431, + "step": 12794 + }, + { + "epoch": 1.61, + "grad_norm": 35.367759704589844, + "learning_rate": 9.295067564740828e-06, + "loss": 1.9533, + "step": 12795 + }, + { + "epoch": 1.61, + "grad_norm": 20.0457763671875, + "learning_rate": 9.29423084968414e-06, + "loss": 0.9273, + "step": 12796 + }, + { + "epoch": 1.61, + "grad_norm": 30.456619262695312, + "learning_rate": 9.293394134627454e-06, + "loss": 1.9967, + "step": 12797 + }, + { + "epoch": 1.61, + "grad_norm": 31.33993911743164, + "learning_rate": 9.292557419570766e-06, + "loss": 2.6482, + "step": 12798 + }, + { + "epoch": 1.61, + "grad_norm": 12.192071914672852, + "learning_rate": 9.29172070451408e-06, + "loss": 0.5799, + "step": 12799 + }, + { + "epoch": 1.61, + "grad_norm": 12.041566848754883, + "learning_rate": 9.290883989457392e-06, + "loss": 0.5665, + "step": 12800 + }, + { + "epoch": 1.61, + "eval_loss": 0.08084025979042053, + "eval_runtime": 94.8744, + "eval_samples_per_second": 37.334, + "eval_steps_per_second": 37.334, + "step": 12800 + }, + { + "epoch": 1.61, + "grad_norm": 13.741267204284668, + "learning_rate": 9.290047274400704e-06, + "loss": 1.1971, + "step": 12801 + }, + { + "epoch": 1.61, + "grad_norm": 12.394692420959473, + "learning_rate": 9.289210559344015e-06, + "loss": 0.8088, + "step": 12802 + }, + { + "epoch": 1.61, + "grad_norm": 30.322452545166016, + "learning_rate": 9.288373844287329e-06, + "loss": 1.6056, + "step": 12803 + }, + { + "epoch": 1.61, + "grad_norm": 8.515654563903809, + "learning_rate": 9.287537129230641e-06, + "loss": 0.7914, + "step": 12804 + }, + { + "epoch": 1.61, + "grad_norm": 22.28693962097168, + "learning_rate": 9.286700414173953e-06, + "loss": 1.9402, + "step": 12805 + }, + { + "epoch": 1.61, + "grad_norm": 5.956439971923828, + "learning_rate": 9.285863699117267e-06, + "loss": 1.3111, + "step": 12806 + }, + { + "epoch": 1.61, + "grad_norm": 40.35560607910156, + "learning_rate": 9.285026984060579e-06, + "loss": 2.2955, + "step": 12807 + }, + { + "epoch": 1.61, + "grad_norm": 5.762033939361572, + "learning_rate": 9.28419026900389e-06, + "loss": 0.6368, + "step": 12808 + }, + { + "epoch": 1.61, + "grad_norm": 13.975659370422363, + "learning_rate": 9.283353553947204e-06, + "loss": 1.6104, + "step": 12809 + }, + { + "epoch": 1.61, + "grad_norm": 12.66861629486084, + "learning_rate": 9.282516838890516e-06, + "loss": 0.8433, + "step": 12810 + }, + { + "epoch": 1.61, + "grad_norm": 15.702493667602539, + "learning_rate": 9.28168012383383e-06, + "loss": 1.4411, + "step": 12811 + }, + { + "epoch": 1.61, + "grad_norm": 27.008512496948242, + "learning_rate": 9.280843408777142e-06, + "loss": 0.9404, + "step": 12812 + }, + { + "epoch": 1.61, + "grad_norm": 14.046055793762207, + "learning_rate": 9.280006693720455e-06, + "loss": 0.9495, + "step": 12813 + }, + { + "epoch": 1.61, + "grad_norm": 13.30699348449707, + "learning_rate": 9.279169978663767e-06, + "loss": 0.6348, + "step": 12814 + }, + { + "epoch": 1.61, + "grad_norm": 7.643478870391846, + "learning_rate": 9.27833326360708e-06, + "loss": 0.2463, + "step": 12815 + }, + { + "epoch": 1.61, + "grad_norm": 10.765046119689941, + "learning_rate": 9.277496548550391e-06, + "loss": 0.4739, + "step": 12816 + }, + { + "epoch": 1.61, + "grad_norm": 17.187000274658203, + "learning_rate": 9.276659833493705e-06, + "loss": 1.0207, + "step": 12817 + }, + { + "epoch": 1.61, + "grad_norm": 1.7416460514068604, + "learning_rate": 9.275823118437017e-06, + "loss": 0.0402, + "step": 12818 + }, + { + "epoch": 1.61, + "grad_norm": 23.240400314331055, + "learning_rate": 9.274986403380329e-06, + "loss": 0.8305, + "step": 12819 + }, + { + "epoch": 1.61, + "grad_norm": 7.226999759674072, + "learning_rate": 9.274149688323643e-06, + "loss": 0.949, + "step": 12820 + }, + { + "epoch": 1.61, + "grad_norm": 8.760710716247559, + "learning_rate": 9.273312973266954e-06, + "loss": 0.635, + "step": 12821 + }, + { + "epoch": 1.61, + "grad_norm": 9.73095989227295, + "learning_rate": 9.272476258210266e-06, + "loss": 0.5365, + "step": 12822 + }, + { + "epoch": 1.61, + "grad_norm": 15.786834716796875, + "learning_rate": 9.27163954315358e-06, + "loss": 0.5541, + "step": 12823 + }, + { + "epoch": 1.61, + "grad_norm": 6.006377220153809, + "learning_rate": 9.270802828096892e-06, + "loss": 0.4464, + "step": 12824 + }, + { + "epoch": 1.61, + "grad_norm": 48.15402603149414, + "learning_rate": 9.269966113040206e-06, + "loss": 1.9272, + "step": 12825 + }, + { + "epoch": 1.61, + "grad_norm": 20.68983268737793, + "learning_rate": 9.269129397983518e-06, + "loss": 1.3508, + "step": 12826 + }, + { + "epoch": 1.61, + "grad_norm": 12.626483917236328, + "learning_rate": 9.268292682926831e-06, + "loss": 1.1429, + "step": 12827 + }, + { + "epoch": 1.61, + "grad_norm": 14.831421852111816, + "learning_rate": 9.267455967870143e-06, + "loss": 0.8639, + "step": 12828 + }, + { + "epoch": 1.61, + "grad_norm": 11.292095184326172, + "learning_rate": 9.266619252813455e-06, + "loss": 0.8774, + "step": 12829 + }, + { + "epoch": 1.61, + "grad_norm": 9.60373592376709, + "learning_rate": 9.265782537756767e-06, + "loss": 1.5242, + "step": 12830 + }, + { + "epoch": 1.61, + "grad_norm": 27.918676376342773, + "learning_rate": 9.264945822700079e-06, + "loss": 2.0699, + "step": 12831 + }, + { + "epoch": 1.61, + "grad_norm": 5.252095699310303, + "learning_rate": 9.264109107643393e-06, + "loss": 0.2138, + "step": 12832 + }, + { + "epoch": 1.61, + "grad_norm": 23.539323806762695, + "learning_rate": 9.263272392586705e-06, + "loss": 1.0934, + "step": 12833 + }, + { + "epoch": 1.61, + "grad_norm": 22.99410629272461, + "learning_rate": 9.262435677530018e-06, + "loss": 2.6611, + "step": 12834 + }, + { + "epoch": 1.61, + "grad_norm": 8.666620254516602, + "learning_rate": 9.26159896247333e-06, + "loss": 0.3635, + "step": 12835 + }, + { + "epoch": 1.61, + "grad_norm": 14.20853042602539, + "learning_rate": 9.260762247416642e-06, + "loss": 0.7676, + "step": 12836 + }, + { + "epoch": 1.61, + "grad_norm": 5.007308483123779, + "learning_rate": 9.259925532359956e-06, + "loss": 0.3601, + "step": 12837 + }, + { + "epoch": 1.61, + "grad_norm": 13.300216674804688, + "learning_rate": 9.259088817303268e-06, + "loss": 1.1097, + "step": 12838 + }, + { + "epoch": 1.61, + "grad_norm": 29.382566452026367, + "learning_rate": 9.258252102246581e-06, + "loss": 1.257, + "step": 12839 + }, + { + "epoch": 1.61, + "grad_norm": 128.78793334960938, + "learning_rate": 9.257415387189893e-06, + "loss": 2.0364, + "step": 12840 + }, + { + "epoch": 1.61, + "grad_norm": 13.95974349975586, + "learning_rate": 9.256578672133207e-06, + "loss": 1.3131, + "step": 12841 + }, + { + "epoch": 1.61, + "grad_norm": 9.428278923034668, + "learning_rate": 9.255741957076519e-06, + "loss": 0.4572, + "step": 12842 + }, + { + "epoch": 1.61, + "grad_norm": 9.121671676635742, + "learning_rate": 9.254905242019831e-06, + "loss": 0.5315, + "step": 12843 + }, + { + "epoch": 1.61, + "grad_norm": 6.823456764221191, + "learning_rate": 9.254068526963143e-06, + "loss": 0.0958, + "step": 12844 + }, + { + "epoch": 1.61, + "grad_norm": 20.92385482788086, + "learning_rate": 9.253231811906455e-06, + "loss": 1.9386, + "step": 12845 + }, + { + "epoch": 1.61, + "grad_norm": 8.319921493530273, + "learning_rate": 9.252395096849769e-06, + "loss": 0.3208, + "step": 12846 + }, + { + "epoch": 1.61, + "grad_norm": 14.565032958984375, + "learning_rate": 9.25155838179308e-06, + "loss": 1.3546, + "step": 12847 + }, + { + "epoch": 1.61, + "grad_norm": 9.67790699005127, + "learning_rate": 9.250721666736394e-06, + "loss": 1.176, + "step": 12848 + }, + { + "epoch": 1.61, + "grad_norm": 18.494850158691406, + "learning_rate": 9.249884951679706e-06, + "loss": 2.5711, + "step": 12849 + }, + { + "epoch": 1.61, + "grad_norm": 44.53583908081055, + "learning_rate": 9.249048236623018e-06, + "loss": 1.2291, + "step": 12850 + }, + { + "epoch": 1.61, + "grad_norm": 21.787281036376953, + "learning_rate": 9.248211521566332e-06, + "loss": 1.064, + "step": 12851 + }, + { + "epoch": 1.61, + "grad_norm": 25.983638763427734, + "learning_rate": 9.247374806509644e-06, + "loss": 1.1076, + "step": 12852 + }, + { + "epoch": 1.61, + "grad_norm": 15.283540725708008, + "learning_rate": 9.246538091452957e-06, + "loss": 0.922, + "step": 12853 + }, + { + "epoch": 1.61, + "grad_norm": 23.032045364379883, + "learning_rate": 9.24570137639627e-06, + "loss": 1.5744, + "step": 12854 + }, + { + "epoch": 1.61, + "grad_norm": 16.80522918701172, + "learning_rate": 9.244864661339581e-06, + "loss": 0.8887, + "step": 12855 + }, + { + "epoch": 1.61, + "grad_norm": 14.449721336364746, + "learning_rate": 9.244027946282895e-06, + "loss": 0.625, + "step": 12856 + }, + { + "epoch": 1.61, + "grad_norm": 12.633468627929688, + "learning_rate": 9.243191231226207e-06, + "loss": 2.3903, + "step": 12857 + }, + { + "epoch": 1.61, + "grad_norm": 20.23297691345215, + "learning_rate": 9.242354516169519e-06, + "loss": 1.6446, + "step": 12858 + }, + { + "epoch": 1.61, + "grad_norm": 5.653713226318359, + "learning_rate": 9.24151780111283e-06, + "loss": 0.1913, + "step": 12859 + }, + { + "epoch": 1.61, + "grad_norm": 9.748419761657715, + "learning_rate": 9.240681086056144e-06, + "loss": 0.6558, + "step": 12860 + }, + { + "epoch": 1.61, + "grad_norm": 13.776673316955566, + "learning_rate": 9.239844370999456e-06, + "loss": 0.4821, + "step": 12861 + }, + { + "epoch": 1.61, + "grad_norm": 29.427501678466797, + "learning_rate": 9.23900765594277e-06, + "loss": 1.1989, + "step": 12862 + }, + { + "epoch": 1.61, + "grad_norm": 8.08488941192627, + "learning_rate": 9.238170940886082e-06, + "loss": 0.3813, + "step": 12863 + }, + { + "epoch": 1.61, + "grad_norm": 77.74174499511719, + "learning_rate": 9.237334225829394e-06, + "loss": 2.1877, + "step": 12864 + }, + { + "epoch": 1.61, + "grad_norm": 38.8492317199707, + "learning_rate": 9.236497510772708e-06, + "loss": 2.0237, + "step": 12865 + }, + { + "epoch": 1.61, + "grad_norm": 15.239294052124023, + "learning_rate": 9.23566079571602e-06, + "loss": 0.7539, + "step": 12866 + }, + { + "epoch": 1.61, + "grad_norm": 15.638760566711426, + "learning_rate": 9.234824080659333e-06, + "loss": 1.3254, + "step": 12867 + }, + { + "epoch": 1.61, + "grad_norm": 4.971253871917725, + "learning_rate": 9.233987365602645e-06, + "loss": 0.238, + "step": 12868 + }, + { + "epoch": 1.62, + "grad_norm": 44.882686614990234, + "learning_rate": 9.233150650545957e-06, + "loss": 1.2559, + "step": 12869 + }, + { + "epoch": 1.62, + "grad_norm": 8.837841033935547, + "learning_rate": 9.23231393548927e-06, + "loss": 1.9611, + "step": 12870 + }, + { + "epoch": 1.62, + "grad_norm": 8.826338768005371, + "learning_rate": 9.231477220432583e-06, + "loss": 1.5011, + "step": 12871 + }, + { + "epoch": 1.62, + "grad_norm": 12.538323402404785, + "learning_rate": 9.230640505375895e-06, + "loss": 1.4513, + "step": 12872 + }, + { + "epoch": 1.62, + "grad_norm": 9.48988151550293, + "learning_rate": 9.229803790319207e-06, + "loss": 2.127, + "step": 12873 + }, + { + "epoch": 1.62, + "grad_norm": 20.119808197021484, + "learning_rate": 9.22896707526252e-06, + "loss": 1.4636, + "step": 12874 + }, + { + "epoch": 1.62, + "grad_norm": 40.06005096435547, + "learning_rate": 9.228130360205832e-06, + "loss": 2.06, + "step": 12875 + }, + { + "epoch": 1.62, + "grad_norm": 7.540440082550049, + "learning_rate": 9.227293645149146e-06, + "loss": 1.0775, + "step": 12876 + }, + { + "epoch": 1.62, + "grad_norm": 21.5985050201416, + "learning_rate": 9.226456930092458e-06, + "loss": 1.5787, + "step": 12877 + }, + { + "epoch": 1.62, + "grad_norm": 23.678401947021484, + "learning_rate": 9.22562021503577e-06, + "loss": 1.7279, + "step": 12878 + }, + { + "epoch": 1.62, + "grad_norm": 16.571250915527344, + "learning_rate": 9.224783499979083e-06, + "loss": 0.9984, + "step": 12879 + }, + { + "epoch": 1.62, + "grad_norm": 16.851226806640625, + "learning_rate": 9.223946784922395e-06, + "loss": 2.244, + "step": 12880 + }, + { + "epoch": 1.62, + "grad_norm": 8.822027206420898, + "learning_rate": 9.223110069865709e-06, + "loss": 0.6714, + "step": 12881 + }, + { + "epoch": 1.62, + "grad_norm": 10.036003112792969, + "learning_rate": 9.222273354809021e-06, + "loss": 1.1554, + "step": 12882 + }, + { + "epoch": 1.62, + "grad_norm": 6.548466205596924, + "learning_rate": 9.221436639752333e-06, + "loss": 0.1496, + "step": 12883 + }, + { + "epoch": 1.62, + "grad_norm": 14.927818298339844, + "learning_rate": 9.220599924695645e-06, + "loss": 2.1248, + "step": 12884 + }, + { + "epoch": 1.62, + "grad_norm": 35.90271759033203, + "learning_rate": 9.219763209638959e-06, + "loss": 1.7514, + "step": 12885 + }, + { + "epoch": 1.62, + "grad_norm": 21.67229652404785, + "learning_rate": 9.21892649458227e-06, + "loss": 1.4736, + "step": 12886 + }, + { + "epoch": 1.62, + "grad_norm": 9.010519027709961, + "learning_rate": 9.218089779525582e-06, + "loss": 1.2694, + "step": 12887 + }, + { + "epoch": 1.62, + "grad_norm": 11.781586647033691, + "learning_rate": 9.217253064468896e-06, + "loss": 0.6613, + "step": 12888 + }, + { + "epoch": 1.62, + "grad_norm": 19.111835479736328, + "learning_rate": 9.216416349412208e-06, + "loss": 3.3232, + "step": 12889 + }, + { + "epoch": 1.62, + "grad_norm": 10.765817642211914, + "learning_rate": 9.215579634355522e-06, + "loss": 0.7193, + "step": 12890 + }, + { + "epoch": 1.62, + "grad_norm": 23.58574867248535, + "learning_rate": 9.214742919298834e-06, + "loss": 1.5909, + "step": 12891 + }, + { + "epoch": 1.62, + "grad_norm": 7.807656288146973, + "learning_rate": 9.213906204242146e-06, + "loss": 0.471, + "step": 12892 + }, + { + "epoch": 1.62, + "grad_norm": 13.614156723022461, + "learning_rate": 9.21306948918546e-06, + "loss": 0.6731, + "step": 12893 + }, + { + "epoch": 1.62, + "grad_norm": 12.069944381713867, + "learning_rate": 9.212232774128771e-06, + "loss": 1.1097, + "step": 12894 + }, + { + "epoch": 1.62, + "grad_norm": 10.385514259338379, + "learning_rate": 9.211396059072085e-06, + "loss": 0.5647, + "step": 12895 + }, + { + "epoch": 1.62, + "grad_norm": 9.53537368774414, + "learning_rate": 9.210559344015397e-06, + "loss": 0.4047, + "step": 12896 + }, + { + "epoch": 1.62, + "grad_norm": 11.166543006896973, + "learning_rate": 9.209722628958709e-06, + "loss": 0.3255, + "step": 12897 + }, + { + "epoch": 1.62, + "grad_norm": 7.068735599517822, + "learning_rate": 9.20888591390202e-06, + "loss": 0.3223, + "step": 12898 + }, + { + "epoch": 1.62, + "grad_norm": 14.783595085144043, + "learning_rate": 9.208049198845334e-06, + "loss": 0.877, + "step": 12899 + }, + { + "epoch": 1.62, + "grad_norm": 22.187023162841797, + "learning_rate": 9.207212483788646e-06, + "loss": 2.1927, + "step": 12900 + }, + { + "epoch": 1.62, + "grad_norm": 8.431422233581543, + "learning_rate": 9.206375768731958e-06, + "loss": 0.9181, + "step": 12901 + }, + { + "epoch": 1.62, + "grad_norm": 103.34065246582031, + "learning_rate": 9.205539053675272e-06, + "loss": 2.653, + "step": 12902 + }, + { + "epoch": 1.62, + "grad_norm": 13.142014503479004, + "learning_rate": 9.204702338618584e-06, + "loss": 1.3479, + "step": 12903 + }, + { + "epoch": 1.62, + "grad_norm": 15.372876167297363, + "learning_rate": 9.203865623561898e-06, + "loss": 2.6061, + "step": 12904 + }, + { + "epoch": 1.62, + "grad_norm": 12.46223258972168, + "learning_rate": 9.20302890850521e-06, + "loss": 0.6639, + "step": 12905 + }, + { + "epoch": 1.62, + "grad_norm": 115.90477752685547, + "learning_rate": 9.202192193448521e-06, + "loss": 1.2725, + "step": 12906 + }, + { + "epoch": 1.62, + "grad_norm": 5.032820224761963, + "learning_rate": 9.201355478391835e-06, + "loss": 0.2912, + "step": 12907 + }, + { + "epoch": 1.62, + "grad_norm": 39.1680908203125, + "learning_rate": 9.200518763335147e-06, + "loss": 1.3619, + "step": 12908 + }, + { + "epoch": 1.62, + "grad_norm": 17.965621948242188, + "learning_rate": 9.19968204827846e-06, + "loss": 0.9468, + "step": 12909 + }, + { + "epoch": 1.62, + "grad_norm": 18.22383689880371, + "learning_rate": 9.198845333221773e-06, + "loss": 2.1563, + "step": 12910 + }, + { + "epoch": 1.62, + "grad_norm": 31.021142959594727, + "learning_rate": 9.198008618165085e-06, + "loss": 1.0991, + "step": 12911 + }, + { + "epoch": 1.62, + "grad_norm": 18.455698013305664, + "learning_rate": 9.197171903108397e-06, + "loss": 2.0503, + "step": 12912 + }, + { + "epoch": 1.62, + "grad_norm": 42.6202507019043, + "learning_rate": 9.196335188051709e-06, + "loss": 1.5054, + "step": 12913 + }, + { + "epoch": 1.62, + "grad_norm": 15.459112167358398, + "learning_rate": 9.195498472995022e-06, + "loss": 1.3822, + "step": 12914 + }, + { + "epoch": 1.62, + "grad_norm": 18.592182159423828, + "learning_rate": 9.194661757938334e-06, + "loss": 0.5478, + "step": 12915 + }, + { + "epoch": 1.62, + "grad_norm": 29.801151275634766, + "learning_rate": 9.193825042881648e-06, + "loss": 2.387, + "step": 12916 + }, + { + "epoch": 1.62, + "grad_norm": 63.17427444458008, + "learning_rate": 9.19298832782496e-06, + "loss": 3.607, + "step": 12917 + }, + { + "epoch": 1.62, + "grad_norm": 15.072610855102539, + "learning_rate": 9.192151612768273e-06, + "loss": 1.1563, + "step": 12918 + }, + { + "epoch": 1.62, + "grad_norm": 33.61115264892578, + "learning_rate": 9.191314897711585e-06, + "loss": 1.1586, + "step": 12919 + }, + { + "epoch": 1.62, + "grad_norm": 15.370922088623047, + "learning_rate": 9.190478182654897e-06, + "loss": 1.1077, + "step": 12920 + }, + { + "epoch": 1.62, + "grad_norm": 25.674022674560547, + "learning_rate": 9.189641467598211e-06, + "loss": 1.09, + "step": 12921 + }, + { + "epoch": 1.62, + "grad_norm": 5.827080726623535, + "learning_rate": 9.188804752541523e-06, + "loss": 0.3103, + "step": 12922 + }, + { + "epoch": 1.62, + "grad_norm": 21.401575088500977, + "learning_rate": 9.187968037484837e-06, + "loss": 1.5182, + "step": 12923 + }, + { + "epoch": 1.62, + "grad_norm": 9.65243148803711, + "learning_rate": 9.187131322428148e-06, + "loss": 1.3113, + "step": 12924 + }, + { + "epoch": 1.62, + "grad_norm": 10.55691909790039, + "learning_rate": 9.18629460737146e-06, + "loss": 2.3306, + "step": 12925 + }, + { + "epoch": 1.62, + "grad_norm": 4.405165672302246, + "learning_rate": 9.185457892314772e-06, + "loss": 0.148, + "step": 12926 + }, + { + "epoch": 1.62, + "grad_norm": 6.446748733520508, + "learning_rate": 9.184621177258084e-06, + "loss": 0.5479, + "step": 12927 + }, + { + "epoch": 1.62, + "grad_norm": 11.645052909851074, + "learning_rate": 9.183784462201398e-06, + "loss": 1.109, + "step": 12928 + }, + { + "epoch": 1.62, + "grad_norm": 11.832785606384277, + "learning_rate": 9.18294774714471e-06, + "loss": 0.9107, + "step": 12929 + }, + { + "epoch": 1.62, + "grad_norm": 8.79918384552002, + "learning_rate": 9.182111032088024e-06, + "loss": 1.1871, + "step": 12930 + }, + { + "epoch": 1.62, + "grad_norm": 11.487600326538086, + "learning_rate": 9.181274317031336e-06, + "loss": 0.3511, + "step": 12931 + }, + { + "epoch": 1.62, + "grad_norm": 11.59465503692627, + "learning_rate": 9.18043760197465e-06, + "loss": 0.4442, + "step": 12932 + }, + { + "epoch": 1.62, + "grad_norm": 2.8561928272247314, + "learning_rate": 9.179600886917961e-06, + "loss": 0.1065, + "step": 12933 + }, + { + "epoch": 1.62, + "grad_norm": 16.970701217651367, + "learning_rate": 9.178764171861273e-06, + "loss": 1.8206, + "step": 12934 + }, + { + "epoch": 1.62, + "grad_norm": 25.955537796020508, + "learning_rate": 9.177927456804587e-06, + "loss": 3.0464, + "step": 12935 + }, + { + "epoch": 1.62, + "grad_norm": 190.4348602294922, + "learning_rate": 9.177090741747899e-06, + "loss": 0.8961, + "step": 12936 + }, + { + "epoch": 1.62, + "grad_norm": 8.63735580444336, + "learning_rate": 9.17625402669121e-06, + "loss": 0.3384, + "step": 12937 + }, + { + "epoch": 1.62, + "grad_norm": 75.52478790283203, + "learning_rate": 9.175417311634524e-06, + "loss": 0.972, + "step": 12938 + }, + { + "epoch": 1.62, + "grad_norm": 12.855717658996582, + "learning_rate": 9.174580596577836e-06, + "loss": 1.4988, + "step": 12939 + }, + { + "epoch": 1.62, + "grad_norm": 61.936988830566406, + "learning_rate": 9.173743881521148e-06, + "loss": 2.4051, + "step": 12940 + }, + { + "epoch": 1.62, + "grad_norm": 20.956762313842773, + "learning_rate": 9.17290716646446e-06, + "loss": 0.7292, + "step": 12941 + }, + { + "epoch": 1.62, + "grad_norm": 39.11030960083008, + "learning_rate": 9.172070451407774e-06, + "loss": 0.7878, + "step": 12942 + }, + { + "epoch": 1.62, + "grad_norm": 12.907938003540039, + "learning_rate": 9.171233736351086e-06, + "loss": 1.0599, + "step": 12943 + }, + { + "epoch": 1.62, + "grad_norm": 7.939062118530273, + "learning_rate": 9.1703970212944e-06, + "loss": 0.399, + "step": 12944 + }, + { + "epoch": 1.62, + "grad_norm": 12.606284141540527, + "learning_rate": 9.169560306237711e-06, + "loss": 1.4578, + "step": 12945 + }, + { + "epoch": 1.62, + "grad_norm": 28.248008728027344, + "learning_rate": 9.168723591181023e-06, + "loss": 1.1074, + "step": 12946 + }, + { + "epoch": 1.62, + "grad_norm": 12.51881217956543, + "learning_rate": 9.167886876124337e-06, + "loss": 1.7508, + "step": 12947 + }, + { + "epoch": 1.62, + "grad_norm": 6.950279235839844, + "learning_rate": 9.167050161067649e-06, + "loss": 0.6877, + "step": 12948 + }, + { + "epoch": 1.63, + "grad_norm": 5.21493673324585, + "learning_rate": 9.166213446010963e-06, + "loss": 0.6966, + "step": 12949 + }, + { + "epoch": 1.63, + "grad_norm": 61.48523712158203, + "learning_rate": 9.165376730954275e-06, + "loss": 1.9934, + "step": 12950 + }, + { + "epoch": 1.63, + "grad_norm": 28.346878051757812, + "learning_rate": 9.164540015897587e-06, + "loss": 1.3782, + "step": 12951 + }, + { + "epoch": 1.63, + "grad_norm": 14.001502990722656, + "learning_rate": 9.1637033008409e-06, + "loss": 1.5156, + "step": 12952 + }, + { + "epoch": 1.63, + "grad_norm": 25.5045108795166, + "learning_rate": 9.162866585784212e-06, + "loss": 2.0468, + "step": 12953 + }, + { + "epoch": 1.63, + "grad_norm": 16.51694679260254, + "learning_rate": 9.162029870727524e-06, + "loss": 1.3583, + "step": 12954 + }, + { + "epoch": 1.63, + "grad_norm": 17.545406341552734, + "learning_rate": 9.161193155670836e-06, + "loss": 0.8695, + "step": 12955 + }, + { + "epoch": 1.63, + "grad_norm": 5.469407081604004, + "learning_rate": 9.16035644061415e-06, + "loss": 0.3411, + "step": 12956 + }, + { + "epoch": 1.63, + "grad_norm": 9.810575485229492, + "learning_rate": 9.159519725557462e-06, + "loss": 0.8654, + "step": 12957 + }, + { + "epoch": 1.63, + "grad_norm": 8.806991577148438, + "learning_rate": 9.158683010500775e-06, + "loss": 0.2469, + "step": 12958 + }, + { + "epoch": 1.63, + "grad_norm": 12.568473815917969, + "learning_rate": 9.157846295444087e-06, + "loss": 2.249, + "step": 12959 + }, + { + "epoch": 1.63, + "grad_norm": 7.245036602020264, + "learning_rate": 9.1570095803874e-06, + "loss": 0.8974, + "step": 12960 + }, + { + "epoch": 1.63, + "grad_norm": 21.185012817382812, + "learning_rate": 9.156172865330713e-06, + "loss": 1.611, + "step": 12961 + }, + { + "epoch": 1.63, + "grad_norm": 7.256817817687988, + "learning_rate": 9.155336150274025e-06, + "loss": 0.4137, + "step": 12962 + }, + { + "epoch": 1.63, + "grad_norm": 6.60871696472168, + "learning_rate": 9.154499435217338e-06, + "loss": 0.8335, + "step": 12963 + }, + { + "epoch": 1.63, + "grad_norm": 16.758840560913086, + "learning_rate": 9.15366272016065e-06, + "loss": 1.1956, + "step": 12964 + }, + { + "epoch": 1.63, + "grad_norm": 8.283269882202148, + "learning_rate": 9.152826005103962e-06, + "loss": 0.757, + "step": 12965 + }, + { + "epoch": 1.63, + "grad_norm": 5.587263584136963, + "learning_rate": 9.151989290047274e-06, + "loss": 0.3193, + "step": 12966 + }, + { + "epoch": 1.63, + "grad_norm": 15.995939254760742, + "learning_rate": 9.151152574990588e-06, + "loss": 0.7247, + "step": 12967 + }, + { + "epoch": 1.63, + "grad_norm": 5.188906192779541, + "learning_rate": 9.1503158599339e-06, + "loss": 0.3046, + "step": 12968 + }, + { + "epoch": 1.63, + "grad_norm": 19.06035614013672, + "learning_rate": 9.149479144877212e-06, + "loss": 0.7589, + "step": 12969 + }, + { + "epoch": 1.63, + "grad_norm": 28.602251052856445, + "learning_rate": 9.148642429820526e-06, + "loss": 0.3866, + "step": 12970 + }, + { + "epoch": 1.63, + "grad_norm": 10.895336151123047, + "learning_rate": 9.147805714763837e-06, + "loss": 0.678, + "step": 12971 + }, + { + "epoch": 1.63, + "grad_norm": 23.167448043823242, + "learning_rate": 9.146968999707151e-06, + "loss": 1.1801, + "step": 12972 + }, + { + "epoch": 1.63, + "grad_norm": 21.483211517333984, + "learning_rate": 9.146132284650463e-06, + "loss": 0.7266, + "step": 12973 + }, + { + "epoch": 1.63, + "grad_norm": 19.09107208251953, + "learning_rate": 9.145295569593775e-06, + "loss": 0.7794, + "step": 12974 + }, + { + "epoch": 1.63, + "grad_norm": 15.3024263381958, + "learning_rate": 9.144458854537089e-06, + "loss": 1.7888, + "step": 12975 + }, + { + "epoch": 1.63, + "grad_norm": 15.789032936096191, + "learning_rate": 9.1436221394804e-06, + "loss": 1.2744, + "step": 12976 + }, + { + "epoch": 1.63, + "grad_norm": 6.284669399261475, + "learning_rate": 9.142785424423714e-06, + "loss": 2.9806, + "step": 12977 + }, + { + "epoch": 1.63, + "grad_norm": 15.586581230163574, + "learning_rate": 9.141948709367026e-06, + "loss": 0.7945, + "step": 12978 + }, + { + "epoch": 1.63, + "grad_norm": 16.861600875854492, + "learning_rate": 9.141111994310338e-06, + "loss": 1.2064, + "step": 12979 + }, + { + "epoch": 1.63, + "grad_norm": 14.14163589477539, + "learning_rate": 9.14027527925365e-06, + "loss": 0.5206, + "step": 12980 + }, + { + "epoch": 1.63, + "grad_norm": 11.704914093017578, + "learning_rate": 9.139438564196964e-06, + "loss": 0.773, + "step": 12981 + }, + { + "epoch": 1.63, + "grad_norm": 54.307395935058594, + "learning_rate": 9.138601849140276e-06, + "loss": 2.7865, + "step": 12982 + }, + { + "epoch": 1.63, + "grad_norm": 16.4180965423584, + "learning_rate": 9.137765134083588e-06, + "loss": 1.4351, + "step": 12983 + }, + { + "epoch": 1.63, + "grad_norm": 7.057581901550293, + "learning_rate": 9.136928419026901e-06, + "loss": 0.3731, + "step": 12984 + }, + { + "epoch": 1.63, + "grad_norm": 29.563465118408203, + "learning_rate": 9.136091703970213e-06, + "loss": 1.1386, + "step": 12985 + }, + { + "epoch": 1.63, + "grad_norm": 8.984923362731934, + "learning_rate": 9.135254988913527e-06, + "loss": 0.3085, + "step": 12986 + }, + { + "epoch": 1.63, + "grad_norm": 4.832681179046631, + "learning_rate": 9.134418273856839e-06, + "loss": 0.5914, + "step": 12987 + }, + { + "epoch": 1.63, + "grad_norm": 7.778252601623535, + "learning_rate": 9.133581558800151e-06, + "loss": 0.347, + "step": 12988 + }, + { + "epoch": 1.63, + "grad_norm": 13.914613723754883, + "learning_rate": 9.132744843743464e-06, + "loss": 2.0648, + "step": 12989 + }, + { + "epoch": 1.63, + "grad_norm": 5.425278663635254, + "learning_rate": 9.131908128686776e-06, + "loss": 1.4011, + "step": 12990 + }, + { + "epoch": 1.63, + "grad_norm": 13.942360877990723, + "learning_rate": 9.13107141363009e-06, + "loss": 1.6094, + "step": 12991 + }, + { + "epoch": 1.63, + "grad_norm": 5.301091194152832, + "learning_rate": 9.130234698573402e-06, + "loss": 1.8374, + "step": 12992 + }, + { + "epoch": 1.63, + "grad_norm": 19.2606201171875, + "learning_rate": 9.129397983516714e-06, + "loss": 1.2653, + "step": 12993 + }, + { + "epoch": 1.63, + "grad_norm": 13.116103172302246, + "learning_rate": 9.128561268460026e-06, + "loss": 0.5151, + "step": 12994 + }, + { + "epoch": 1.63, + "grad_norm": 10.208422660827637, + "learning_rate": 9.127724553403338e-06, + "loss": 2.3458, + "step": 12995 + }, + { + "epoch": 1.63, + "grad_norm": 55.8681640625, + "learning_rate": 9.126887838346652e-06, + "loss": 1.2972, + "step": 12996 + }, + { + "epoch": 1.63, + "grad_norm": 8.729084014892578, + "learning_rate": 9.126051123289964e-06, + "loss": 1.3225, + "step": 12997 + }, + { + "epoch": 1.63, + "grad_norm": 11.963275909423828, + "learning_rate": 9.125214408233277e-06, + "loss": 0.9134, + "step": 12998 + }, + { + "epoch": 1.63, + "grad_norm": 18.510528564453125, + "learning_rate": 9.124377693176589e-06, + "loss": 1.1661, + "step": 12999 + }, + { + "epoch": 1.63, + "grad_norm": 6.595053672790527, + "learning_rate": 9.123540978119903e-06, + "loss": 0.5564, + "step": 13000 + }, + { + "epoch": 1.63, + "grad_norm": 11.768023490905762, + "learning_rate": 9.122704263063215e-06, + "loss": 0.7828, + "step": 13001 + }, + { + "epoch": 1.63, + "grad_norm": 11.766725540161133, + "learning_rate": 9.121867548006527e-06, + "loss": 1.9123, + "step": 13002 + }, + { + "epoch": 1.63, + "grad_norm": 9.593929290771484, + "learning_rate": 9.12103083294984e-06, + "loss": 0.445, + "step": 13003 + }, + { + "epoch": 1.63, + "grad_norm": 35.650943756103516, + "learning_rate": 9.120194117893152e-06, + "loss": 2.4764, + "step": 13004 + }, + { + "epoch": 1.63, + "grad_norm": 2.9402058124542236, + "learning_rate": 9.119357402836464e-06, + "loss": 0.1609, + "step": 13005 + }, + { + "epoch": 1.63, + "grad_norm": 28.445289611816406, + "learning_rate": 9.118520687779778e-06, + "loss": 2.7993, + "step": 13006 + }, + { + "epoch": 1.63, + "grad_norm": 9.816569328308105, + "learning_rate": 9.11768397272309e-06, + "loss": 0.4052, + "step": 13007 + }, + { + "epoch": 1.63, + "grad_norm": 150.9856719970703, + "learning_rate": 9.116847257666402e-06, + "loss": 1.4806, + "step": 13008 + }, + { + "epoch": 1.63, + "grad_norm": 16.420270919799805, + "learning_rate": 9.116010542609714e-06, + "loss": 0.8386, + "step": 13009 + }, + { + "epoch": 1.63, + "grad_norm": 7.546241760253906, + "learning_rate": 9.115173827553027e-06, + "loss": 0.4028, + "step": 13010 + }, + { + "epoch": 1.63, + "grad_norm": 13.879183769226074, + "learning_rate": 9.11433711249634e-06, + "loss": 0.8947, + "step": 13011 + }, + { + "epoch": 1.63, + "grad_norm": 54.53948211669922, + "learning_rate": 9.113500397439653e-06, + "loss": 1.996, + "step": 13012 + }, + { + "epoch": 1.63, + "grad_norm": 18.00624656677246, + "learning_rate": 9.112663682382965e-06, + "loss": 1.6993, + "step": 13013 + }, + { + "epoch": 1.63, + "grad_norm": 15.442643165588379, + "learning_rate": 9.111826967326279e-06, + "loss": 1.7874, + "step": 13014 + }, + { + "epoch": 1.63, + "grad_norm": 46.516849517822266, + "learning_rate": 9.11099025226959e-06, + "loss": 1.399, + "step": 13015 + }, + { + "epoch": 1.63, + "grad_norm": 75.43692779541016, + "learning_rate": 9.110153537212903e-06, + "loss": 1.3915, + "step": 13016 + }, + { + "epoch": 1.63, + "grad_norm": 7.721510887145996, + "learning_rate": 9.109316822156216e-06, + "loss": 0.177, + "step": 13017 + }, + { + "epoch": 1.63, + "grad_norm": 11.118610382080078, + "learning_rate": 9.108480107099528e-06, + "loss": 1.1014, + "step": 13018 + }, + { + "epoch": 1.63, + "grad_norm": 89.78044891357422, + "learning_rate": 9.10764339204284e-06, + "loss": 0.9686, + "step": 13019 + }, + { + "epoch": 1.63, + "grad_norm": 6.995342254638672, + "learning_rate": 9.106806676986154e-06, + "loss": 2.3016, + "step": 13020 + }, + { + "epoch": 1.63, + "grad_norm": 13.39775562286377, + "learning_rate": 9.105969961929466e-06, + "loss": 1.4211, + "step": 13021 + }, + { + "epoch": 1.63, + "grad_norm": 24.990713119506836, + "learning_rate": 9.105133246872778e-06, + "loss": 1.2749, + "step": 13022 + }, + { + "epoch": 1.63, + "grad_norm": 5.517844200134277, + "learning_rate": 9.10429653181609e-06, + "loss": 0.2187, + "step": 13023 + }, + { + "epoch": 1.63, + "grad_norm": 10.690154075622559, + "learning_rate": 9.103459816759403e-06, + "loss": 0.395, + "step": 13024 + }, + { + "epoch": 1.63, + "grad_norm": 18.35173988342285, + "learning_rate": 9.102623101702715e-06, + "loss": 0.6232, + "step": 13025 + }, + { + "epoch": 1.63, + "grad_norm": 101.38148498535156, + "learning_rate": 9.101786386646029e-06, + "loss": 0.7097, + "step": 13026 + }, + { + "epoch": 1.63, + "grad_norm": 17.962690353393555, + "learning_rate": 9.10094967158934e-06, + "loss": 0.834, + "step": 13027 + }, + { + "epoch": 1.63, + "grad_norm": 20.415565490722656, + "learning_rate": 9.100112956532654e-06, + "loss": 1.1577, + "step": 13028 + }, + { + "epoch": 1.64, + "grad_norm": 14.006248474121094, + "learning_rate": 9.099276241475966e-06, + "loss": 0.7075, + "step": 13029 + }, + { + "epoch": 1.64, + "grad_norm": 13.301826477050781, + "learning_rate": 9.098439526419278e-06, + "loss": 0.5778, + "step": 13030 + }, + { + "epoch": 1.64, + "grad_norm": 9.223112106323242, + "learning_rate": 9.097602811362592e-06, + "loss": 0.6171, + "step": 13031 + }, + { + "epoch": 1.64, + "grad_norm": 20.63963508605957, + "learning_rate": 9.096766096305904e-06, + "loss": 1.2354, + "step": 13032 + }, + { + "epoch": 1.64, + "grad_norm": 16.198240280151367, + "learning_rate": 9.095929381249216e-06, + "loss": 1.3007, + "step": 13033 + }, + { + "epoch": 1.64, + "grad_norm": 11.846941947937012, + "learning_rate": 9.09509266619253e-06, + "loss": 0.9787, + "step": 13034 + }, + { + "epoch": 1.64, + "grad_norm": 6.430784225463867, + "learning_rate": 9.094255951135842e-06, + "loss": 0.2251, + "step": 13035 + }, + { + "epoch": 1.64, + "grad_norm": 12.79133415222168, + "learning_rate": 9.093419236079153e-06, + "loss": 1.9349, + "step": 13036 + }, + { + "epoch": 1.64, + "grad_norm": 30.484493255615234, + "learning_rate": 9.092582521022465e-06, + "loss": 1.7943, + "step": 13037 + }, + { + "epoch": 1.64, + "grad_norm": 47.64484405517578, + "learning_rate": 9.091745805965779e-06, + "loss": 1.2628, + "step": 13038 + }, + { + "epoch": 1.64, + "grad_norm": 15.437004089355469, + "learning_rate": 9.090909090909091e-06, + "loss": 1.1696, + "step": 13039 + }, + { + "epoch": 1.64, + "grad_norm": 4.615384101867676, + "learning_rate": 9.090072375852405e-06, + "loss": 0.4406, + "step": 13040 + }, + { + "epoch": 1.64, + "grad_norm": 15.526065826416016, + "learning_rate": 9.089235660795717e-06, + "loss": 0.4678, + "step": 13041 + }, + { + "epoch": 1.64, + "grad_norm": 22.352813720703125, + "learning_rate": 9.08839894573903e-06, + "loss": 1.5878, + "step": 13042 + }, + { + "epoch": 1.64, + "grad_norm": 29.267131805419922, + "learning_rate": 9.087562230682342e-06, + "loss": 1.5648, + "step": 13043 + }, + { + "epoch": 1.64, + "grad_norm": 9.587777137756348, + "learning_rate": 9.086725515625654e-06, + "loss": 1.0474, + "step": 13044 + }, + { + "epoch": 1.64, + "grad_norm": 14.594001770019531, + "learning_rate": 9.085888800568968e-06, + "loss": 1.7063, + "step": 13045 + }, + { + "epoch": 1.64, + "grad_norm": 18.16721534729004, + "learning_rate": 9.08505208551228e-06, + "loss": 1.147, + "step": 13046 + }, + { + "epoch": 1.64, + "grad_norm": 27.573862075805664, + "learning_rate": 9.084215370455592e-06, + "loss": 0.7921, + "step": 13047 + }, + { + "epoch": 1.64, + "grad_norm": 18.00398826599121, + "learning_rate": 9.083378655398904e-06, + "loss": 0.6198, + "step": 13048 + }, + { + "epoch": 1.64, + "grad_norm": 6.626566410064697, + "learning_rate": 9.082541940342217e-06, + "loss": 0.7332, + "step": 13049 + }, + { + "epoch": 1.64, + "grad_norm": 39.86878967285156, + "learning_rate": 9.08170522528553e-06, + "loss": 0.9836, + "step": 13050 + }, + { + "epoch": 1.64, + "grad_norm": 413.9939880371094, + "learning_rate": 9.080868510228841e-06, + "loss": 1.1279, + "step": 13051 + }, + { + "epoch": 1.64, + "grad_norm": 2.7134273052215576, + "learning_rate": 9.080031795172155e-06, + "loss": 0.0985, + "step": 13052 + }, + { + "epoch": 1.64, + "grad_norm": 12.295166969299316, + "learning_rate": 9.079195080115467e-06, + "loss": 1.1999, + "step": 13053 + }, + { + "epoch": 1.64, + "grad_norm": 47.10236358642578, + "learning_rate": 9.07835836505878e-06, + "loss": 2.3204, + "step": 13054 + }, + { + "epoch": 1.64, + "grad_norm": 34.14249801635742, + "learning_rate": 9.077521650002092e-06, + "loss": 2.6892, + "step": 13055 + }, + { + "epoch": 1.64, + "grad_norm": 11.125469207763672, + "learning_rate": 9.076684934945406e-06, + "loss": 2.22, + "step": 13056 + }, + { + "epoch": 1.64, + "grad_norm": 10.195199966430664, + "learning_rate": 9.075848219888718e-06, + "loss": 1.3924, + "step": 13057 + }, + { + "epoch": 1.64, + "grad_norm": 20.390213012695312, + "learning_rate": 9.07501150483203e-06, + "loss": 1.5304, + "step": 13058 + }, + { + "epoch": 1.64, + "grad_norm": 10.653514862060547, + "learning_rate": 9.074174789775344e-06, + "loss": 0.4206, + "step": 13059 + }, + { + "epoch": 1.64, + "grad_norm": 4.093517303466797, + "learning_rate": 9.073338074718656e-06, + "loss": 0.2536, + "step": 13060 + }, + { + "epoch": 1.64, + "grad_norm": 4.747400760650635, + "learning_rate": 9.072501359661968e-06, + "loss": 1.274, + "step": 13061 + }, + { + "epoch": 1.64, + "grad_norm": 65.5945816040039, + "learning_rate": 9.07166464460528e-06, + "loss": 1.0873, + "step": 13062 + }, + { + "epoch": 1.64, + "grad_norm": 14.872116088867188, + "learning_rate": 9.070827929548593e-06, + "loss": 1.0578, + "step": 13063 + }, + { + "epoch": 1.64, + "grad_norm": 12.61624813079834, + "learning_rate": 9.069991214491905e-06, + "loss": 1.4042, + "step": 13064 + }, + { + "epoch": 1.64, + "grad_norm": 15.354135513305664, + "learning_rate": 9.069154499435217e-06, + "loss": 0.7141, + "step": 13065 + }, + { + "epoch": 1.64, + "grad_norm": 7.654422283172607, + "learning_rate": 9.06831778437853e-06, + "loss": 1.499, + "step": 13066 + }, + { + "epoch": 1.64, + "grad_norm": 21.095413208007812, + "learning_rate": 9.067481069321843e-06, + "loss": 1.2577, + "step": 13067 + }, + { + "epoch": 1.64, + "grad_norm": 50.755958557128906, + "learning_rate": 9.066644354265156e-06, + "loss": 2.6346, + "step": 13068 + }, + { + "epoch": 1.64, + "grad_norm": 16.771076202392578, + "learning_rate": 9.065807639208468e-06, + "loss": 0.6723, + "step": 13069 + }, + { + "epoch": 1.64, + "grad_norm": 3.3442487716674805, + "learning_rate": 9.064970924151782e-06, + "loss": 0.2401, + "step": 13070 + }, + { + "epoch": 1.64, + "grad_norm": 11.364527702331543, + "learning_rate": 9.064134209095094e-06, + "loss": 1.9107, + "step": 13071 + }, + { + "epoch": 1.64, + "grad_norm": 67.20685577392578, + "learning_rate": 9.063297494038406e-06, + "loss": 3.1088, + "step": 13072 + }, + { + "epoch": 1.64, + "grad_norm": 12.825528144836426, + "learning_rate": 9.06246077898172e-06, + "loss": 0.7203, + "step": 13073 + }, + { + "epoch": 1.64, + "grad_norm": 7.3285698890686035, + "learning_rate": 9.061624063925031e-06, + "loss": 0.3413, + "step": 13074 + }, + { + "epoch": 1.64, + "grad_norm": 4.640127658843994, + "learning_rate": 9.060787348868343e-06, + "loss": 0.1723, + "step": 13075 + }, + { + "epoch": 1.64, + "grad_norm": 8.927471160888672, + "learning_rate": 9.059950633811655e-06, + "loss": 1.1897, + "step": 13076 + }, + { + "epoch": 1.64, + "grad_norm": 29.194480895996094, + "learning_rate": 9.059113918754969e-06, + "loss": 2.18, + "step": 13077 + }, + { + "epoch": 1.64, + "grad_norm": 10.097738265991211, + "learning_rate": 9.058277203698281e-06, + "loss": 0.4861, + "step": 13078 + }, + { + "epoch": 1.64, + "grad_norm": 11.935216903686523, + "learning_rate": 9.057440488641593e-06, + "loss": 0.7497, + "step": 13079 + }, + { + "epoch": 1.64, + "grad_norm": 23.826723098754883, + "learning_rate": 9.056603773584907e-06, + "loss": 1.7941, + "step": 13080 + }, + { + "epoch": 1.64, + "grad_norm": 11.864187240600586, + "learning_rate": 9.055767058528219e-06, + "loss": 0.5174, + "step": 13081 + }, + { + "epoch": 1.64, + "grad_norm": 9.379133224487305, + "learning_rate": 9.054930343471532e-06, + "loss": 1.9926, + "step": 13082 + }, + { + "epoch": 1.64, + "grad_norm": 21.72909927368164, + "learning_rate": 9.054093628414844e-06, + "loss": 0.9551, + "step": 13083 + }, + { + "epoch": 1.64, + "grad_norm": 69.47557830810547, + "learning_rate": 9.053256913358158e-06, + "loss": 1.2902, + "step": 13084 + }, + { + "epoch": 1.64, + "grad_norm": 7.750579357147217, + "learning_rate": 9.05242019830147e-06, + "loss": 1.019, + "step": 13085 + }, + { + "epoch": 1.64, + "grad_norm": 5.383609771728516, + "learning_rate": 9.051583483244782e-06, + "loss": 0.272, + "step": 13086 + }, + { + "epoch": 1.64, + "grad_norm": 18.136207580566406, + "learning_rate": 9.050746768188094e-06, + "loss": 3.1578, + "step": 13087 + }, + { + "epoch": 1.64, + "grad_norm": 33.227691650390625, + "learning_rate": 9.049910053131407e-06, + "loss": 0.9955, + "step": 13088 + }, + { + "epoch": 1.64, + "grad_norm": 15.877317428588867, + "learning_rate": 9.04907333807472e-06, + "loss": 0.7077, + "step": 13089 + }, + { + "epoch": 1.64, + "grad_norm": 17.573396682739258, + "learning_rate": 9.048236623018031e-06, + "loss": 1.6198, + "step": 13090 + }, + { + "epoch": 1.64, + "grad_norm": 14.696701049804688, + "learning_rate": 9.047399907961345e-06, + "loss": 0.7756, + "step": 13091 + }, + { + "epoch": 1.64, + "grad_norm": 8.865352630615234, + "learning_rate": 9.046563192904657e-06, + "loss": 0.5272, + "step": 13092 + }, + { + "epoch": 1.64, + "grad_norm": 13.953130722045898, + "learning_rate": 9.045726477847969e-06, + "loss": 0.6747, + "step": 13093 + }, + { + "epoch": 1.64, + "grad_norm": 53.01630401611328, + "learning_rate": 9.044889762791282e-06, + "loss": 1.9975, + "step": 13094 + }, + { + "epoch": 1.64, + "grad_norm": 3.929391860961914, + "learning_rate": 9.044053047734594e-06, + "loss": 0.1135, + "step": 13095 + }, + { + "epoch": 1.64, + "grad_norm": 56.032047271728516, + "learning_rate": 9.043216332677908e-06, + "loss": 2.2502, + "step": 13096 + }, + { + "epoch": 1.64, + "grad_norm": 16.776247024536133, + "learning_rate": 9.04237961762122e-06, + "loss": 0.5896, + "step": 13097 + }, + { + "epoch": 1.64, + "grad_norm": 11.387946128845215, + "learning_rate": 9.041542902564534e-06, + "loss": 0.7921, + "step": 13098 + }, + { + "epoch": 1.64, + "grad_norm": 12.66744327545166, + "learning_rate": 9.040706187507846e-06, + "loss": 1.4705, + "step": 13099 + }, + { + "epoch": 1.64, + "grad_norm": 8.530153274536133, + "learning_rate": 9.039869472451158e-06, + "loss": 0.7724, + "step": 13100 + }, + { + "epoch": 1.64, + "grad_norm": 30.687973022460938, + "learning_rate": 9.03903275739447e-06, + "loss": 1.1171, + "step": 13101 + }, + { + "epoch": 1.64, + "grad_norm": 8.460206031799316, + "learning_rate": 9.038196042337783e-06, + "loss": 2.4668, + "step": 13102 + }, + { + "epoch": 1.64, + "grad_norm": 8.673661231994629, + "learning_rate": 9.037359327281095e-06, + "loss": 0.3487, + "step": 13103 + }, + { + "epoch": 1.64, + "grad_norm": 11.060383796691895, + "learning_rate": 9.036522612224407e-06, + "loss": 1.2103, + "step": 13104 + }, + { + "epoch": 1.64, + "grad_norm": 13.42879581451416, + "learning_rate": 9.03568589716772e-06, + "loss": 0.8598, + "step": 13105 + }, + { + "epoch": 1.64, + "grad_norm": 438.6036376953125, + "learning_rate": 9.034849182111033e-06, + "loss": 2.0738, + "step": 13106 + }, + { + "epoch": 1.64, + "grad_norm": 25.97335433959961, + "learning_rate": 9.034012467054345e-06, + "loss": 0.9358, + "step": 13107 + }, + { + "epoch": 1.65, + "grad_norm": 14.31015682220459, + "learning_rate": 9.033175751997658e-06, + "loss": 0.6819, + "step": 13108 + }, + { + "epoch": 1.65, + "grad_norm": 15.247424125671387, + "learning_rate": 9.03233903694097e-06, + "loss": 0.8224, + "step": 13109 + }, + { + "epoch": 1.65, + "grad_norm": 4.5875420570373535, + "learning_rate": 9.031502321884284e-06, + "loss": 0.1946, + "step": 13110 + }, + { + "epoch": 1.65, + "grad_norm": 17.96656608581543, + "learning_rate": 9.030665606827596e-06, + "loss": 0.9879, + "step": 13111 + }, + { + "epoch": 1.65, + "grad_norm": 4.172995567321777, + "learning_rate": 9.029828891770908e-06, + "loss": 0.6168, + "step": 13112 + }, + { + "epoch": 1.65, + "grad_norm": 6.244558811187744, + "learning_rate": 9.028992176714221e-06, + "loss": 0.1583, + "step": 13113 + }, + { + "epoch": 1.65, + "grad_norm": 10.22422981262207, + "learning_rate": 9.028155461657533e-06, + "loss": 0.6554, + "step": 13114 + }, + { + "epoch": 1.65, + "grad_norm": 31.16236114501953, + "learning_rate": 9.027318746600845e-06, + "loss": 2.6027, + "step": 13115 + }, + { + "epoch": 1.65, + "grad_norm": 51.754554748535156, + "learning_rate": 9.026482031544159e-06, + "loss": 2.1243, + "step": 13116 + }, + { + "epoch": 1.65, + "grad_norm": 12.304715156555176, + "learning_rate": 9.025645316487471e-06, + "loss": 1.1691, + "step": 13117 + }, + { + "epoch": 1.65, + "grad_norm": 34.253997802734375, + "learning_rate": 9.024808601430783e-06, + "loss": 1.2489, + "step": 13118 + }, + { + "epoch": 1.65, + "grad_norm": 18.965248107910156, + "learning_rate": 9.023971886374095e-06, + "loss": 1.4573, + "step": 13119 + }, + { + "epoch": 1.65, + "grad_norm": 30.697521209716797, + "learning_rate": 9.023135171317409e-06, + "loss": 1.608, + "step": 13120 + }, + { + "epoch": 1.65, + "grad_norm": 34.999149322509766, + "learning_rate": 9.02229845626072e-06, + "loss": 1.2003, + "step": 13121 + }, + { + "epoch": 1.65, + "grad_norm": 28.988059997558594, + "learning_rate": 9.021461741204034e-06, + "loss": 2.2427, + "step": 13122 + }, + { + "epoch": 1.65, + "grad_norm": 11.598092079162598, + "learning_rate": 9.020625026147346e-06, + "loss": 0.4843, + "step": 13123 + }, + { + "epoch": 1.65, + "grad_norm": 12.560540199279785, + "learning_rate": 9.01978831109066e-06, + "loss": 0.4743, + "step": 13124 + }, + { + "epoch": 1.65, + "grad_norm": 14.121764183044434, + "learning_rate": 9.018951596033972e-06, + "loss": 2.2912, + "step": 13125 + }, + { + "epoch": 1.65, + "grad_norm": 30.48697853088379, + "learning_rate": 9.018114880977284e-06, + "loss": 1.9225, + "step": 13126 + }, + { + "epoch": 1.65, + "grad_norm": 10.957478523254395, + "learning_rate": 9.017278165920597e-06, + "loss": 0.7876, + "step": 13127 + }, + { + "epoch": 1.65, + "grad_norm": 11.144898414611816, + "learning_rate": 9.01644145086391e-06, + "loss": 0.69, + "step": 13128 + }, + { + "epoch": 1.65, + "grad_norm": 22.866575241088867, + "learning_rate": 9.015604735807221e-06, + "loss": 1.3409, + "step": 13129 + }, + { + "epoch": 1.65, + "grad_norm": 13.678271293640137, + "learning_rate": 9.014768020750533e-06, + "loss": 0.8963, + "step": 13130 + }, + { + "epoch": 1.65, + "grad_norm": 26.363157272338867, + "learning_rate": 9.013931305693847e-06, + "loss": 1.6555, + "step": 13131 + }, + { + "epoch": 1.65, + "grad_norm": 147.813720703125, + "learning_rate": 9.013094590637159e-06, + "loss": 2.2164, + "step": 13132 + }, + { + "epoch": 1.65, + "grad_norm": 10.695622444152832, + "learning_rate": 9.01225787558047e-06, + "loss": 1.7061, + "step": 13133 + }, + { + "epoch": 1.65, + "grad_norm": 19.920499801635742, + "learning_rate": 9.011421160523784e-06, + "loss": 1.1819, + "step": 13134 + }, + { + "epoch": 1.65, + "grad_norm": 29.973834991455078, + "learning_rate": 9.010584445467096e-06, + "loss": 1.4635, + "step": 13135 + }, + { + "epoch": 1.65, + "grad_norm": 8.446659088134766, + "learning_rate": 9.00974773041041e-06, + "loss": 1.5663, + "step": 13136 + }, + { + "epoch": 1.65, + "grad_norm": 13.579355239868164, + "learning_rate": 9.008911015353722e-06, + "loss": 1.9149, + "step": 13137 + }, + { + "epoch": 1.65, + "grad_norm": 4.746458053588867, + "learning_rate": 9.008074300297036e-06, + "loss": 0.2992, + "step": 13138 + }, + { + "epoch": 1.65, + "grad_norm": 10.081870079040527, + "learning_rate": 9.007237585240347e-06, + "loss": 0.6807, + "step": 13139 + }, + { + "epoch": 1.65, + "grad_norm": 7.301534175872803, + "learning_rate": 9.00640087018366e-06, + "loss": 0.87, + "step": 13140 + }, + { + "epoch": 1.65, + "grad_norm": 7.21815299987793, + "learning_rate": 9.005564155126973e-06, + "loss": 1.4987, + "step": 13141 + }, + { + "epoch": 1.65, + "grad_norm": 4.622696876525879, + "learning_rate": 9.004727440070285e-06, + "loss": 0.1623, + "step": 13142 + }, + { + "epoch": 1.65, + "grad_norm": 29.417076110839844, + "learning_rate": 9.003890725013597e-06, + "loss": 1.4325, + "step": 13143 + }, + { + "epoch": 1.65, + "grad_norm": 6.1637115478515625, + "learning_rate": 9.003054009956909e-06, + "loss": 1.247, + "step": 13144 + }, + { + "epoch": 1.65, + "grad_norm": 22.631696701049805, + "learning_rate": 9.002217294900223e-06, + "loss": 1.1119, + "step": 13145 + }, + { + "epoch": 1.65, + "grad_norm": 17.008960723876953, + "learning_rate": 9.001380579843535e-06, + "loss": 1.8602, + "step": 13146 + }, + { + "epoch": 1.65, + "grad_norm": 4.248160362243652, + "learning_rate": 9.000543864786847e-06, + "loss": 1.6219, + "step": 13147 + }, + { + "epoch": 1.65, + "grad_norm": 63.04439163208008, + "learning_rate": 8.99970714973016e-06, + "loss": 1.4084, + "step": 13148 + }, + { + "epoch": 1.65, + "grad_norm": 8.603978157043457, + "learning_rate": 8.998870434673472e-06, + "loss": 0.8025, + "step": 13149 + }, + { + "epoch": 1.65, + "grad_norm": 508.3733215332031, + "learning_rate": 8.998033719616786e-06, + "loss": 3.8823, + "step": 13150 + }, + { + "epoch": 1.65, + "grad_norm": 39.03932571411133, + "learning_rate": 8.997197004560098e-06, + "loss": 1.1376, + "step": 13151 + }, + { + "epoch": 1.65, + "grad_norm": 12.622542381286621, + "learning_rate": 8.996360289503411e-06, + "loss": 0.6019, + "step": 13152 + }, + { + "epoch": 1.65, + "grad_norm": 8.678735733032227, + "learning_rate": 8.995523574446723e-06, + "loss": 1.2947, + "step": 13153 + }, + { + "epoch": 1.65, + "grad_norm": 12.827431678771973, + "learning_rate": 8.994686859390035e-06, + "loss": 0.7529, + "step": 13154 + }, + { + "epoch": 1.65, + "grad_norm": 14.978957176208496, + "learning_rate": 8.993850144333349e-06, + "loss": 1.5085, + "step": 13155 + }, + { + "epoch": 1.65, + "grad_norm": 12.834829330444336, + "learning_rate": 8.993013429276661e-06, + "loss": 0.5835, + "step": 13156 + }, + { + "epoch": 1.65, + "grad_norm": 7.3710126876831055, + "learning_rate": 8.992176714219973e-06, + "loss": 0.3713, + "step": 13157 + }, + { + "epoch": 1.65, + "grad_norm": 21.46312141418457, + "learning_rate": 8.991339999163285e-06, + "loss": 1.1262, + "step": 13158 + }, + { + "epoch": 1.65, + "grad_norm": 1.169536828994751, + "learning_rate": 8.990503284106598e-06, + "loss": 0.0426, + "step": 13159 + }, + { + "epoch": 1.65, + "grad_norm": 6.690242767333984, + "learning_rate": 8.98966656904991e-06, + "loss": 1.1321, + "step": 13160 + }, + { + "epoch": 1.65, + "grad_norm": 5.8285040855407715, + "learning_rate": 8.988829853993222e-06, + "loss": 1.5329, + "step": 13161 + }, + { + "epoch": 1.65, + "grad_norm": 5.963802337646484, + "learning_rate": 8.987993138936536e-06, + "loss": 0.2929, + "step": 13162 + }, + { + "epoch": 1.65, + "grad_norm": 14.935331344604492, + "learning_rate": 8.987156423879848e-06, + "loss": 1.0795, + "step": 13163 + }, + { + "epoch": 1.65, + "grad_norm": 13.327879905700684, + "learning_rate": 8.986319708823162e-06, + "loss": 0.9714, + "step": 13164 + }, + { + "epoch": 1.65, + "grad_norm": 6.610556125640869, + "learning_rate": 8.985482993766474e-06, + "loss": 0.3407, + "step": 13165 + }, + { + "epoch": 1.65, + "grad_norm": 9.004228591918945, + "learning_rate": 8.984646278709787e-06, + "loss": 0.6166, + "step": 13166 + }, + { + "epoch": 1.65, + "grad_norm": 11.152169227600098, + "learning_rate": 8.9838095636531e-06, + "loss": 0.4951, + "step": 13167 + }, + { + "epoch": 1.65, + "grad_norm": 28.9377384185791, + "learning_rate": 8.982972848596411e-06, + "loss": 1.8287, + "step": 13168 + }, + { + "epoch": 1.65, + "grad_norm": 15.092056274414062, + "learning_rate": 8.982136133539723e-06, + "loss": 0.914, + "step": 13169 + }, + { + "epoch": 1.65, + "grad_norm": 12.820289611816406, + "learning_rate": 8.981299418483037e-06, + "loss": 0.5046, + "step": 13170 + }, + { + "epoch": 1.65, + "grad_norm": 24.191577911376953, + "learning_rate": 8.980462703426349e-06, + "loss": 1.9734, + "step": 13171 + }, + { + "epoch": 1.65, + "grad_norm": 5.4591450691223145, + "learning_rate": 8.97962598836966e-06, + "loss": 0.4638, + "step": 13172 + }, + { + "epoch": 1.65, + "grad_norm": 12.546009063720703, + "learning_rate": 8.978789273312974e-06, + "loss": 0.9607, + "step": 13173 + }, + { + "epoch": 1.65, + "grad_norm": 7.58516263961792, + "learning_rate": 8.977952558256286e-06, + "loss": 0.5153, + "step": 13174 + }, + { + "epoch": 1.65, + "grad_norm": 23.11334991455078, + "learning_rate": 8.977115843199598e-06, + "loss": 1.3486, + "step": 13175 + }, + { + "epoch": 1.65, + "grad_norm": 30.083221435546875, + "learning_rate": 8.976279128142912e-06, + "loss": 1.4435, + "step": 13176 + }, + { + "epoch": 1.65, + "grad_norm": 9.878336906433105, + "learning_rate": 8.975442413086224e-06, + "loss": 2.6615, + "step": 13177 + }, + { + "epoch": 1.65, + "grad_norm": 12.88902473449707, + "learning_rate": 8.974605698029537e-06, + "loss": 1.0042, + "step": 13178 + }, + { + "epoch": 1.65, + "grad_norm": 23.017744064331055, + "learning_rate": 8.97376898297285e-06, + "loss": 1.5633, + "step": 13179 + }, + { + "epoch": 1.65, + "grad_norm": 25.739858627319336, + "learning_rate": 8.972932267916163e-06, + "loss": 1.3024, + "step": 13180 + }, + { + "epoch": 1.65, + "grad_norm": 7.158169746398926, + "learning_rate": 8.972095552859475e-06, + "loss": 1.7064, + "step": 13181 + }, + { + "epoch": 1.65, + "grad_norm": 7.283500671386719, + "learning_rate": 8.971258837802787e-06, + "loss": 0.5594, + "step": 13182 + }, + { + "epoch": 1.65, + "grad_norm": 18.39688491821289, + "learning_rate": 8.970422122746099e-06, + "loss": 1.7752, + "step": 13183 + }, + { + "epoch": 1.65, + "grad_norm": 9.553594589233398, + "learning_rate": 8.969585407689413e-06, + "loss": 0.4924, + "step": 13184 + }, + { + "epoch": 1.65, + "grad_norm": 18.251686096191406, + "learning_rate": 8.968748692632725e-06, + "loss": 1.7104, + "step": 13185 + }, + { + "epoch": 1.65, + "grad_norm": 5.3052825927734375, + "learning_rate": 8.967911977576036e-06, + "loss": 0.5095, + "step": 13186 + }, + { + "epoch": 1.65, + "grad_norm": 6.065149784088135, + "learning_rate": 8.96707526251935e-06, + "loss": 0.3137, + "step": 13187 + }, + { + "epoch": 1.66, + "grad_norm": 11.02402114868164, + "learning_rate": 8.966238547462662e-06, + "loss": 1.0114, + "step": 13188 + }, + { + "epoch": 1.66, + "grad_norm": 13.939468383789062, + "learning_rate": 8.965401832405974e-06, + "loss": 0.5848, + "step": 13189 + }, + { + "epoch": 1.66, + "grad_norm": 5.761425018310547, + "learning_rate": 8.964565117349288e-06, + "loss": 0.784, + "step": 13190 + }, + { + "epoch": 1.66, + "grad_norm": 25.257661819458008, + "learning_rate": 8.9637284022926e-06, + "loss": 0.819, + "step": 13191 + }, + { + "epoch": 1.66, + "grad_norm": 10.442473411560059, + "learning_rate": 8.962891687235913e-06, + "loss": 0.689, + "step": 13192 + }, + { + "epoch": 1.66, + "grad_norm": 14.158611297607422, + "learning_rate": 8.962054972179225e-06, + "loss": 0.469, + "step": 13193 + }, + { + "epoch": 1.66, + "grad_norm": 9.559187889099121, + "learning_rate": 8.961218257122539e-06, + "loss": 0.7824, + "step": 13194 + }, + { + "epoch": 1.66, + "grad_norm": 8.47706413269043, + "learning_rate": 8.960381542065851e-06, + "loss": 0.2856, + "step": 13195 + }, + { + "epoch": 1.66, + "grad_norm": 15.341854095458984, + "learning_rate": 8.959544827009163e-06, + "loss": 1.8322, + "step": 13196 + }, + { + "epoch": 1.66, + "grad_norm": 31.592058181762695, + "learning_rate": 8.958708111952475e-06, + "loss": 3.7815, + "step": 13197 + }, + { + "epoch": 1.66, + "grad_norm": 21.079078674316406, + "learning_rate": 8.957871396895787e-06, + "loss": 2.3007, + "step": 13198 + }, + { + "epoch": 1.66, + "grad_norm": 4.746679782867432, + "learning_rate": 8.9570346818391e-06, + "loss": 0.9576, + "step": 13199 + }, + { + "epoch": 1.66, + "grad_norm": 20.35892105102539, + "learning_rate": 8.956197966782412e-06, + "loss": 0.6661, + "step": 13200 + }, + { + "epoch": 1.66, + "eval_loss": 0.10212861001491547, + "eval_runtime": 94.7891, + "eval_samples_per_second": 37.367, + "eval_steps_per_second": 37.367, + "step": 13200 + }, + { + "epoch": 1.66, + "grad_norm": 9.064656257629395, + "learning_rate": 8.955361251725726e-06, + "loss": 1.6586, + "step": 13201 + }, + { + "epoch": 1.66, + "grad_norm": 15.648777961730957, + "learning_rate": 8.954524536669038e-06, + "loss": 1.0213, + "step": 13202 + }, + { + "epoch": 1.66, + "grad_norm": 34.27608108520508, + "learning_rate": 8.95368782161235e-06, + "loss": 2.2202, + "step": 13203 + }, + { + "epoch": 1.66, + "grad_norm": 39.48409652709961, + "learning_rate": 8.952851106555664e-06, + "loss": 2.8217, + "step": 13204 + }, + { + "epoch": 1.66, + "grad_norm": 11.432174682617188, + "learning_rate": 8.952014391498975e-06, + "loss": 1.1946, + "step": 13205 + }, + { + "epoch": 1.66, + "grad_norm": 22.67432403564453, + "learning_rate": 8.951177676442289e-06, + "loss": 1.4657, + "step": 13206 + }, + { + "epoch": 1.66, + "grad_norm": 20.243619918823242, + "learning_rate": 8.950340961385601e-06, + "loss": 0.729, + "step": 13207 + }, + { + "epoch": 1.66, + "grad_norm": 143.8177947998047, + "learning_rate": 8.949504246328915e-06, + "loss": 0.6804, + "step": 13208 + }, + { + "epoch": 1.66, + "grad_norm": 90.39649963378906, + "learning_rate": 8.948667531272227e-06, + "loss": 2.949, + "step": 13209 + }, + { + "epoch": 1.66, + "grad_norm": 10.051481246948242, + "learning_rate": 8.947830816215539e-06, + "loss": 0.8819, + "step": 13210 + }, + { + "epoch": 1.66, + "grad_norm": 12.02458381652832, + "learning_rate": 8.94699410115885e-06, + "loss": 0.8701, + "step": 13211 + }, + { + "epoch": 1.66, + "grad_norm": 9.880314826965332, + "learning_rate": 8.946157386102163e-06, + "loss": 1.0519, + "step": 13212 + }, + { + "epoch": 1.66, + "grad_norm": 3.207190990447998, + "learning_rate": 8.945320671045476e-06, + "loss": 0.1411, + "step": 13213 + }, + { + "epoch": 1.66, + "grad_norm": 13.814050674438477, + "learning_rate": 8.944483955988788e-06, + "loss": 0.1641, + "step": 13214 + }, + { + "epoch": 1.66, + "grad_norm": 8.019259452819824, + "learning_rate": 8.943647240932102e-06, + "loss": 0.7049, + "step": 13215 + }, + { + "epoch": 1.66, + "grad_norm": 9.67119026184082, + "learning_rate": 8.942810525875414e-06, + "loss": 0.9873, + "step": 13216 + }, + { + "epoch": 1.66, + "grad_norm": 10.612390518188477, + "learning_rate": 8.941973810818726e-06, + "loss": 0.9955, + "step": 13217 + }, + { + "epoch": 1.66, + "grad_norm": 5.1115264892578125, + "learning_rate": 8.94113709576204e-06, + "loss": 0.4268, + "step": 13218 + }, + { + "epoch": 1.66, + "grad_norm": 21.119964599609375, + "learning_rate": 8.940300380705351e-06, + "loss": 1.0977, + "step": 13219 + }, + { + "epoch": 1.66, + "grad_norm": 14.732322692871094, + "learning_rate": 8.939463665648665e-06, + "loss": 1.9414, + "step": 13220 + }, + { + "epoch": 1.66, + "grad_norm": 18.428756713867188, + "learning_rate": 8.938626950591977e-06, + "loss": 2.0643, + "step": 13221 + }, + { + "epoch": 1.66, + "grad_norm": 9.45617389678955, + "learning_rate": 8.937790235535289e-06, + "loss": 0.7956, + "step": 13222 + }, + { + "epoch": 1.66, + "grad_norm": 9.30435562133789, + "learning_rate": 8.936953520478603e-06, + "loss": 0.4185, + "step": 13223 + }, + { + "epoch": 1.66, + "grad_norm": 10.009489059448242, + "learning_rate": 8.936116805421914e-06, + "loss": 1.2203, + "step": 13224 + }, + { + "epoch": 1.66, + "grad_norm": 9.37564468383789, + "learning_rate": 8.935280090365226e-06, + "loss": 1.103, + "step": 13225 + }, + { + "epoch": 1.66, + "grad_norm": 27.137699127197266, + "learning_rate": 8.934443375308538e-06, + "loss": 1.3436, + "step": 13226 + }, + { + "epoch": 1.66, + "grad_norm": 10.834226608276367, + "learning_rate": 8.933606660251852e-06, + "loss": 2.5409, + "step": 13227 + }, + { + "epoch": 1.66, + "grad_norm": 197.4990692138672, + "learning_rate": 8.932769945195164e-06, + "loss": 3.1156, + "step": 13228 + }, + { + "epoch": 1.66, + "grad_norm": 52.09612274169922, + "learning_rate": 8.931933230138478e-06, + "loss": 2.612, + "step": 13229 + }, + { + "epoch": 1.66, + "grad_norm": 10.936538696289062, + "learning_rate": 8.93109651508179e-06, + "loss": 1.1146, + "step": 13230 + }, + { + "epoch": 1.66, + "grad_norm": 160.63803100585938, + "learning_rate": 8.930259800025102e-06, + "loss": 1.8826, + "step": 13231 + }, + { + "epoch": 1.66, + "grad_norm": 11.45350456237793, + "learning_rate": 8.929423084968415e-06, + "loss": 1.1375, + "step": 13232 + }, + { + "epoch": 1.66, + "grad_norm": 6.9955573081970215, + "learning_rate": 8.928586369911727e-06, + "loss": 0.2738, + "step": 13233 + }, + { + "epoch": 1.66, + "grad_norm": 11.343851089477539, + "learning_rate": 8.92774965485504e-06, + "loss": 0.9436, + "step": 13234 + }, + { + "epoch": 1.66, + "grad_norm": 13.81751537322998, + "learning_rate": 8.926912939798353e-06, + "loss": 1.8904, + "step": 13235 + }, + { + "epoch": 1.66, + "grad_norm": 2.447096347808838, + "learning_rate": 8.926076224741665e-06, + "loss": 0.1465, + "step": 13236 + }, + { + "epoch": 1.66, + "grad_norm": 11.009682655334473, + "learning_rate": 8.925239509684978e-06, + "loss": 0.9183, + "step": 13237 + }, + { + "epoch": 1.66, + "grad_norm": 24.190349578857422, + "learning_rate": 8.92440279462829e-06, + "loss": 1.2925, + "step": 13238 + }, + { + "epoch": 1.66, + "grad_norm": 124.643310546875, + "learning_rate": 8.923566079571602e-06, + "loss": 1.3646, + "step": 13239 + }, + { + "epoch": 1.66, + "grad_norm": 9.799116134643555, + "learning_rate": 8.922729364514914e-06, + "loss": 0.4123, + "step": 13240 + }, + { + "epoch": 1.66, + "grad_norm": 7.647983074188232, + "learning_rate": 8.921892649458228e-06, + "loss": 0.699, + "step": 13241 + }, + { + "epoch": 1.66, + "grad_norm": 9.45411491394043, + "learning_rate": 8.92105593440154e-06, + "loss": 1.606, + "step": 13242 + }, + { + "epoch": 1.66, + "grad_norm": 19.23842430114746, + "learning_rate": 8.920219219344853e-06, + "loss": 0.4207, + "step": 13243 + }, + { + "epoch": 1.66, + "grad_norm": 23.47686195373535, + "learning_rate": 8.919382504288165e-06, + "loss": 1.8671, + "step": 13244 + }, + { + "epoch": 1.66, + "grad_norm": 11.251797676086426, + "learning_rate": 8.918545789231477e-06, + "loss": 0.4017, + "step": 13245 + }, + { + "epoch": 1.66, + "grad_norm": 13.252801895141602, + "learning_rate": 8.917709074174791e-06, + "loss": 0.4089, + "step": 13246 + }, + { + "epoch": 1.66, + "grad_norm": 7.073338985443115, + "learning_rate": 8.916872359118103e-06, + "loss": 1.9157, + "step": 13247 + }, + { + "epoch": 1.66, + "grad_norm": 32.249420166015625, + "learning_rate": 8.916035644061417e-06, + "loss": 3.1766, + "step": 13248 + }, + { + "epoch": 1.66, + "grad_norm": 10.403848648071289, + "learning_rate": 8.915198929004729e-06, + "loss": 1.0762, + "step": 13249 + }, + { + "epoch": 1.66, + "grad_norm": 11.180082321166992, + "learning_rate": 8.91436221394804e-06, + "loss": 0.5364, + "step": 13250 + }, + { + "epoch": 1.66, + "grad_norm": 12.972611427307129, + "learning_rate": 8.913525498891353e-06, + "loss": 0.938, + "step": 13251 + }, + { + "epoch": 1.66, + "grad_norm": 6.637202739715576, + "learning_rate": 8.912688783834666e-06, + "loss": 0.8613, + "step": 13252 + }, + { + "epoch": 1.66, + "grad_norm": 13.172598838806152, + "learning_rate": 8.911852068777978e-06, + "loss": 0.9947, + "step": 13253 + }, + { + "epoch": 1.66, + "grad_norm": 12.756169319152832, + "learning_rate": 8.91101535372129e-06, + "loss": 2.4711, + "step": 13254 + }, + { + "epoch": 1.66, + "grad_norm": 14.738645553588867, + "learning_rate": 8.910178638664604e-06, + "loss": 1.4537, + "step": 13255 + }, + { + "epoch": 1.66, + "grad_norm": 12.010141372680664, + "learning_rate": 8.909341923607916e-06, + "loss": 1.0155, + "step": 13256 + }, + { + "epoch": 1.66, + "grad_norm": 9.001840591430664, + "learning_rate": 8.90850520855123e-06, + "loss": 0.4968, + "step": 13257 + }, + { + "epoch": 1.66, + "grad_norm": 16.674413681030273, + "learning_rate": 8.907668493494541e-06, + "loss": 1.6073, + "step": 13258 + }, + { + "epoch": 1.66, + "grad_norm": 5.00567626953125, + "learning_rate": 8.906831778437853e-06, + "loss": 0.444, + "step": 13259 + }, + { + "epoch": 1.66, + "grad_norm": 36.17286682128906, + "learning_rate": 8.905995063381167e-06, + "loss": 0.7597, + "step": 13260 + }, + { + "epoch": 1.66, + "grad_norm": 5.889520645141602, + "learning_rate": 8.905158348324479e-06, + "loss": 1.3893, + "step": 13261 + }, + { + "epoch": 1.66, + "grad_norm": 22.4814395904541, + "learning_rate": 8.904321633267792e-06, + "loss": 1.5976, + "step": 13262 + }, + { + "epoch": 1.66, + "grad_norm": 93.91947174072266, + "learning_rate": 8.903484918211104e-06, + "loss": 2.2526, + "step": 13263 + }, + { + "epoch": 1.66, + "grad_norm": 4.852908134460449, + "learning_rate": 8.902648203154416e-06, + "loss": 0.1709, + "step": 13264 + }, + { + "epoch": 1.66, + "grad_norm": 9.342089653015137, + "learning_rate": 8.901811488097728e-06, + "loss": 1.2207, + "step": 13265 + }, + { + "epoch": 1.66, + "grad_norm": 33.74315643310547, + "learning_rate": 8.900974773041042e-06, + "loss": 0.7485, + "step": 13266 + }, + { + "epoch": 1.66, + "grad_norm": 14.375835418701172, + "learning_rate": 8.900138057984354e-06, + "loss": 1.0302, + "step": 13267 + }, + { + "epoch": 1.67, + "grad_norm": 14.565969467163086, + "learning_rate": 8.899301342927666e-06, + "loss": 1.0541, + "step": 13268 + }, + { + "epoch": 1.67, + "grad_norm": 6.327230453491211, + "learning_rate": 8.89846462787098e-06, + "loss": 0.6158, + "step": 13269 + }, + { + "epoch": 1.67, + "grad_norm": 6.902919769287109, + "learning_rate": 8.897627912814292e-06, + "loss": 1.111, + "step": 13270 + }, + { + "epoch": 1.67, + "grad_norm": 16.355833053588867, + "learning_rate": 8.896791197757605e-06, + "loss": 1.2105, + "step": 13271 + }, + { + "epoch": 1.67, + "grad_norm": 14.873172760009766, + "learning_rate": 8.895954482700917e-06, + "loss": 1.2151, + "step": 13272 + }, + { + "epoch": 1.67, + "grad_norm": 1.8628973960876465, + "learning_rate": 8.895117767644229e-06, + "loss": 0.0907, + "step": 13273 + }, + { + "epoch": 1.67, + "grad_norm": 15.422528266906738, + "learning_rate": 8.894281052587543e-06, + "loss": 0.5023, + "step": 13274 + }, + { + "epoch": 1.67, + "grad_norm": 13.421253204345703, + "learning_rate": 8.893444337530855e-06, + "loss": 1.842, + "step": 13275 + }, + { + "epoch": 1.67, + "grad_norm": 21.391128540039062, + "learning_rate": 8.892607622474168e-06, + "loss": 1.4473, + "step": 13276 + }, + { + "epoch": 1.67, + "grad_norm": 9.939013481140137, + "learning_rate": 8.89177090741748e-06, + "loss": 1.1062, + "step": 13277 + }, + { + "epoch": 1.67, + "grad_norm": 17.822078704833984, + "learning_rate": 8.890934192360792e-06, + "loss": 0.4375, + "step": 13278 + }, + { + "epoch": 1.67, + "grad_norm": 3.838214874267578, + "learning_rate": 8.890097477304104e-06, + "loss": 0.1922, + "step": 13279 + }, + { + "epoch": 1.67, + "grad_norm": 17.83454704284668, + "learning_rate": 8.889260762247416e-06, + "loss": 1.0488, + "step": 13280 + }, + { + "epoch": 1.67, + "grad_norm": 12.977375984191895, + "learning_rate": 8.88842404719073e-06, + "loss": 1.3126, + "step": 13281 + }, + { + "epoch": 1.67, + "grad_norm": 9.493778228759766, + "learning_rate": 8.887587332134042e-06, + "loss": 0.9259, + "step": 13282 + }, + { + "epoch": 1.67, + "grad_norm": 7.79358434677124, + "learning_rate": 8.886750617077355e-06, + "loss": 0.4632, + "step": 13283 + }, + { + "epoch": 1.67, + "grad_norm": 40.83755874633789, + "learning_rate": 8.885913902020667e-06, + "loss": 0.8441, + "step": 13284 + }, + { + "epoch": 1.67, + "grad_norm": 5.328298091888428, + "learning_rate": 8.885077186963981e-06, + "loss": 0.238, + "step": 13285 + }, + { + "epoch": 1.67, + "grad_norm": 27.0789794921875, + "learning_rate": 8.884240471907293e-06, + "loss": 1.1623, + "step": 13286 + }, + { + "epoch": 1.67, + "grad_norm": 10.417692184448242, + "learning_rate": 8.883403756850605e-06, + "loss": 0.6112, + "step": 13287 + }, + { + "epoch": 1.67, + "grad_norm": 10.841894149780273, + "learning_rate": 8.882567041793919e-06, + "loss": 0.9748, + "step": 13288 + }, + { + "epoch": 1.67, + "grad_norm": 25.555082321166992, + "learning_rate": 8.88173032673723e-06, + "loss": 1.5632, + "step": 13289 + }, + { + "epoch": 1.67, + "grad_norm": 13.408820152282715, + "learning_rate": 8.880893611680544e-06, + "loss": 0.4948, + "step": 13290 + }, + { + "epoch": 1.67, + "grad_norm": 22.0626163482666, + "learning_rate": 8.880056896623856e-06, + "loss": 1.7045, + "step": 13291 + }, + { + "epoch": 1.67, + "grad_norm": 11.598587036132812, + "learning_rate": 8.879220181567168e-06, + "loss": 0.7306, + "step": 13292 + }, + { + "epoch": 1.67, + "grad_norm": 85.8493423461914, + "learning_rate": 8.87838346651048e-06, + "loss": 1.3975, + "step": 13293 + }, + { + "epoch": 1.67, + "grad_norm": 13.468649864196777, + "learning_rate": 8.877546751453792e-06, + "loss": 1.0883, + "step": 13294 + }, + { + "epoch": 1.67, + "grad_norm": 13.701459884643555, + "learning_rate": 8.876710036397106e-06, + "loss": 1.903, + "step": 13295 + }, + { + "epoch": 1.67, + "grad_norm": 8.358095169067383, + "learning_rate": 8.875873321340418e-06, + "loss": 0.7231, + "step": 13296 + }, + { + "epoch": 1.67, + "grad_norm": 6.389739513397217, + "learning_rate": 8.875036606283731e-06, + "loss": 0.8284, + "step": 13297 + }, + { + "epoch": 1.67, + "grad_norm": 8.046916961669922, + "learning_rate": 8.874199891227043e-06, + "loss": 1.4727, + "step": 13298 + }, + { + "epoch": 1.67, + "grad_norm": 25.28152084350586, + "learning_rate": 8.873363176170355e-06, + "loss": 1.1091, + "step": 13299 + }, + { + "epoch": 1.67, + "grad_norm": 20.24308967590332, + "learning_rate": 8.872526461113669e-06, + "loss": 1.9117, + "step": 13300 + }, + { + "epoch": 1.67, + "grad_norm": 15.938324928283691, + "learning_rate": 8.87168974605698e-06, + "loss": 1.1477, + "step": 13301 + }, + { + "epoch": 1.67, + "grad_norm": 4.031920433044434, + "learning_rate": 8.870853031000294e-06, + "loss": 0.1123, + "step": 13302 + }, + { + "epoch": 1.67, + "grad_norm": 31.852489471435547, + "learning_rate": 8.870016315943606e-06, + "loss": 0.9396, + "step": 13303 + }, + { + "epoch": 1.67, + "grad_norm": 13.530977249145508, + "learning_rate": 8.869179600886918e-06, + "loss": 1.7093, + "step": 13304 + }, + { + "epoch": 1.67, + "grad_norm": 16.1658878326416, + "learning_rate": 8.868342885830232e-06, + "loss": 2.0838, + "step": 13305 + }, + { + "epoch": 1.67, + "grad_norm": 11.248319625854492, + "learning_rate": 8.867506170773544e-06, + "loss": 2.0007, + "step": 13306 + }, + { + "epoch": 1.67, + "grad_norm": 7.573624610900879, + "learning_rate": 8.866669455716856e-06, + "loss": 1.0224, + "step": 13307 + }, + { + "epoch": 1.67, + "grad_norm": 21.678936004638672, + "learning_rate": 8.865832740660168e-06, + "loss": 1.508, + "step": 13308 + }, + { + "epoch": 1.67, + "grad_norm": 6.0260138511657715, + "learning_rate": 8.864996025603481e-06, + "loss": 0.3533, + "step": 13309 + }, + { + "epoch": 1.67, + "grad_norm": 25.69033432006836, + "learning_rate": 8.864159310546793e-06, + "loss": 0.589, + "step": 13310 + }, + { + "epoch": 1.67, + "grad_norm": 266.6009521484375, + "learning_rate": 8.863322595490107e-06, + "loss": 2.3583, + "step": 13311 + }, + { + "epoch": 1.67, + "grad_norm": 23.762489318847656, + "learning_rate": 8.862485880433419e-06, + "loss": 1.7588, + "step": 13312 + }, + { + "epoch": 1.67, + "grad_norm": 10.351102828979492, + "learning_rate": 8.861649165376731e-06, + "loss": 2.2779, + "step": 13313 + }, + { + "epoch": 1.67, + "grad_norm": 23.822011947631836, + "learning_rate": 8.860812450320045e-06, + "loss": 1.6248, + "step": 13314 + }, + { + "epoch": 1.67, + "grad_norm": 6.911247730255127, + "learning_rate": 8.859975735263357e-06, + "loss": 0.7206, + "step": 13315 + }, + { + "epoch": 1.67, + "grad_norm": 15.00839614868164, + "learning_rate": 8.85913902020667e-06, + "loss": 2.2555, + "step": 13316 + }, + { + "epoch": 1.67, + "grad_norm": 37.120361328125, + "learning_rate": 8.858302305149982e-06, + "loss": 1.3278, + "step": 13317 + }, + { + "epoch": 1.67, + "grad_norm": 23.88784408569336, + "learning_rate": 8.857465590093294e-06, + "loss": 0.9319, + "step": 13318 + }, + { + "epoch": 1.67, + "grad_norm": 10.098158836364746, + "learning_rate": 8.856628875036608e-06, + "loss": 0.6505, + "step": 13319 + }, + { + "epoch": 1.67, + "grad_norm": 162.61207580566406, + "learning_rate": 8.85579215997992e-06, + "loss": 1.2954, + "step": 13320 + }, + { + "epoch": 1.67, + "grad_norm": 30.69794464111328, + "learning_rate": 8.854955444923232e-06, + "loss": 2.927, + "step": 13321 + }, + { + "epoch": 1.67, + "grad_norm": 12.624781608581543, + "learning_rate": 8.854118729866544e-06, + "loss": 1.0001, + "step": 13322 + }, + { + "epoch": 1.67, + "grad_norm": 8.311716079711914, + "learning_rate": 8.853282014809857e-06, + "loss": 0.4252, + "step": 13323 + }, + { + "epoch": 1.67, + "grad_norm": 39.431396484375, + "learning_rate": 8.85244529975317e-06, + "loss": 1.9132, + "step": 13324 + }, + { + "epoch": 1.67, + "grad_norm": 6.1541218757629395, + "learning_rate": 8.851608584696483e-06, + "loss": 0.7265, + "step": 13325 + }, + { + "epoch": 1.67, + "grad_norm": 9.475818634033203, + "learning_rate": 8.850771869639795e-06, + "loss": 0.4471, + "step": 13326 + }, + { + "epoch": 1.67, + "grad_norm": 12.667397499084473, + "learning_rate": 8.849935154583107e-06, + "loss": 0.5298, + "step": 13327 + }, + { + "epoch": 1.67, + "grad_norm": 9.55826473236084, + "learning_rate": 8.84909843952642e-06, + "loss": 0.8277, + "step": 13328 + }, + { + "epoch": 1.67, + "grad_norm": 18.122146606445312, + "learning_rate": 8.848261724469732e-06, + "loss": 1.7542, + "step": 13329 + }, + { + "epoch": 1.67, + "grad_norm": 12.492593765258789, + "learning_rate": 8.847425009413046e-06, + "loss": 0.6207, + "step": 13330 + }, + { + "epoch": 1.67, + "grad_norm": 14.764479637145996, + "learning_rate": 8.846588294356358e-06, + "loss": 1.0586, + "step": 13331 + }, + { + "epoch": 1.67, + "grad_norm": 52.10407638549805, + "learning_rate": 8.84575157929967e-06, + "loss": 2.8135, + "step": 13332 + }, + { + "epoch": 1.67, + "grad_norm": 61.20310592651367, + "learning_rate": 8.844914864242982e-06, + "loss": 2.2693, + "step": 13333 + }, + { + "epoch": 1.67, + "grad_norm": 22.537817001342773, + "learning_rate": 8.844078149186296e-06, + "loss": 1.4059, + "step": 13334 + }, + { + "epoch": 1.67, + "grad_norm": 17.515825271606445, + "learning_rate": 8.843241434129608e-06, + "loss": 1.8921, + "step": 13335 + }, + { + "epoch": 1.67, + "grad_norm": 108.60758972167969, + "learning_rate": 8.84240471907292e-06, + "loss": 1.2075, + "step": 13336 + }, + { + "epoch": 1.67, + "grad_norm": 14.8504638671875, + "learning_rate": 8.841568004016233e-06, + "loss": 0.6504, + "step": 13337 + }, + { + "epoch": 1.67, + "grad_norm": 15.910173416137695, + "learning_rate": 8.840731288959545e-06, + "loss": 0.8047, + "step": 13338 + }, + { + "epoch": 1.67, + "grad_norm": 8.04807186126709, + "learning_rate": 8.839894573902859e-06, + "loss": 0.9502, + "step": 13339 + }, + { + "epoch": 1.67, + "grad_norm": 11.169631958007812, + "learning_rate": 8.83905785884617e-06, + "loss": 1.7287, + "step": 13340 + }, + { + "epoch": 1.67, + "grad_norm": 8.26987075805664, + "learning_rate": 8.838221143789483e-06, + "loss": 0.4565, + "step": 13341 + }, + { + "epoch": 1.67, + "grad_norm": 11.048150062561035, + "learning_rate": 8.837384428732796e-06, + "loss": 1.1632, + "step": 13342 + }, + { + "epoch": 1.67, + "grad_norm": 21.97183609008789, + "learning_rate": 8.836547713676108e-06, + "loss": 2.2033, + "step": 13343 + }, + { + "epoch": 1.67, + "grad_norm": 39.257266998291016, + "learning_rate": 8.835710998619422e-06, + "loss": 0.8737, + "step": 13344 + }, + { + "epoch": 1.67, + "grad_norm": 17.208297729492188, + "learning_rate": 8.834874283562734e-06, + "loss": 1.6556, + "step": 13345 + }, + { + "epoch": 1.67, + "grad_norm": 1.4875246286392212, + "learning_rate": 8.834037568506046e-06, + "loss": 0.0885, + "step": 13346 + }, + { + "epoch": 1.68, + "grad_norm": 22.558366775512695, + "learning_rate": 8.833200853449358e-06, + "loss": 0.3735, + "step": 13347 + }, + { + "epoch": 1.68, + "grad_norm": 10.42983341217041, + "learning_rate": 8.832364138392671e-06, + "loss": 1.197, + "step": 13348 + }, + { + "epoch": 1.68, + "grad_norm": 11.471229553222656, + "learning_rate": 8.831527423335983e-06, + "loss": 0.8928, + "step": 13349 + }, + { + "epoch": 1.68, + "grad_norm": 67.52642822265625, + "learning_rate": 8.830690708279295e-06, + "loss": 1.4679, + "step": 13350 + }, + { + "epoch": 1.68, + "grad_norm": 3.370436429977417, + "learning_rate": 8.829853993222609e-06, + "loss": 0.2173, + "step": 13351 + }, + { + "epoch": 1.68, + "grad_norm": 56.54008102416992, + "learning_rate": 8.829017278165921e-06, + "loss": 2.1923, + "step": 13352 + }, + { + "epoch": 1.68, + "grad_norm": 27.789016723632812, + "learning_rate": 8.828180563109235e-06, + "loss": 2.1442, + "step": 13353 + }, + { + "epoch": 1.68, + "grad_norm": 8.23800277709961, + "learning_rate": 8.827343848052547e-06, + "loss": 0.4276, + "step": 13354 + }, + { + "epoch": 1.68, + "grad_norm": 15.000799179077148, + "learning_rate": 8.826507132995858e-06, + "loss": 1.1826, + "step": 13355 + }, + { + "epoch": 1.68, + "grad_norm": 6.0219831466674805, + "learning_rate": 8.825670417939172e-06, + "loss": 0.3008, + "step": 13356 + }, + { + "epoch": 1.68, + "grad_norm": 13.526106834411621, + "learning_rate": 8.824833702882484e-06, + "loss": 1.5786, + "step": 13357 + }, + { + "epoch": 1.68, + "grad_norm": 53.346534729003906, + "learning_rate": 8.823996987825798e-06, + "loss": 2.1324, + "step": 13358 + }, + { + "epoch": 1.68, + "grad_norm": 51.98458480834961, + "learning_rate": 8.82316027276911e-06, + "loss": 1.66, + "step": 13359 + }, + { + "epoch": 1.68, + "grad_norm": 11.333806037902832, + "learning_rate": 8.822323557712422e-06, + "loss": 0.8996, + "step": 13360 + }, + { + "epoch": 1.68, + "grad_norm": 22.819717407226562, + "learning_rate": 8.821486842655734e-06, + "loss": 1.5081, + "step": 13361 + }, + { + "epoch": 1.68, + "grad_norm": 4.25081729888916, + "learning_rate": 8.820650127599046e-06, + "loss": 0.0894, + "step": 13362 + }, + { + "epoch": 1.68, + "grad_norm": 15.563796043395996, + "learning_rate": 8.81981341254236e-06, + "loss": 1.2254, + "step": 13363 + }, + { + "epoch": 1.68, + "grad_norm": 8.962475776672363, + "learning_rate": 8.818976697485671e-06, + "loss": 1.3991, + "step": 13364 + }, + { + "epoch": 1.68, + "grad_norm": 11.515885353088379, + "learning_rate": 8.818139982428985e-06, + "loss": 2.4045, + "step": 13365 + }, + { + "epoch": 1.68, + "grad_norm": 15.969586372375488, + "learning_rate": 8.817303267372297e-06, + "loss": 3.9123, + "step": 13366 + }, + { + "epoch": 1.68, + "grad_norm": 11.065643310546875, + "learning_rate": 8.81646655231561e-06, + "loss": 1.8376, + "step": 13367 + }, + { + "epoch": 1.68, + "grad_norm": 25.345840454101562, + "learning_rate": 8.815629837258922e-06, + "loss": 2.6124, + "step": 13368 + }, + { + "epoch": 1.68, + "grad_norm": 19.20560073852539, + "learning_rate": 8.814793122202234e-06, + "loss": 2.4694, + "step": 13369 + }, + { + "epoch": 1.68, + "grad_norm": 17.70154571533203, + "learning_rate": 8.813956407145548e-06, + "loss": 0.9896, + "step": 13370 + }, + { + "epoch": 1.68, + "grad_norm": 8.05247688293457, + "learning_rate": 8.81311969208886e-06, + "loss": 0.5865, + "step": 13371 + }, + { + "epoch": 1.68, + "grad_norm": 18.543642044067383, + "learning_rate": 8.812282977032174e-06, + "loss": 1.2898, + "step": 13372 + }, + { + "epoch": 1.68, + "grad_norm": 87.89289093017578, + "learning_rate": 8.811446261975486e-06, + "loss": 2.2731, + "step": 13373 + }, + { + "epoch": 1.68, + "grad_norm": 11.179682731628418, + "learning_rate": 8.810609546918797e-06, + "loss": 1.1016, + "step": 13374 + }, + { + "epoch": 1.68, + "grad_norm": 14.395187377929688, + "learning_rate": 8.80977283186211e-06, + "loss": 1.9152, + "step": 13375 + }, + { + "epoch": 1.68, + "grad_norm": 12.498957633972168, + "learning_rate": 8.808936116805421e-06, + "loss": 0.8089, + "step": 13376 + }, + { + "epoch": 1.68, + "grad_norm": 19.434778213500977, + "learning_rate": 8.808099401748735e-06, + "loss": 0.9951, + "step": 13377 + }, + { + "epoch": 1.68, + "grad_norm": 4.149975299835205, + "learning_rate": 8.807262686692047e-06, + "loss": 0.4149, + "step": 13378 + }, + { + "epoch": 1.68, + "grad_norm": 4.292070388793945, + "learning_rate": 8.80642597163536e-06, + "loss": 0.3905, + "step": 13379 + }, + { + "epoch": 1.68, + "grad_norm": 16.486425399780273, + "learning_rate": 8.805589256578673e-06, + "loss": 1.3178, + "step": 13380 + }, + { + "epoch": 1.68, + "grad_norm": 13.734532356262207, + "learning_rate": 8.804752541521986e-06, + "loss": 0.4845, + "step": 13381 + }, + { + "epoch": 1.68, + "grad_norm": 12.013821601867676, + "learning_rate": 8.803915826465298e-06, + "loss": 1.778, + "step": 13382 + }, + { + "epoch": 1.68, + "grad_norm": 100.57466125488281, + "learning_rate": 8.80307911140861e-06, + "loss": 1.6756, + "step": 13383 + }, + { + "epoch": 1.68, + "grad_norm": 8.50030517578125, + "learning_rate": 8.802242396351924e-06, + "loss": 0.4496, + "step": 13384 + }, + { + "epoch": 1.68, + "grad_norm": 7.649212837219238, + "learning_rate": 8.801405681295236e-06, + "loss": 0.3725, + "step": 13385 + }, + { + "epoch": 1.68, + "grad_norm": 13.671998023986816, + "learning_rate": 8.800568966238548e-06, + "loss": 2.0125, + "step": 13386 + }, + { + "epoch": 1.68, + "grad_norm": 16.365331649780273, + "learning_rate": 8.799732251181861e-06, + "loss": 0.6007, + "step": 13387 + }, + { + "epoch": 1.68, + "grad_norm": 55.1285400390625, + "learning_rate": 8.798895536125173e-06, + "loss": 1.6335, + "step": 13388 + }, + { + "epoch": 1.68, + "grad_norm": 7.412749767303467, + "learning_rate": 8.798058821068485e-06, + "loss": 0.5369, + "step": 13389 + }, + { + "epoch": 1.68, + "grad_norm": 13.199882507324219, + "learning_rate": 8.797222106011797e-06, + "loss": 0.4003, + "step": 13390 + }, + { + "epoch": 1.68, + "grad_norm": 20.589523315429688, + "learning_rate": 8.796385390955111e-06, + "loss": 1.498, + "step": 13391 + }, + { + "epoch": 1.68, + "grad_norm": 19.743736267089844, + "learning_rate": 8.795548675898423e-06, + "loss": 1.5427, + "step": 13392 + }, + { + "epoch": 1.68, + "grad_norm": 17.00905418395996, + "learning_rate": 8.794711960841736e-06, + "loss": 0.7849, + "step": 13393 + }, + { + "epoch": 1.68, + "grad_norm": 11.008318901062012, + "learning_rate": 8.793875245785048e-06, + "loss": 1.1313, + "step": 13394 + }, + { + "epoch": 1.68, + "grad_norm": 18.286884307861328, + "learning_rate": 8.793038530728362e-06, + "loss": 2.9482, + "step": 13395 + }, + { + "epoch": 1.68, + "grad_norm": 29.307876586914062, + "learning_rate": 8.792201815671674e-06, + "loss": 3.9423, + "step": 13396 + }, + { + "epoch": 1.68, + "grad_norm": 24.198068618774414, + "learning_rate": 8.791365100614986e-06, + "loss": 1.1835, + "step": 13397 + }, + { + "epoch": 1.68, + "grad_norm": 20.103090286254883, + "learning_rate": 8.7905283855583e-06, + "loss": 0.616, + "step": 13398 + }, + { + "epoch": 1.68, + "grad_norm": 11.391027450561523, + "learning_rate": 8.789691670501612e-06, + "loss": 0.8341, + "step": 13399 + }, + { + "epoch": 1.68, + "grad_norm": 12.7701416015625, + "learning_rate": 8.788854955444924e-06, + "loss": 0.9425, + "step": 13400 + }, + { + "epoch": 1.68, + "grad_norm": 6.627573013305664, + "learning_rate": 8.788018240388237e-06, + "loss": 0.8287, + "step": 13401 + }, + { + "epoch": 1.68, + "grad_norm": 10.134284973144531, + "learning_rate": 8.787181525331549e-06, + "loss": 0.7226, + "step": 13402 + }, + { + "epoch": 1.68, + "grad_norm": 3.0420546531677246, + "learning_rate": 8.786344810274861e-06, + "loss": 0.0561, + "step": 13403 + }, + { + "epoch": 1.68, + "grad_norm": 26.259544372558594, + "learning_rate": 8.785508095218173e-06, + "loss": 2.305, + "step": 13404 + }, + { + "epoch": 1.68, + "grad_norm": 13.82083511352539, + "learning_rate": 8.784671380161487e-06, + "loss": 0.935, + "step": 13405 + }, + { + "epoch": 1.68, + "grad_norm": 30.167530059814453, + "learning_rate": 8.783834665104799e-06, + "loss": 2.9084, + "step": 13406 + }, + { + "epoch": 1.68, + "grad_norm": 27.253812789916992, + "learning_rate": 8.782997950048112e-06, + "loss": 3.7398, + "step": 13407 + }, + { + "epoch": 1.68, + "grad_norm": 9.45655632019043, + "learning_rate": 8.782161234991424e-06, + "loss": 1.2718, + "step": 13408 + }, + { + "epoch": 1.68, + "grad_norm": 40.28466796875, + "learning_rate": 8.781324519934738e-06, + "loss": 1.8136, + "step": 13409 + }, + { + "epoch": 1.68, + "grad_norm": 51.040706634521484, + "learning_rate": 8.78048780487805e-06, + "loss": 1.6632, + "step": 13410 + }, + { + "epoch": 1.68, + "grad_norm": 12.13768482208252, + "learning_rate": 8.779651089821362e-06, + "loss": 0.2764, + "step": 13411 + }, + { + "epoch": 1.68, + "grad_norm": 4.894547462463379, + "learning_rate": 8.778814374764675e-06, + "loss": 0.321, + "step": 13412 + }, + { + "epoch": 1.68, + "grad_norm": 31.434677124023438, + "learning_rate": 8.777977659707987e-06, + "loss": 1.4748, + "step": 13413 + }, + { + "epoch": 1.68, + "grad_norm": 6.621557712554932, + "learning_rate": 8.7771409446513e-06, + "loss": 1.5256, + "step": 13414 + }, + { + "epoch": 1.68, + "grad_norm": 26.227516174316406, + "learning_rate": 8.776304229594611e-06, + "loss": 1.5478, + "step": 13415 + }, + { + "epoch": 1.68, + "grad_norm": 6.326592922210693, + "learning_rate": 8.775467514537925e-06, + "loss": 0.4077, + "step": 13416 + }, + { + "epoch": 1.68, + "grad_norm": 82.8076171875, + "learning_rate": 8.774630799481237e-06, + "loss": 2.9694, + "step": 13417 + }, + { + "epoch": 1.68, + "grad_norm": 9.175357818603516, + "learning_rate": 8.773794084424549e-06, + "loss": 0.4495, + "step": 13418 + }, + { + "epoch": 1.68, + "grad_norm": 10.854291915893555, + "learning_rate": 8.772957369367863e-06, + "loss": 1.138, + "step": 13419 + }, + { + "epoch": 1.68, + "grad_norm": 9.000757217407227, + "learning_rate": 8.772120654311175e-06, + "loss": 0.3788, + "step": 13420 + }, + { + "epoch": 1.68, + "grad_norm": 21.460311889648438, + "learning_rate": 8.771283939254488e-06, + "loss": 2.3919, + "step": 13421 + }, + { + "epoch": 1.68, + "grad_norm": 12.737377166748047, + "learning_rate": 8.7704472241978e-06, + "loss": 0.9744, + "step": 13422 + }, + { + "epoch": 1.68, + "grad_norm": 28.311187744140625, + "learning_rate": 8.769610509141114e-06, + "loss": 1.5108, + "step": 13423 + }, + { + "epoch": 1.68, + "grad_norm": 12.060797691345215, + "learning_rate": 8.768773794084426e-06, + "loss": 1.4748, + "step": 13424 + }, + { + "epoch": 1.68, + "grad_norm": 10.785904884338379, + "learning_rate": 8.767937079027738e-06, + "loss": 0.7211, + "step": 13425 + }, + { + "epoch": 1.68, + "grad_norm": 13.229130744934082, + "learning_rate": 8.767100363971051e-06, + "loss": 0.6266, + "step": 13426 + }, + { + "epoch": 1.69, + "grad_norm": 18.435352325439453, + "learning_rate": 8.766263648914363e-06, + "loss": 0.82, + "step": 13427 + }, + { + "epoch": 1.69, + "grad_norm": 122.35250091552734, + "learning_rate": 8.765426933857675e-06, + "loss": 2.9313, + "step": 13428 + }, + { + "epoch": 1.69, + "grad_norm": 9.277535438537598, + "learning_rate": 8.764590218800987e-06, + "loss": 1.6462, + "step": 13429 + }, + { + "epoch": 1.69, + "grad_norm": 6.448642730712891, + "learning_rate": 8.7637535037443e-06, + "loss": 0.809, + "step": 13430 + }, + { + "epoch": 1.69, + "grad_norm": 28.299129486083984, + "learning_rate": 8.762916788687613e-06, + "loss": 2.3807, + "step": 13431 + }, + { + "epoch": 1.69, + "grad_norm": 4.615137577056885, + "learning_rate": 8.762080073630925e-06, + "loss": 1.7854, + "step": 13432 + }, + { + "epoch": 1.69, + "grad_norm": 14.869714736938477, + "learning_rate": 8.761243358574238e-06, + "loss": 0.9222, + "step": 13433 + }, + { + "epoch": 1.69, + "grad_norm": 5.068790435791016, + "learning_rate": 8.76040664351755e-06, + "loss": 0.463, + "step": 13434 + }, + { + "epoch": 1.69, + "grad_norm": 244.9357452392578, + "learning_rate": 8.759569928460864e-06, + "loss": 2.3795, + "step": 13435 + }, + { + "epoch": 1.69, + "grad_norm": 11.517192840576172, + "learning_rate": 8.758733213404176e-06, + "loss": 1.8589, + "step": 13436 + }, + { + "epoch": 1.69, + "grad_norm": 2.460245370864868, + "learning_rate": 8.75789649834749e-06, + "loss": 0.1194, + "step": 13437 + }, + { + "epoch": 1.69, + "grad_norm": 37.955230712890625, + "learning_rate": 8.757059783290802e-06, + "loss": 0.4742, + "step": 13438 + }, + { + "epoch": 1.69, + "grad_norm": 14.026997566223145, + "learning_rate": 8.756223068234113e-06, + "loss": 0.9611, + "step": 13439 + }, + { + "epoch": 1.69, + "grad_norm": 13.362473487854004, + "learning_rate": 8.755386353177427e-06, + "loss": 0.529, + "step": 13440 + }, + { + "epoch": 1.69, + "grad_norm": 23.569236755371094, + "learning_rate": 8.754549638120739e-06, + "loss": 1.5198, + "step": 13441 + }, + { + "epoch": 1.69, + "grad_norm": 11.897934913635254, + "learning_rate": 8.753712923064051e-06, + "loss": 0.881, + "step": 13442 + }, + { + "epoch": 1.69, + "grad_norm": 12.302495002746582, + "learning_rate": 8.752876208007363e-06, + "loss": 0.9799, + "step": 13443 + }, + { + "epoch": 1.69, + "grad_norm": 33.61111068725586, + "learning_rate": 8.752039492950677e-06, + "loss": 1.3375, + "step": 13444 + }, + { + "epoch": 1.69, + "grad_norm": 29.497589111328125, + "learning_rate": 8.751202777893989e-06, + "loss": 2.4669, + "step": 13445 + }, + { + "epoch": 1.69, + "grad_norm": 28.58656883239746, + "learning_rate": 8.7503660628373e-06, + "loss": 1.0672, + "step": 13446 + }, + { + "epoch": 1.69, + "grad_norm": 4.034222602844238, + "learning_rate": 8.749529347780614e-06, + "loss": 0.1654, + "step": 13447 + }, + { + "epoch": 1.69, + "grad_norm": 7.035055160522461, + "learning_rate": 8.748692632723926e-06, + "loss": 0.1728, + "step": 13448 + }, + { + "epoch": 1.69, + "grad_norm": 12.092005729675293, + "learning_rate": 8.74785591766724e-06, + "loss": 1.5952, + "step": 13449 + }, + { + "epoch": 1.69, + "grad_norm": 12.995216369628906, + "learning_rate": 8.747019202610552e-06, + "loss": 0.4874, + "step": 13450 + }, + { + "epoch": 1.69, + "grad_norm": 11.660663604736328, + "learning_rate": 8.746182487553865e-06, + "loss": 0.5306, + "step": 13451 + }, + { + "epoch": 1.69, + "grad_norm": 11.609655380249023, + "learning_rate": 8.745345772497177e-06, + "loss": 0.9101, + "step": 13452 + }, + { + "epoch": 1.69, + "grad_norm": 11.562085151672363, + "learning_rate": 8.74450905744049e-06, + "loss": 2.1206, + "step": 13453 + }, + { + "epoch": 1.69, + "grad_norm": 22.926427841186523, + "learning_rate": 8.743672342383801e-06, + "loss": 1.2818, + "step": 13454 + }, + { + "epoch": 1.69, + "grad_norm": 19.309673309326172, + "learning_rate": 8.742835627327115e-06, + "loss": 0.8676, + "step": 13455 + }, + { + "epoch": 1.69, + "grad_norm": 11.054777145385742, + "learning_rate": 8.741998912270427e-06, + "loss": 1.3197, + "step": 13456 + }, + { + "epoch": 1.69, + "grad_norm": 4.592566967010498, + "learning_rate": 8.741162197213739e-06, + "loss": 0.1495, + "step": 13457 + }, + { + "epoch": 1.69, + "grad_norm": 10.187602996826172, + "learning_rate": 8.740325482157052e-06, + "loss": 3.2393, + "step": 13458 + }, + { + "epoch": 1.69, + "grad_norm": 14.268762588500977, + "learning_rate": 8.739488767100364e-06, + "loss": 0.5333, + "step": 13459 + }, + { + "epoch": 1.69, + "grad_norm": 13.50385570526123, + "learning_rate": 8.738652052043676e-06, + "loss": 0.6521, + "step": 13460 + }, + { + "epoch": 1.69, + "grad_norm": 7.107166290283203, + "learning_rate": 8.73781533698699e-06, + "loss": 0.3628, + "step": 13461 + }, + { + "epoch": 1.69, + "grad_norm": 188.79522705078125, + "learning_rate": 8.736978621930302e-06, + "loss": 1.6856, + "step": 13462 + }, + { + "epoch": 1.69, + "grad_norm": 10.642914772033691, + "learning_rate": 8.736141906873616e-06, + "loss": 1.4633, + "step": 13463 + }, + { + "epoch": 1.69, + "grad_norm": 19.645877838134766, + "learning_rate": 8.735305191816928e-06, + "loss": 3.3901, + "step": 13464 + }, + { + "epoch": 1.69, + "grad_norm": 50.0162467956543, + "learning_rate": 8.734468476760241e-06, + "loss": 1.942, + "step": 13465 + }, + { + "epoch": 1.69, + "grad_norm": 10.19096851348877, + "learning_rate": 8.733631761703553e-06, + "loss": 0.5044, + "step": 13466 + }, + { + "epoch": 1.69, + "grad_norm": 12.090299606323242, + "learning_rate": 8.732795046646865e-06, + "loss": 0.871, + "step": 13467 + }, + { + "epoch": 1.69, + "grad_norm": 22.82764434814453, + "learning_rate": 8.731958331590177e-06, + "loss": 1.4258, + "step": 13468 + }, + { + "epoch": 1.69, + "grad_norm": 8.270901679992676, + "learning_rate": 8.73112161653349e-06, + "loss": 0.4719, + "step": 13469 + }, + { + "epoch": 1.69, + "grad_norm": 27.25560760498047, + "learning_rate": 8.730284901476803e-06, + "loss": 2.2294, + "step": 13470 + }, + { + "epoch": 1.69, + "grad_norm": 11.683439254760742, + "learning_rate": 8.729448186420115e-06, + "loss": 1.1828, + "step": 13471 + }, + { + "epoch": 1.69, + "grad_norm": 26.004322052001953, + "learning_rate": 8.728611471363427e-06, + "loss": 0.7483, + "step": 13472 + }, + { + "epoch": 1.69, + "grad_norm": 24.552906036376953, + "learning_rate": 8.72777475630674e-06, + "loss": 1.509, + "step": 13473 + }, + { + "epoch": 1.69, + "grad_norm": 4.466454029083252, + "learning_rate": 8.726938041250052e-06, + "loss": 0.2364, + "step": 13474 + }, + { + "epoch": 1.69, + "grad_norm": 12.081656455993652, + "learning_rate": 8.726101326193366e-06, + "loss": 0.5443, + "step": 13475 + }, + { + "epoch": 1.69, + "grad_norm": 9.333365440368652, + "learning_rate": 8.725264611136678e-06, + "loss": 0.5033, + "step": 13476 + }, + { + "epoch": 1.69, + "grad_norm": 8.717599868774414, + "learning_rate": 8.724427896079991e-06, + "loss": 0.3139, + "step": 13477 + }, + { + "epoch": 1.69, + "grad_norm": 7.948026657104492, + "learning_rate": 8.723591181023303e-06, + "loss": 0.8541, + "step": 13478 + }, + { + "epoch": 1.69, + "grad_norm": 8.756032943725586, + "learning_rate": 8.722754465966615e-06, + "loss": 0.3445, + "step": 13479 + }, + { + "epoch": 1.69, + "grad_norm": 6.668010234832764, + "learning_rate": 8.721917750909929e-06, + "loss": 1.1067, + "step": 13480 + }, + { + "epoch": 1.69, + "grad_norm": 13.908268928527832, + "learning_rate": 8.721081035853241e-06, + "loss": 1.2556, + "step": 13481 + }, + { + "epoch": 1.69, + "grad_norm": 12.909756660461426, + "learning_rate": 8.720244320796553e-06, + "loss": 0.3901, + "step": 13482 + }, + { + "epoch": 1.69, + "grad_norm": 8.64794635772705, + "learning_rate": 8.719407605739867e-06, + "loss": 1.385, + "step": 13483 + }, + { + "epoch": 1.69, + "grad_norm": 27.266633987426758, + "learning_rate": 8.718570890683179e-06, + "loss": 1.5704, + "step": 13484 + }, + { + "epoch": 1.69, + "grad_norm": 7.373408794403076, + "learning_rate": 8.71773417562649e-06, + "loss": 0.3352, + "step": 13485 + }, + { + "epoch": 1.69, + "grad_norm": 8.586067199707031, + "learning_rate": 8.716897460569802e-06, + "loss": 1.0504, + "step": 13486 + }, + { + "epoch": 1.69, + "grad_norm": 19.755983352661133, + "learning_rate": 8.716060745513116e-06, + "loss": 0.7271, + "step": 13487 + }, + { + "epoch": 1.69, + "grad_norm": 13.576322555541992, + "learning_rate": 8.715224030456428e-06, + "loss": 0.3314, + "step": 13488 + }, + { + "epoch": 1.69, + "grad_norm": 14.759033203125, + "learning_rate": 8.714387315399742e-06, + "loss": 0.9154, + "step": 13489 + }, + { + "epoch": 1.69, + "grad_norm": 46.80823516845703, + "learning_rate": 8.713550600343054e-06, + "loss": 3.4295, + "step": 13490 + }, + { + "epoch": 1.69, + "grad_norm": 14.86972713470459, + "learning_rate": 8.712713885286367e-06, + "loss": 1.122, + "step": 13491 + }, + { + "epoch": 1.69, + "grad_norm": 21.130163192749023, + "learning_rate": 8.71187717022968e-06, + "loss": 1.1324, + "step": 13492 + }, + { + "epoch": 1.69, + "grad_norm": 20.94374656677246, + "learning_rate": 8.711040455172991e-06, + "loss": 1.0438, + "step": 13493 + }, + { + "epoch": 1.69, + "grad_norm": 22.614845275878906, + "learning_rate": 8.710203740116305e-06, + "loss": 1.3184, + "step": 13494 + }, + { + "epoch": 1.69, + "grad_norm": 20.864540100097656, + "learning_rate": 8.709367025059617e-06, + "loss": 0.6603, + "step": 13495 + }, + { + "epoch": 1.69, + "grad_norm": 10.283666610717773, + "learning_rate": 8.708530310002929e-06, + "loss": 0.8978, + "step": 13496 + }, + { + "epoch": 1.69, + "grad_norm": 121.8965072631836, + "learning_rate": 8.70769359494624e-06, + "loss": 1.1968, + "step": 13497 + }, + { + "epoch": 1.69, + "grad_norm": 26.533832550048828, + "learning_rate": 8.706856879889554e-06, + "loss": 1.3313, + "step": 13498 + }, + { + "epoch": 1.69, + "grad_norm": 92.5091552734375, + "learning_rate": 8.706020164832866e-06, + "loss": 1.9148, + "step": 13499 + }, + { + "epoch": 1.69, + "grad_norm": 16.19998550415039, + "learning_rate": 8.705183449776178e-06, + "loss": 1.619, + "step": 13500 + }, + { + "epoch": 1.69, + "grad_norm": 15.971726417541504, + "learning_rate": 8.704346734719492e-06, + "loss": 1.1749, + "step": 13501 + }, + { + "epoch": 1.69, + "grad_norm": 19.345623016357422, + "learning_rate": 8.703510019662804e-06, + "loss": 1.0511, + "step": 13502 + }, + { + "epoch": 1.69, + "grad_norm": 6.182967662811279, + "learning_rate": 8.702673304606118e-06, + "loss": 0.93, + "step": 13503 + }, + { + "epoch": 1.69, + "grad_norm": 9.37590503692627, + "learning_rate": 8.70183658954943e-06, + "loss": 0.3127, + "step": 13504 + }, + { + "epoch": 1.69, + "grad_norm": 7.4262542724609375, + "learning_rate": 8.700999874492743e-06, + "loss": 0.2227, + "step": 13505 + }, + { + "epoch": 1.69, + "grad_norm": 7.645088195800781, + "learning_rate": 8.700163159436055e-06, + "loss": 0.5829, + "step": 13506 + }, + { + "epoch": 1.7, + "grad_norm": 42.65239715576172, + "learning_rate": 8.699326444379367e-06, + "loss": 2.4669, + "step": 13507 + }, + { + "epoch": 1.7, + "grad_norm": 8.191194534301758, + "learning_rate": 8.69848972932268e-06, + "loss": 2.1358, + "step": 13508 + }, + { + "epoch": 1.7, + "grad_norm": 24.681190490722656, + "learning_rate": 8.697653014265993e-06, + "loss": 1.0717, + "step": 13509 + }, + { + "epoch": 1.7, + "grad_norm": 11.34162712097168, + "learning_rate": 8.696816299209305e-06, + "loss": 0.6754, + "step": 13510 + }, + { + "epoch": 1.7, + "grad_norm": 31.257905960083008, + "learning_rate": 8.695979584152617e-06, + "loss": 2.1831, + "step": 13511 + }, + { + "epoch": 1.7, + "grad_norm": 41.80419921875, + "learning_rate": 8.69514286909593e-06, + "loss": 1.0436, + "step": 13512 + }, + { + "epoch": 1.7, + "grad_norm": 17.46442985534668, + "learning_rate": 8.694306154039242e-06, + "loss": 1.6393, + "step": 13513 + }, + { + "epoch": 1.7, + "grad_norm": 20.784839630126953, + "learning_rate": 8.693469438982554e-06, + "loss": 1.292, + "step": 13514 + }, + { + "epoch": 1.7, + "grad_norm": 18.792034149169922, + "learning_rate": 8.692632723925868e-06, + "loss": 1.3554, + "step": 13515 + }, + { + "epoch": 1.7, + "grad_norm": 7.121496200561523, + "learning_rate": 8.69179600886918e-06, + "loss": 0.3784, + "step": 13516 + }, + { + "epoch": 1.7, + "grad_norm": 48.75419616699219, + "learning_rate": 8.690959293812493e-06, + "loss": 1.5, + "step": 13517 + }, + { + "epoch": 1.7, + "grad_norm": 13.551359176635742, + "learning_rate": 8.690122578755805e-06, + "loss": 0.667, + "step": 13518 + }, + { + "epoch": 1.7, + "grad_norm": 6.288450717926025, + "learning_rate": 8.689285863699119e-06, + "loss": 0.9067, + "step": 13519 + }, + { + "epoch": 1.7, + "grad_norm": 11.752936363220215, + "learning_rate": 8.688449148642431e-06, + "loss": 0.7502, + "step": 13520 + }, + { + "epoch": 1.7, + "grad_norm": 7.777066707611084, + "learning_rate": 8.687612433585743e-06, + "loss": 0.2846, + "step": 13521 + }, + { + "epoch": 1.7, + "grad_norm": 42.891937255859375, + "learning_rate": 8.686775718529057e-06, + "loss": 3.0157, + "step": 13522 + }, + { + "epoch": 1.7, + "grad_norm": 12.691798210144043, + "learning_rate": 8.685939003472369e-06, + "loss": 1.5813, + "step": 13523 + }, + { + "epoch": 1.7, + "grad_norm": 27.807815551757812, + "learning_rate": 8.68510228841568e-06, + "loss": 1.3952, + "step": 13524 + }, + { + "epoch": 1.7, + "grad_norm": 13.172384262084961, + "learning_rate": 8.684265573358992e-06, + "loss": 0.8702, + "step": 13525 + }, + { + "epoch": 1.7, + "grad_norm": 6.7595343589782715, + "learning_rate": 8.683428858302306e-06, + "loss": 0.22, + "step": 13526 + }, + { + "epoch": 1.7, + "grad_norm": 11.891069412231445, + "learning_rate": 8.682592143245618e-06, + "loss": 1.4104, + "step": 13527 + }, + { + "epoch": 1.7, + "grad_norm": 14.212307929992676, + "learning_rate": 8.68175542818893e-06, + "loss": 1.5403, + "step": 13528 + }, + { + "epoch": 1.7, + "grad_norm": 16.828903198242188, + "learning_rate": 8.680918713132244e-06, + "loss": 0.9557, + "step": 13529 + }, + { + "epoch": 1.7, + "grad_norm": 49.076622009277344, + "learning_rate": 8.680081998075556e-06, + "loss": 0.8234, + "step": 13530 + }, + { + "epoch": 1.7, + "grad_norm": 15.557707786560059, + "learning_rate": 8.67924528301887e-06, + "loss": 1.0391, + "step": 13531 + }, + { + "epoch": 1.7, + "grad_norm": 10.3461332321167, + "learning_rate": 8.678408567962181e-06, + "loss": 0.6084, + "step": 13532 + }, + { + "epoch": 1.7, + "grad_norm": 8.129873275756836, + "learning_rate": 8.677571852905495e-06, + "loss": 1.2939, + "step": 13533 + }, + { + "epoch": 1.7, + "grad_norm": 32.63777160644531, + "learning_rate": 8.676735137848807e-06, + "loss": 1.4533, + "step": 13534 + }, + { + "epoch": 1.7, + "grad_norm": 9.426362991333008, + "learning_rate": 8.675898422792119e-06, + "loss": 0.9518, + "step": 13535 + }, + { + "epoch": 1.7, + "grad_norm": 19.661296844482422, + "learning_rate": 8.67506170773543e-06, + "loss": 2.0506, + "step": 13536 + }, + { + "epoch": 1.7, + "grad_norm": 23.262487411499023, + "learning_rate": 8.674224992678744e-06, + "loss": 1.1738, + "step": 13537 + }, + { + "epoch": 1.7, + "grad_norm": 9.202347755432129, + "learning_rate": 8.673388277622056e-06, + "loss": 1.0119, + "step": 13538 + }, + { + "epoch": 1.7, + "grad_norm": 24.753644943237305, + "learning_rate": 8.672551562565368e-06, + "loss": 1.5677, + "step": 13539 + }, + { + "epoch": 1.7, + "grad_norm": 5.492506980895996, + "learning_rate": 8.671714847508682e-06, + "loss": 0.655, + "step": 13540 + }, + { + "epoch": 1.7, + "grad_norm": 21.3233585357666, + "learning_rate": 8.670878132451994e-06, + "loss": 1.0742, + "step": 13541 + }, + { + "epoch": 1.7, + "grad_norm": 11.688346862792969, + "learning_rate": 8.670041417395306e-06, + "loss": 0.7168, + "step": 13542 + }, + { + "epoch": 1.7, + "grad_norm": 5.929947853088379, + "learning_rate": 8.66920470233862e-06, + "loss": 0.6347, + "step": 13543 + }, + { + "epoch": 1.7, + "grad_norm": 11.265826225280762, + "learning_rate": 8.668367987281931e-06, + "loss": 1.0965, + "step": 13544 + }, + { + "epoch": 1.7, + "grad_norm": 11.524782180786133, + "learning_rate": 8.667531272225245e-06, + "loss": 1.0063, + "step": 13545 + }, + { + "epoch": 1.7, + "grad_norm": 11.861703872680664, + "learning_rate": 8.666694557168557e-06, + "loss": 1.071, + "step": 13546 + }, + { + "epoch": 1.7, + "grad_norm": 21.725366592407227, + "learning_rate": 8.66585784211187e-06, + "loss": 0.9319, + "step": 13547 + }, + { + "epoch": 1.7, + "grad_norm": 20.31465721130371, + "learning_rate": 8.665021127055183e-06, + "loss": 0.9347, + "step": 13548 + }, + { + "epoch": 1.7, + "grad_norm": 18.960803985595703, + "learning_rate": 8.664184411998495e-06, + "loss": 1.7244, + "step": 13549 + }, + { + "epoch": 1.7, + "grad_norm": 11.230573654174805, + "learning_rate": 8.663347696941807e-06, + "loss": 0.8174, + "step": 13550 + }, + { + "epoch": 1.7, + "grad_norm": 11.02849006652832, + "learning_rate": 8.66251098188512e-06, + "loss": 1.5155, + "step": 13551 + }, + { + "epoch": 1.7, + "grad_norm": 18.773893356323242, + "learning_rate": 8.661674266828432e-06, + "loss": 1.5971, + "step": 13552 + }, + { + "epoch": 1.7, + "grad_norm": 26.453706741333008, + "learning_rate": 8.660837551771744e-06, + "loss": 1.0111, + "step": 13553 + }, + { + "epoch": 1.7, + "grad_norm": 15.818222999572754, + "learning_rate": 8.660000836715058e-06, + "loss": 1.148, + "step": 13554 + }, + { + "epoch": 1.7, + "grad_norm": 12.550549507141113, + "learning_rate": 8.65916412165837e-06, + "loss": 0.7964, + "step": 13555 + }, + { + "epoch": 1.7, + "grad_norm": 2.0923314094543457, + "learning_rate": 8.658327406601682e-06, + "loss": 0.1456, + "step": 13556 + }, + { + "epoch": 1.7, + "grad_norm": 8.524921417236328, + "learning_rate": 8.657490691544995e-06, + "loss": 1.7639, + "step": 13557 + }, + { + "epoch": 1.7, + "grad_norm": 8.678040504455566, + "learning_rate": 8.656653976488307e-06, + "loss": 1.8976, + "step": 13558 + }, + { + "epoch": 1.7, + "grad_norm": 13.557211875915527, + "learning_rate": 8.655817261431621e-06, + "loss": 0.3274, + "step": 13559 + }, + { + "epoch": 1.7, + "grad_norm": 14.539973258972168, + "learning_rate": 8.654980546374933e-06, + "loss": 0.8182, + "step": 13560 + }, + { + "epoch": 1.7, + "grad_norm": 228.64242553710938, + "learning_rate": 8.654143831318247e-06, + "loss": 2.2122, + "step": 13561 + }, + { + "epoch": 1.7, + "grad_norm": 16.100831985473633, + "learning_rate": 8.653307116261558e-06, + "loss": 1.5305, + "step": 13562 + }, + { + "epoch": 1.7, + "grad_norm": 13.376947402954102, + "learning_rate": 8.65247040120487e-06, + "loss": 0.4518, + "step": 13563 + }, + { + "epoch": 1.7, + "grad_norm": 10.532651901245117, + "learning_rate": 8.651633686148182e-06, + "loss": 0.4058, + "step": 13564 + }, + { + "epoch": 1.7, + "grad_norm": 10.986957550048828, + "learning_rate": 8.650796971091496e-06, + "loss": 0.8033, + "step": 13565 + }, + { + "epoch": 1.7, + "grad_norm": 19.906335830688477, + "learning_rate": 8.649960256034808e-06, + "loss": 1.8828, + "step": 13566 + }, + { + "epoch": 1.7, + "grad_norm": 11.025758743286133, + "learning_rate": 8.64912354097812e-06, + "loss": 1.6456, + "step": 13567 + }, + { + "epoch": 1.7, + "grad_norm": 39.448062896728516, + "learning_rate": 8.648286825921434e-06, + "loss": 1.6568, + "step": 13568 + }, + { + "epoch": 1.7, + "grad_norm": 12.004514694213867, + "learning_rate": 8.647450110864746e-06, + "loss": 0.5564, + "step": 13569 + }, + { + "epoch": 1.7, + "grad_norm": 39.72369384765625, + "learning_rate": 8.646613395808058e-06, + "loss": 0.8057, + "step": 13570 + }, + { + "epoch": 1.7, + "grad_norm": 6.89996862411499, + "learning_rate": 8.645776680751371e-06, + "loss": 0.4684, + "step": 13571 + }, + { + "epoch": 1.7, + "grad_norm": 13.687106132507324, + "learning_rate": 8.644939965694683e-06, + "loss": 0.7281, + "step": 13572 + }, + { + "epoch": 1.7, + "grad_norm": 17.127885818481445, + "learning_rate": 8.644103250637997e-06, + "loss": 0.953, + "step": 13573 + }, + { + "epoch": 1.7, + "grad_norm": 11.036026954650879, + "learning_rate": 8.643266535581309e-06, + "loss": 1.4963, + "step": 13574 + }, + { + "epoch": 1.7, + "grad_norm": 9.478490829467773, + "learning_rate": 8.642429820524622e-06, + "loss": 1.2502, + "step": 13575 + }, + { + "epoch": 1.7, + "grad_norm": 14.2973051071167, + "learning_rate": 8.641593105467934e-06, + "loss": 1.4743, + "step": 13576 + }, + { + "epoch": 1.7, + "grad_norm": 19.452285766601562, + "learning_rate": 8.640756390411246e-06, + "loss": 1.4742, + "step": 13577 + }, + { + "epoch": 1.7, + "grad_norm": 43.88484573364258, + "learning_rate": 8.639919675354558e-06, + "loss": 1.2541, + "step": 13578 + }, + { + "epoch": 1.7, + "grad_norm": 44.801395416259766, + "learning_rate": 8.63908296029787e-06, + "loss": 1.1219, + "step": 13579 + }, + { + "epoch": 1.7, + "grad_norm": 11.87891674041748, + "learning_rate": 8.638246245241184e-06, + "loss": 0.8183, + "step": 13580 + }, + { + "epoch": 1.7, + "grad_norm": 7.8933000564575195, + "learning_rate": 8.637409530184496e-06, + "loss": 0.329, + "step": 13581 + }, + { + "epoch": 1.7, + "grad_norm": 17.78415870666504, + "learning_rate": 8.63657281512781e-06, + "loss": 0.8824, + "step": 13582 + }, + { + "epoch": 1.7, + "grad_norm": 24.023479461669922, + "learning_rate": 8.635736100071121e-06, + "loss": 0.4031, + "step": 13583 + }, + { + "epoch": 1.7, + "grad_norm": 8.85848617553711, + "learning_rate": 8.634899385014433e-06, + "loss": 0.1393, + "step": 13584 + }, + { + "epoch": 1.7, + "grad_norm": 19.534149169921875, + "learning_rate": 8.634062669957747e-06, + "loss": 2.2703, + "step": 13585 + }, + { + "epoch": 1.71, + "grad_norm": 53.858760833740234, + "learning_rate": 8.633225954901059e-06, + "loss": 2.1575, + "step": 13586 + }, + { + "epoch": 1.71, + "grad_norm": 15.325075149536133, + "learning_rate": 8.632389239844373e-06, + "loss": 2.039, + "step": 13587 + }, + { + "epoch": 1.71, + "grad_norm": 12.996599197387695, + "learning_rate": 8.631552524787685e-06, + "loss": 0.7443, + "step": 13588 + }, + { + "epoch": 1.71, + "grad_norm": 16.511795043945312, + "learning_rate": 8.630715809730996e-06, + "loss": 1.399, + "step": 13589 + }, + { + "epoch": 1.71, + "grad_norm": 21.71929168701172, + "learning_rate": 8.62987909467431e-06, + "loss": 0.854, + "step": 13590 + }, + { + "epoch": 1.71, + "grad_norm": 11.38278865814209, + "learning_rate": 8.629042379617622e-06, + "loss": 1.0666, + "step": 13591 + }, + { + "epoch": 1.71, + "grad_norm": 29.05997085571289, + "learning_rate": 8.628205664560934e-06, + "loss": 1.1241, + "step": 13592 + }, + { + "epoch": 1.71, + "grad_norm": 16.693775177001953, + "learning_rate": 8.627368949504246e-06, + "loss": 0.8409, + "step": 13593 + }, + { + "epoch": 1.71, + "grad_norm": 9.487668991088867, + "learning_rate": 8.62653223444756e-06, + "loss": 0.4452, + "step": 13594 + }, + { + "epoch": 1.71, + "grad_norm": 11.088615417480469, + "learning_rate": 8.625695519390872e-06, + "loss": 1.0518, + "step": 13595 + }, + { + "epoch": 1.71, + "grad_norm": 10.214578628540039, + "learning_rate": 8.624858804334185e-06, + "loss": 1.0723, + "step": 13596 + }, + { + "epoch": 1.71, + "grad_norm": 23.00535774230957, + "learning_rate": 8.624022089277497e-06, + "loss": 2.3293, + "step": 13597 + }, + { + "epoch": 1.71, + "grad_norm": 16.56564712524414, + "learning_rate": 8.62318537422081e-06, + "loss": 1.2909, + "step": 13598 + }, + { + "epoch": 1.71, + "grad_norm": 23.9890079498291, + "learning_rate": 8.622348659164123e-06, + "loss": 1.214, + "step": 13599 + }, + { + "epoch": 1.71, + "grad_norm": 17.81267738342285, + "learning_rate": 8.621511944107435e-06, + "loss": 1.1028, + "step": 13600 + }, + { + "epoch": 1.71, + "eval_loss": 0.08200293779373169, + "eval_runtime": 94.6835, + "eval_samples_per_second": 37.409, + "eval_steps_per_second": 37.409, + "step": 13600 + }, + { + "epoch": 1.71, + "grad_norm": 15.171854019165039, + "learning_rate": 8.620675229050748e-06, + "loss": 1.6704, + "step": 13601 + }, + { + "epoch": 1.71, + "grad_norm": 13.904727935791016, + "learning_rate": 8.61983851399406e-06, + "loss": 0.8487, + "step": 13602 + }, + { + "epoch": 1.71, + "grad_norm": 22.334592819213867, + "learning_rate": 8.619001798937372e-06, + "loss": 2.0529, + "step": 13603 + }, + { + "epoch": 1.71, + "grad_norm": 17.451332092285156, + "learning_rate": 8.618165083880686e-06, + "loss": 1.4637, + "step": 13604 + }, + { + "epoch": 1.71, + "grad_norm": 18.929479598999023, + "learning_rate": 8.617328368823998e-06, + "loss": 1.593, + "step": 13605 + }, + { + "epoch": 1.71, + "grad_norm": 46.291099548339844, + "learning_rate": 8.61649165376731e-06, + "loss": 1.761, + "step": 13606 + }, + { + "epoch": 1.71, + "grad_norm": 11.363897323608398, + "learning_rate": 8.615654938710622e-06, + "loss": 1.5096, + "step": 13607 + }, + { + "epoch": 1.71, + "grad_norm": 230.1785888671875, + "learning_rate": 8.614818223653935e-06, + "loss": 1.7891, + "step": 13608 + }, + { + "epoch": 1.71, + "grad_norm": 7.91306734085083, + "learning_rate": 8.613981508597247e-06, + "loss": 0.4705, + "step": 13609 + }, + { + "epoch": 1.71, + "grad_norm": 14.959234237670898, + "learning_rate": 8.613144793540561e-06, + "loss": 1.323, + "step": 13610 + }, + { + "epoch": 1.71, + "grad_norm": 7.350861072540283, + "learning_rate": 8.612308078483873e-06, + "loss": 1.33, + "step": 13611 + }, + { + "epoch": 1.71, + "grad_norm": 5.810110569000244, + "learning_rate": 8.611471363427185e-06, + "loss": 0.614, + "step": 13612 + }, + { + "epoch": 1.71, + "grad_norm": 5.175986289978027, + "learning_rate": 8.610634648370499e-06, + "loss": 0.7721, + "step": 13613 + }, + { + "epoch": 1.71, + "grad_norm": 15.35120964050293, + "learning_rate": 8.60979793331381e-06, + "loss": 2.745, + "step": 13614 + }, + { + "epoch": 1.71, + "grad_norm": 32.70784378051758, + "learning_rate": 8.608961218257124e-06, + "loss": 1.5796, + "step": 13615 + }, + { + "epoch": 1.71, + "grad_norm": 8.75576114654541, + "learning_rate": 8.608124503200436e-06, + "loss": 0.3475, + "step": 13616 + }, + { + "epoch": 1.71, + "grad_norm": 111.13134002685547, + "learning_rate": 8.607287788143748e-06, + "loss": 2.3101, + "step": 13617 + }, + { + "epoch": 1.71, + "grad_norm": 6.689243316650391, + "learning_rate": 8.60645107308706e-06, + "loss": 0.427, + "step": 13618 + }, + { + "epoch": 1.71, + "grad_norm": 1.947023868560791, + "learning_rate": 8.605614358030374e-06, + "loss": 0.0379, + "step": 13619 + }, + { + "epoch": 1.71, + "grad_norm": 23.508211135864258, + "learning_rate": 8.604777642973686e-06, + "loss": 0.5596, + "step": 13620 + }, + { + "epoch": 1.71, + "grad_norm": 22.981761932373047, + "learning_rate": 8.603940927916998e-06, + "loss": 1.0385, + "step": 13621 + }, + { + "epoch": 1.71, + "grad_norm": 8.272953033447266, + "learning_rate": 8.603104212860311e-06, + "loss": 0.659, + "step": 13622 + }, + { + "epoch": 1.71, + "grad_norm": 35.652923583984375, + "learning_rate": 8.602267497803623e-06, + "loss": 1.2104, + "step": 13623 + }, + { + "epoch": 1.71, + "grad_norm": 12.808666229248047, + "learning_rate": 8.601430782746937e-06, + "loss": 1.5748, + "step": 13624 + }, + { + "epoch": 1.71, + "grad_norm": 24.261981964111328, + "learning_rate": 8.600594067690249e-06, + "loss": 2.4687, + "step": 13625 + }, + { + "epoch": 1.71, + "grad_norm": 18.71622085571289, + "learning_rate": 8.599757352633561e-06, + "loss": 1.5981, + "step": 13626 + }, + { + "epoch": 1.71, + "grad_norm": 9.795702934265137, + "learning_rate": 8.598920637576874e-06, + "loss": 1.047, + "step": 13627 + }, + { + "epoch": 1.71, + "grad_norm": 7.158591270446777, + "learning_rate": 8.598083922520186e-06, + "loss": 0.5137, + "step": 13628 + }, + { + "epoch": 1.71, + "grad_norm": 22.881824493408203, + "learning_rate": 8.5972472074635e-06, + "loss": 0.3531, + "step": 13629 + }, + { + "epoch": 1.71, + "grad_norm": 10.794554710388184, + "learning_rate": 8.596410492406812e-06, + "loss": 0.9378, + "step": 13630 + }, + { + "epoch": 1.71, + "grad_norm": 24.48370933532715, + "learning_rate": 8.595573777350124e-06, + "loss": 1.6802, + "step": 13631 + }, + { + "epoch": 1.71, + "grad_norm": 8.828770637512207, + "learning_rate": 8.594737062293436e-06, + "loss": 0.6779, + "step": 13632 + }, + { + "epoch": 1.71, + "grad_norm": 28.461143493652344, + "learning_rate": 8.59390034723675e-06, + "loss": 1.9469, + "step": 13633 + }, + { + "epoch": 1.71, + "grad_norm": 52.104888916015625, + "learning_rate": 8.593063632180062e-06, + "loss": 3.3121, + "step": 13634 + }, + { + "epoch": 1.71, + "grad_norm": 8.891456604003906, + "learning_rate": 8.592226917123374e-06, + "loss": 0.7873, + "step": 13635 + }, + { + "epoch": 1.71, + "grad_norm": 16.9071044921875, + "learning_rate": 8.591390202066687e-06, + "loss": 1.2607, + "step": 13636 + }, + { + "epoch": 1.71, + "grad_norm": 6.1126909255981445, + "learning_rate": 8.590553487009999e-06, + "loss": 0.8182, + "step": 13637 + }, + { + "epoch": 1.71, + "grad_norm": 9.156928062438965, + "learning_rate": 8.589716771953313e-06, + "loss": 0.7253, + "step": 13638 + }, + { + "epoch": 1.71, + "grad_norm": 48.94029998779297, + "learning_rate": 8.588880056896625e-06, + "loss": 2.9874, + "step": 13639 + }, + { + "epoch": 1.71, + "grad_norm": 12.007301330566406, + "learning_rate": 8.588043341839937e-06, + "loss": 0.8996, + "step": 13640 + }, + { + "epoch": 1.71, + "grad_norm": 19.267536163330078, + "learning_rate": 8.58720662678325e-06, + "loss": 0.8171, + "step": 13641 + }, + { + "epoch": 1.71, + "grad_norm": 16.437095642089844, + "learning_rate": 8.586369911726562e-06, + "loss": 1.6608, + "step": 13642 + }, + { + "epoch": 1.71, + "grad_norm": 7.273100852966309, + "learning_rate": 8.585533196669876e-06, + "loss": 0.2429, + "step": 13643 + }, + { + "epoch": 1.71, + "grad_norm": 20.524356842041016, + "learning_rate": 8.584696481613188e-06, + "loss": 1.2124, + "step": 13644 + }, + { + "epoch": 1.71, + "grad_norm": 12.111412048339844, + "learning_rate": 8.5838597665565e-06, + "loss": 1.4774, + "step": 13645 + }, + { + "epoch": 1.71, + "grad_norm": 9.179340362548828, + "learning_rate": 8.583023051499812e-06, + "loss": 0.4397, + "step": 13646 + }, + { + "epoch": 1.71, + "grad_norm": 8.476637840270996, + "learning_rate": 8.582186336443124e-06, + "loss": 0.6604, + "step": 13647 + }, + { + "epoch": 1.71, + "grad_norm": 34.13114929199219, + "learning_rate": 8.581349621386437e-06, + "loss": 2.0151, + "step": 13648 + }, + { + "epoch": 1.71, + "grad_norm": 1.2700515985488892, + "learning_rate": 8.58051290632975e-06, + "loss": 0.0291, + "step": 13649 + }, + { + "epoch": 1.71, + "grad_norm": 8.670015335083008, + "learning_rate": 8.579676191273063e-06, + "loss": 0.7254, + "step": 13650 + }, + { + "epoch": 1.71, + "grad_norm": 58.138404846191406, + "learning_rate": 8.578839476216375e-06, + "loss": 1.9637, + "step": 13651 + }, + { + "epoch": 1.71, + "grad_norm": 21.40508460998535, + "learning_rate": 8.578002761159687e-06, + "loss": 1.3869, + "step": 13652 + }, + { + "epoch": 1.71, + "grad_norm": 11.227001190185547, + "learning_rate": 8.577166046103e-06, + "loss": 0.9253, + "step": 13653 + }, + { + "epoch": 1.71, + "grad_norm": 14.857300758361816, + "learning_rate": 8.576329331046313e-06, + "loss": 1.2623, + "step": 13654 + }, + { + "epoch": 1.71, + "grad_norm": 17.581310272216797, + "learning_rate": 8.575492615989626e-06, + "loss": 0.6921, + "step": 13655 + }, + { + "epoch": 1.71, + "grad_norm": 15.472161293029785, + "learning_rate": 8.574655900932938e-06, + "loss": 0.8102, + "step": 13656 + }, + { + "epoch": 1.71, + "grad_norm": 101.40353393554688, + "learning_rate": 8.573819185876252e-06, + "loss": 2.4513, + "step": 13657 + }, + { + "epoch": 1.71, + "grad_norm": 10.423720359802246, + "learning_rate": 8.572982470819564e-06, + "loss": 0.9824, + "step": 13658 + }, + { + "epoch": 1.71, + "grad_norm": 8.732719421386719, + "learning_rate": 8.572145755762876e-06, + "loss": 0.2346, + "step": 13659 + }, + { + "epoch": 1.71, + "grad_norm": 6.475010395050049, + "learning_rate": 8.571309040706188e-06, + "loss": 0.4437, + "step": 13660 + }, + { + "epoch": 1.71, + "grad_norm": 7.847319602966309, + "learning_rate": 8.5704723256495e-06, + "loss": 0.4661, + "step": 13661 + }, + { + "epoch": 1.71, + "grad_norm": 29.919858932495117, + "learning_rate": 8.569635610592813e-06, + "loss": 0.7812, + "step": 13662 + }, + { + "epoch": 1.71, + "grad_norm": 6.448214054107666, + "learning_rate": 8.568798895536125e-06, + "loss": 0.2187, + "step": 13663 + }, + { + "epoch": 1.71, + "grad_norm": 21.727697372436523, + "learning_rate": 8.567962180479439e-06, + "loss": 1.323, + "step": 13664 + }, + { + "epoch": 1.71, + "grad_norm": 10.879844665527344, + "learning_rate": 8.56712546542275e-06, + "loss": 0.3115, + "step": 13665 + }, + { + "epoch": 1.72, + "grad_norm": 17.375152587890625, + "learning_rate": 8.566288750366063e-06, + "loss": 0.339, + "step": 13666 + }, + { + "epoch": 1.72, + "grad_norm": 37.212764739990234, + "learning_rate": 8.565452035309376e-06, + "loss": 1.396, + "step": 13667 + }, + { + "epoch": 1.72, + "grad_norm": 73.83079528808594, + "learning_rate": 8.564615320252688e-06, + "loss": 1.2866, + "step": 13668 + }, + { + "epoch": 1.72, + "grad_norm": 7.035943031311035, + "learning_rate": 8.563778605196002e-06, + "loss": 1.3252, + "step": 13669 + }, + { + "epoch": 1.72, + "grad_norm": 11.69262409210205, + "learning_rate": 8.562941890139314e-06, + "loss": 1.0561, + "step": 13670 + }, + { + "epoch": 1.72, + "grad_norm": 18.235258102416992, + "learning_rate": 8.562105175082626e-06, + "loss": 0.4454, + "step": 13671 + }, + { + "epoch": 1.72, + "grad_norm": 10.037936210632324, + "learning_rate": 8.56126846002594e-06, + "loss": 0.4659, + "step": 13672 + }, + { + "epoch": 1.72, + "grad_norm": 22.74379539489746, + "learning_rate": 8.560431744969252e-06, + "loss": 2.4574, + "step": 13673 + }, + { + "epoch": 1.72, + "grad_norm": 13.192018508911133, + "learning_rate": 8.559595029912563e-06, + "loss": 1.5641, + "step": 13674 + }, + { + "epoch": 1.72, + "grad_norm": 13.780868530273438, + "learning_rate": 8.558758314855875e-06, + "loss": 1.4909, + "step": 13675 + }, + { + "epoch": 1.72, + "grad_norm": 71.01911926269531, + "learning_rate": 8.557921599799189e-06, + "loss": 1.5158, + "step": 13676 + }, + { + "epoch": 1.72, + "grad_norm": 8.662192344665527, + "learning_rate": 8.557084884742501e-06, + "loss": 0.4845, + "step": 13677 + }, + { + "epoch": 1.72, + "grad_norm": 29.933422088623047, + "learning_rate": 8.556248169685815e-06, + "loss": 1.5913, + "step": 13678 + }, + { + "epoch": 1.72, + "grad_norm": 10.449567794799805, + "learning_rate": 8.555411454629127e-06, + "loss": 0.3034, + "step": 13679 + }, + { + "epoch": 1.72, + "grad_norm": 13.019527435302734, + "learning_rate": 8.554574739572439e-06, + "loss": 0.5575, + "step": 13680 + }, + { + "epoch": 1.72, + "grad_norm": 12.01841926574707, + "learning_rate": 8.553738024515752e-06, + "loss": 1.6908, + "step": 13681 + }, + { + "epoch": 1.72, + "grad_norm": 10.98511028289795, + "learning_rate": 8.552901309459064e-06, + "loss": 0.4788, + "step": 13682 + }, + { + "epoch": 1.72, + "grad_norm": 10.490072250366211, + "learning_rate": 8.552064594402378e-06, + "loss": 0.3471, + "step": 13683 + }, + { + "epoch": 1.72, + "grad_norm": 11.30111312866211, + "learning_rate": 8.55122787934569e-06, + "loss": 0.3661, + "step": 13684 + }, + { + "epoch": 1.72, + "grad_norm": 14.25614070892334, + "learning_rate": 8.550391164289002e-06, + "loss": 0.8016, + "step": 13685 + }, + { + "epoch": 1.72, + "grad_norm": 7.77916955947876, + "learning_rate": 8.549554449232315e-06, + "loss": 1.9925, + "step": 13686 + }, + { + "epoch": 1.72, + "grad_norm": 30.912118911743164, + "learning_rate": 8.548717734175627e-06, + "loss": 1.7488, + "step": 13687 + }, + { + "epoch": 1.72, + "grad_norm": 15.012194633483887, + "learning_rate": 8.54788101911894e-06, + "loss": 0.7428, + "step": 13688 + }, + { + "epoch": 1.72, + "grad_norm": 20.931856155395508, + "learning_rate": 8.547044304062251e-06, + "loss": 1.9809, + "step": 13689 + }, + { + "epoch": 1.72, + "grad_norm": 20.445232391357422, + "learning_rate": 8.546207589005565e-06, + "loss": 0.9766, + "step": 13690 + }, + { + "epoch": 1.72, + "grad_norm": 7.759213447570801, + "learning_rate": 8.545370873948877e-06, + "loss": 1.1829, + "step": 13691 + }, + { + "epoch": 1.72, + "grad_norm": 13.30997371673584, + "learning_rate": 8.54453415889219e-06, + "loss": 0.5945, + "step": 13692 + }, + { + "epoch": 1.72, + "grad_norm": 14.735377311706543, + "learning_rate": 8.543697443835502e-06, + "loss": 0.8842, + "step": 13693 + }, + { + "epoch": 1.72, + "grad_norm": 18.377967834472656, + "learning_rate": 8.542860728778814e-06, + "loss": 1.3858, + "step": 13694 + }, + { + "epoch": 1.72, + "grad_norm": 7.478085041046143, + "learning_rate": 8.542024013722128e-06, + "loss": 1.8176, + "step": 13695 + }, + { + "epoch": 1.72, + "grad_norm": 10.661737442016602, + "learning_rate": 8.54118729866544e-06, + "loss": 1.0031, + "step": 13696 + }, + { + "epoch": 1.72, + "grad_norm": 31.49393081665039, + "learning_rate": 8.540350583608754e-06, + "loss": 2.1003, + "step": 13697 + }, + { + "epoch": 1.72, + "grad_norm": 7.8699774742126465, + "learning_rate": 8.539513868552066e-06, + "loss": 0.6333, + "step": 13698 + }, + { + "epoch": 1.72, + "grad_norm": 23.345783233642578, + "learning_rate": 8.538677153495378e-06, + "loss": 1.6522, + "step": 13699 + }, + { + "epoch": 1.72, + "grad_norm": 18.031431198120117, + "learning_rate": 8.53784043843869e-06, + "loss": 1.928, + "step": 13700 + }, + { + "epoch": 1.72, + "grad_norm": 11.421024322509766, + "learning_rate": 8.537003723382003e-06, + "loss": 1.0393, + "step": 13701 + }, + { + "epoch": 1.72, + "grad_norm": 12.769109725952148, + "learning_rate": 8.536167008325315e-06, + "loss": 2.8767, + "step": 13702 + }, + { + "epoch": 1.72, + "grad_norm": 19.535921096801758, + "learning_rate": 8.535330293268627e-06, + "loss": 1.5462, + "step": 13703 + }, + { + "epoch": 1.72, + "grad_norm": 9.937314987182617, + "learning_rate": 8.53449357821194e-06, + "loss": 0.1772, + "step": 13704 + }, + { + "epoch": 1.72, + "grad_norm": 14.477164268493652, + "learning_rate": 8.533656863155253e-06, + "loss": 1.0191, + "step": 13705 + }, + { + "epoch": 1.72, + "grad_norm": 6.0834479331970215, + "learning_rate": 8.532820148098566e-06, + "loss": 0.2097, + "step": 13706 + }, + { + "epoch": 1.72, + "grad_norm": 41.72673416137695, + "learning_rate": 8.531983433041878e-06, + "loss": 2.2051, + "step": 13707 + }, + { + "epoch": 1.72, + "grad_norm": 11.719079971313477, + "learning_rate": 8.53114671798519e-06, + "loss": 0.4026, + "step": 13708 + }, + { + "epoch": 1.72, + "grad_norm": 9.154730796813965, + "learning_rate": 8.530310002928504e-06, + "loss": 1.6962, + "step": 13709 + }, + { + "epoch": 1.72, + "grad_norm": 8.870038032531738, + "learning_rate": 8.529473287871816e-06, + "loss": 0.6991, + "step": 13710 + }, + { + "epoch": 1.72, + "grad_norm": 28.99807357788086, + "learning_rate": 8.52863657281513e-06, + "loss": 0.6092, + "step": 13711 + }, + { + "epoch": 1.72, + "grad_norm": 7.3584370613098145, + "learning_rate": 8.527799857758441e-06, + "loss": 0.3729, + "step": 13712 + }, + { + "epoch": 1.72, + "grad_norm": 3.6323156356811523, + "learning_rate": 8.526963142701753e-06, + "loss": 0.4234, + "step": 13713 + }, + { + "epoch": 1.72, + "grad_norm": 13.42520523071289, + "learning_rate": 8.526126427645065e-06, + "loss": 2.027, + "step": 13714 + }, + { + "epoch": 1.72, + "grad_norm": 6.929305553436279, + "learning_rate": 8.525289712588379e-06, + "loss": 1.1329, + "step": 13715 + }, + { + "epoch": 1.72, + "grad_norm": 27.86838150024414, + "learning_rate": 8.524452997531691e-06, + "loss": 1.128, + "step": 13716 + }, + { + "epoch": 1.72, + "grad_norm": 19.303163528442383, + "learning_rate": 8.523616282475003e-06, + "loss": 1.2456, + "step": 13717 + }, + { + "epoch": 1.72, + "grad_norm": 17.35099220275879, + "learning_rate": 8.522779567418317e-06, + "loss": 0.7379, + "step": 13718 + }, + { + "epoch": 1.72, + "grad_norm": 17.20098304748535, + "learning_rate": 8.521942852361629e-06, + "loss": 2.1604, + "step": 13719 + }, + { + "epoch": 1.72, + "grad_norm": 5.011656761169434, + "learning_rate": 8.521106137304942e-06, + "loss": 0.6209, + "step": 13720 + }, + { + "epoch": 1.72, + "grad_norm": 10.144037246704102, + "learning_rate": 8.520269422248254e-06, + "loss": 0.5365, + "step": 13721 + }, + { + "epoch": 1.72, + "grad_norm": 21.16668701171875, + "learning_rate": 8.519432707191566e-06, + "loss": 1.2462, + "step": 13722 + }, + { + "epoch": 1.72, + "grad_norm": 3.0023086071014404, + "learning_rate": 8.51859599213488e-06, + "loss": 1.3551, + "step": 13723 + }, + { + "epoch": 1.72, + "grad_norm": 8.263258934020996, + "learning_rate": 8.517759277078192e-06, + "loss": 1.2265, + "step": 13724 + }, + { + "epoch": 1.72, + "grad_norm": 23.832761764526367, + "learning_rate": 8.516922562021505e-06, + "loss": 0.8474, + "step": 13725 + }, + { + "epoch": 1.72, + "grad_norm": 14.02380084991455, + "learning_rate": 8.516085846964817e-06, + "loss": 0.7024, + "step": 13726 + }, + { + "epoch": 1.72, + "grad_norm": 10.211564064025879, + "learning_rate": 8.51524913190813e-06, + "loss": 1.5636, + "step": 13727 + }, + { + "epoch": 1.72, + "grad_norm": 4.276966094970703, + "learning_rate": 8.514412416851441e-06, + "loss": 0.2978, + "step": 13728 + }, + { + "epoch": 1.72, + "grad_norm": 155.47015380859375, + "learning_rate": 8.513575701794753e-06, + "loss": 2.1612, + "step": 13729 + }, + { + "epoch": 1.72, + "grad_norm": 22.023038864135742, + "learning_rate": 8.512738986738067e-06, + "loss": 1.2634, + "step": 13730 + }, + { + "epoch": 1.72, + "grad_norm": 18.61208724975586, + "learning_rate": 8.511902271681379e-06, + "loss": 0.8921, + "step": 13731 + }, + { + "epoch": 1.72, + "grad_norm": 8.686084747314453, + "learning_rate": 8.511065556624692e-06, + "loss": 1.5567, + "step": 13732 + }, + { + "epoch": 1.72, + "grad_norm": 16.485912322998047, + "learning_rate": 8.510228841568004e-06, + "loss": 0.8855, + "step": 13733 + }, + { + "epoch": 1.72, + "grad_norm": 17.467750549316406, + "learning_rate": 8.509392126511318e-06, + "loss": 1.5973, + "step": 13734 + }, + { + "epoch": 1.72, + "grad_norm": 25.251373291015625, + "learning_rate": 8.50855541145463e-06, + "loss": 0.9983, + "step": 13735 + }, + { + "epoch": 1.72, + "grad_norm": 48.691627502441406, + "learning_rate": 8.507718696397942e-06, + "loss": 2.9335, + "step": 13736 + }, + { + "epoch": 1.72, + "grad_norm": 35.23447036743164, + "learning_rate": 8.506881981341256e-06, + "loss": 1.7289, + "step": 13737 + }, + { + "epoch": 1.72, + "grad_norm": 8.047329902648926, + "learning_rate": 8.506045266284568e-06, + "loss": 0.5925, + "step": 13738 + }, + { + "epoch": 1.72, + "grad_norm": 23.861122131347656, + "learning_rate": 8.505208551227881e-06, + "loss": 1.1134, + "step": 13739 + }, + { + "epoch": 1.72, + "grad_norm": 20.802949905395508, + "learning_rate": 8.504371836171193e-06, + "loss": 0.4285, + "step": 13740 + }, + { + "epoch": 1.72, + "grad_norm": 16.312307357788086, + "learning_rate": 8.503535121114505e-06, + "loss": 1.0645, + "step": 13741 + }, + { + "epoch": 1.72, + "grad_norm": 10.014398574829102, + "learning_rate": 8.502698406057817e-06, + "loss": 1.7962, + "step": 13742 + }, + { + "epoch": 1.72, + "grad_norm": 11.285490989685059, + "learning_rate": 8.501861691001129e-06, + "loss": 0.7414, + "step": 13743 + }, + { + "epoch": 1.72, + "grad_norm": 9.441317558288574, + "learning_rate": 8.501024975944443e-06, + "loss": 0.3939, + "step": 13744 + }, + { + "epoch": 1.72, + "grad_norm": 19.6748104095459, + "learning_rate": 8.500188260887755e-06, + "loss": 1.512, + "step": 13745 + }, + { + "epoch": 1.73, + "grad_norm": 95.89828491210938, + "learning_rate": 8.499351545831068e-06, + "loss": 1.1321, + "step": 13746 + }, + { + "epoch": 1.73, + "grad_norm": 7.618378162384033, + "learning_rate": 8.49851483077438e-06, + "loss": 0.7681, + "step": 13747 + }, + { + "epoch": 1.73, + "grad_norm": 7.990718841552734, + "learning_rate": 8.497678115717694e-06, + "loss": 0.6788, + "step": 13748 + }, + { + "epoch": 1.73, + "grad_norm": 12.311456680297852, + "learning_rate": 8.496841400661006e-06, + "loss": 0.4918, + "step": 13749 + }, + { + "epoch": 1.73, + "grad_norm": 4.860684394836426, + "learning_rate": 8.496004685604318e-06, + "loss": 1.4039, + "step": 13750 + }, + { + "epoch": 1.73, + "grad_norm": 19.4648380279541, + "learning_rate": 8.495167970547631e-06, + "loss": 1.4667, + "step": 13751 + }, + { + "epoch": 1.73, + "grad_norm": 4.4415974617004395, + "learning_rate": 8.494331255490943e-06, + "loss": 0.1931, + "step": 13752 + }, + { + "epoch": 1.73, + "grad_norm": 16.85715675354004, + "learning_rate": 8.493494540434255e-06, + "loss": 0.9058, + "step": 13753 + }, + { + "epoch": 1.73, + "grad_norm": 8.172959327697754, + "learning_rate": 8.492657825377569e-06, + "loss": 1.8469, + "step": 13754 + }, + { + "epoch": 1.73, + "grad_norm": 27.272842407226562, + "learning_rate": 8.491821110320881e-06, + "loss": 1.3817, + "step": 13755 + }, + { + "epoch": 1.73, + "grad_norm": 10.073509216308594, + "learning_rate": 8.490984395264193e-06, + "loss": 1.3283, + "step": 13756 + }, + { + "epoch": 1.73, + "grad_norm": 4.937424182891846, + "learning_rate": 8.490147680207505e-06, + "loss": 0.0815, + "step": 13757 + }, + { + "epoch": 1.73, + "grad_norm": 46.79994201660156, + "learning_rate": 8.489310965150818e-06, + "loss": 1.4095, + "step": 13758 + }, + { + "epoch": 1.73, + "grad_norm": 8.656526565551758, + "learning_rate": 8.48847425009413e-06, + "loss": 1.0071, + "step": 13759 + }, + { + "epoch": 1.73, + "grad_norm": 27.997211456298828, + "learning_rate": 8.487637535037444e-06, + "loss": 0.6438, + "step": 13760 + }, + { + "epoch": 1.73, + "grad_norm": 41.18295669555664, + "learning_rate": 8.486800819980756e-06, + "loss": 0.7046, + "step": 13761 + }, + { + "epoch": 1.73, + "grad_norm": 7.984874248504639, + "learning_rate": 8.48596410492407e-06, + "loss": 0.4048, + "step": 13762 + }, + { + "epoch": 1.73, + "grad_norm": 27.08432960510254, + "learning_rate": 8.485127389867382e-06, + "loss": 1.5601, + "step": 13763 + }, + { + "epoch": 1.73, + "grad_norm": 110.85393524169922, + "learning_rate": 8.484290674810694e-06, + "loss": 2.1277, + "step": 13764 + }, + { + "epoch": 1.73, + "grad_norm": 45.55110168457031, + "learning_rate": 8.483453959754007e-06, + "loss": 1.2367, + "step": 13765 + }, + { + "epoch": 1.73, + "grad_norm": 18.700878143310547, + "learning_rate": 8.48261724469732e-06, + "loss": 0.9085, + "step": 13766 + }, + { + "epoch": 1.73, + "grad_norm": 12.493959426879883, + "learning_rate": 8.481780529640631e-06, + "loss": 0.6807, + "step": 13767 + }, + { + "epoch": 1.73, + "grad_norm": 24.03819465637207, + "learning_rate": 8.480943814583945e-06, + "loss": 2.3393, + "step": 13768 + }, + { + "epoch": 1.73, + "grad_norm": 14.463841438293457, + "learning_rate": 8.480107099527257e-06, + "loss": 1.6939, + "step": 13769 + }, + { + "epoch": 1.73, + "grad_norm": 4.762551307678223, + "learning_rate": 8.479270384470569e-06, + "loss": 0.3719, + "step": 13770 + }, + { + "epoch": 1.73, + "grad_norm": 7.250888347625732, + "learning_rate": 8.47843366941388e-06, + "loss": 0.743, + "step": 13771 + }, + { + "epoch": 1.73, + "grad_norm": 10.955687522888184, + "learning_rate": 8.477596954357194e-06, + "loss": 1.356, + "step": 13772 + }, + { + "epoch": 1.73, + "grad_norm": 55.618995666503906, + "learning_rate": 8.476760239300506e-06, + "loss": 1.9591, + "step": 13773 + }, + { + "epoch": 1.73, + "grad_norm": 5.087319374084473, + "learning_rate": 8.47592352424382e-06, + "loss": 0.0754, + "step": 13774 + }, + { + "epoch": 1.73, + "grad_norm": 15.81760025024414, + "learning_rate": 8.475086809187132e-06, + "loss": 0.5238, + "step": 13775 + }, + { + "epoch": 1.73, + "grad_norm": 35.106510162353516, + "learning_rate": 8.474250094130446e-06, + "loss": 2.2119, + "step": 13776 + }, + { + "epoch": 1.73, + "grad_norm": 19.20872688293457, + "learning_rate": 8.473413379073757e-06, + "loss": 1.3516, + "step": 13777 + }, + { + "epoch": 1.73, + "grad_norm": 24.926410675048828, + "learning_rate": 8.47257666401707e-06, + "loss": 1.1482, + "step": 13778 + }, + { + "epoch": 1.73, + "grad_norm": 16.512378692626953, + "learning_rate": 8.471739948960383e-06, + "loss": 2.4315, + "step": 13779 + }, + { + "epoch": 1.73, + "grad_norm": 26.667551040649414, + "learning_rate": 8.470903233903695e-06, + "loss": 1.3698, + "step": 13780 + }, + { + "epoch": 1.73, + "grad_norm": 2.896178722381592, + "learning_rate": 8.470066518847007e-06, + "loss": 0.0695, + "step": 13781 + }, + { + "epoch": 1.73, + "grad_norm": 16.381025314331055, + "learning_rate": 8.469229803790319e-06, + "loss": 1.255, + "step": 13782 + }, + { + "epoch": 1.73, + "grad_norm": 7.191415309906006, + "learning_rate": 8.468393088733633e-06, + "loss": 0.4244, + "step": 13783 + }, + { + "epoch": 1.73, + "grad_norm": 145.7000274658203, + "learning_rate": 8.467556373676945e-06, + "loss": 2.1742, + "step": 13784 + }, + { + "epoch": 1.73, + "grad_norm": 17.76723861694336, + "learning_rate": 8.466719658620257e-06, + "loss": 1.2058, + "step": 13785 + }, + { + "epoch": 1.73, + "grad_norm": 23.901243209838867, + "learning_rate": 8.46588294356357e-06, + "loss": 1.8144, + "step": 13786 + }, + { + "epoch": 1.73, + "grad_norm": 28.461530685424805, + "learning_rate": 8.465046228506882e-06, + "loss": 1.5681, + "step": 13787 + }, + { + "epoch": 1.73, + "grad_norm": 6.418713092803955, + "learning_rate": 8.464209513450196e-06, + "loss": 0.4838, + "step": 13788 + }, + { + "epoch": 1.73, + "grad_norm": 29.05074119567871, + "learning_rate": 8.463372798393508e-06, + "loss": 2.6298, + "step": 13789 + }, + { + "epoch": 1.73, + "grad_norm": 44.116294860839844, + "learning_rate": 8.462536083336821e-06, + "loss": 1.7151, + "step": 13790 + }, + { + "epoch": 1.73, + "grad_norm": 9.961648941040039, + "learning_rate": 8.461699368280133e-06, + "loss": 1.1424, + "step": 13791 + }, + { + "epoch": 1.73, + "grad_norm": 12.723161697387695, + "learning_rate": 8.460862653223445e-06, + "loss": 0.7386, + "step": 13792 + }, + { + "epoch": 1.73, + "grad_norm": 10.818270683288574, + "learning_rate": 8.460025938166759e-06, + "loss": 1.6343, + "step": 13793 + }, + { + "epoch": 1.73, + "grad_norm": 22.30669593811035, + "learning_rate": 8.459189223110071e-06, + "loss": 1.8495, + "step": 13794 + }, + { + "epoch": 1.73, + "grad_norm": 14.09034538269043, + "learning_rate": 8.458352508053383e-06, + "loss": 1.0546, + "step": 13795 + }, + { + "epoch": 1.73, + "grad_norm": 7.118631362915039, + "learning_rate": 8.457515792996695e-06, + "loss": 1.7067, + "step": 13796 + }, + { + "epoch": 1.73, + "grad_norm": 12.156725883483887, + "learning_rate": 8.456679077940008e-06, + "loss": 0.7631, + "step": 13797 + }, + { + "epoch": 1.73, + "grad_norm": 6.303142547607422, + "learning_rate": 8.45584236288332e-06, + "loss": 0.2077, + "step": 13798 + }, + { + "epoch": 1.73, + "grad_norm": 14.71904182434082, + "learning_rate": 8.455005647826632e-06, + "loss": 1.9614, + "step": 13799 + }, + { + "epoch": 1.73, + "grad_norm": 19.938352584838867, + "learning_rate": 8.454168932769946e-06, + "loss": 2.0698, + "step": 13800 + }, + { + "epoch": 1.73, + "grad_norm": 19.06098175048828, + "learning_rate": 8.453332217713258e-06, + "loss": 1.6484, + "step": 13801 + }, + { + "epoch": 1.73, + "grad_norm": 6.216872692108154, + "learning_rate": 8.452495502656572e-06, + "loss": 0.2628, + "step": 13802 + }, + { + "epoch": 1.73, + "grad_norm": 8.316340446472168, + "learning_rate": 8.451658787599884e-06, + "loss": 1.1693, + "step": 13803 + }, + { + "epoch": 1.73, + "grad_norm": 7.7603607177734375, + "learning_rate": 8.450822072543197e-06, + "loss": 0.8073, + "step": 13804 + }, + { + "epoch": 1.73, + "grad_norm": 9.606012344360352, + "learning_rate": 8.44998535748651e-06, + "loss": 1.0402, + "step": 13805 + }, + { + "epoch": 1.73, + "grad_norm": 13.379460334777832, + "learning_rate": 8.449148642429821e-06, + "loss": 1.7186, + "step": 13806 + }, + { + "epoch": 1.73, + "grad_norm": 22.462533950805664, + "learning_rate": 8.448311927373135e-06, + "loss": 1.9008, + "step": 13807 + }, + { + "epoch": 1.73, + "grad_norm": 12.43236255645752, + "learning_rate": 8.447475212316447e-06, + "loss": 0.4981, + "step": 13808 + }, + { + "epoch": 1.73, + "grad_norm": 6.2734761238098145, + "learning_rate": 8.446638497259759e-06, + "loss": 1.7295, + "step": 13809 + }, + { + "epoch": 1.73, + "grad_norm": 5.263993263244629, + "learning_rate": 8.44580178220307e-06, + "loss": 0.2166, + "step": 13810 + }, + { + "epoch": 1.73, + "grad_norm": 24.654264450073242, + "learning_rate": 8.444965067146384e-06, + "loss": 1.4845, + "step": 13811 + }, + { + "epoch": 1.73, + "grad_norm": 13.289724349975586, + "learning_rate": 8.444128352089696e-06, + "loss": 1.5027, + "step": 13812 + }, + { + "epoch": 1.73, + "grad_norm": 31.688879013061523, + "learning_rate": 8.443291637033008e-06, + "loss": 1.661, + "step": 13813 + }, + { + "epoch": 1.73, + "grad_norm": 9.262681007385254, + "learning_rate": 8.442454921976322e-06, + "loss": 0.403, + "step": 13814 + }, + { + "epoch": 1.73, + "grad_norm": 59.12746810913086, + "learning_rate": 8.441618206919634e-06, + "loss": 2.6707, + "step": 13815 + }, + { + "epoch": 1.73, + "grad_norm": 20.5118408203125, + "learning_rate": 8.440781491862947e-06, + "loss": 0.8513, + "step": 13816 + }, + { + "epoch": 1.73, + "grad_norm": 12.630598068237305, + "learning_rate": 8.43994477680626e-06, + "loss": 0.7384, + "step": 13817 + }, + { + "epoch": 1.73, + "grad_norm": 9.17889404296875, + "learning_rate": 8.439108061749573e-06, + "loss": 0.7034, + "step": 13818 + }, + { + "epoch": 1.73, + "grad_norm": 9.370962142944336, + "learning_rate": 8.438271346692885e-06, + "loss": 1.214, + "step": 13819 + }, + { + "epoch": 1.73, + "grad_norm": 4.087014198303223, + "learning_rate": 8.437434631636197e-06, + "loss": 0.078, + "step": 13820 + }, + { + "epoch": 1.73, + "grad_norm": 12.483458518981934, + "learning_rate": 8.43659791657951e-06, + "loss": 2.1561, + "step": 13821 + }, + { + "epoch": 1.73, + "grad_norm": 9.09365177154541, + "learning_rate": 8.435761201522823e-06, + "loss": 0.5027, + "step": 13822 + }, + { + "epoch": 1.73, + "grad_norm": 41.06399154663086, + "learning_rate": 8.434924486466135e-06, + "loss": 2.35, + "step": 13823 + }, + { + "epoch": 1.73, + "grad_norm": 25.1468563079834, + "learning_rate": 8.434087771409446e-06, + "loss": 1.0067, + "step": 13824 + }, + { + "epoch": 1.74, + "grad_norm": 27.10707664489746, + "learning_rate": 8.43325105635276e-06, + "loss": 1.8279, + "step": 13825 + }, + { + "epoch": 1.74, + "grad_norm": 5.643894672393799, + "learning_rate": 8.432414341296072e-06, + "loss": 1.0992, + "step": 13826 + }, + { + "epoch": 1.74, + "grad_norm": 41.79654312133789, + "learning_rate": 8.431577626239384e-06, + "loss": 1.5539, + "step": 13827 + }, + { + "epoch": 1.74, + "grad_norm": 40.047645568847656, + "learning_rate": 8.430740911182698e-06, + "loss": 1.2106, + "step": 13828 + }, + { + "epoch": 1.74, + "grad_norm": 13.573161125183105, + "learning_rate": 8.42990419612601e-06, + "loss": 0.9888, + "step": 13829 + }, + { + "epoch": 1.74, + "grad_norm": 58.52835464477539, + "learning_rate": 8.429067481069323e-06, + "loss": 1.9994, + "step": 13830 + }, + { + "epoch": 1.74, + "grad_norm": 5.157312870025635, + "learning_rate": 8.428230766012635e-06, + "loss": 0.1587, + "step": 13831 + }, + { + "epoch": 1.74, + "grad_norm": 38.501251220703125, + "learning_rate": 8.427394050955947e-06, + "loss": 1.1178, + "step": 13832 + }, + { + "epoch": 1.74, + "grad_norm": 26.000242233276367, + "learning_rate": 8.42655733589926e-06, + "loss": 1.9275, + "step": 13833 + }, + { + "epoch": 1.74, + "grad_norm": 3.8338160514831543, + "learning_rate": 8.425720620842573e-06, + "loss": 0.2231, + "step": 13834 + }, + { + "epoch": 1.74, + "grad_norm": 9.635165214538574, + "learning_rate": 8.424883905785885e-06, + "loss": 0.7556, + "step": 13835 + }, + { + "epoch": 1.74, + "grad_norm": 46.228145599365234, + "learning_rate": 8.424047190729198e-06, + "loss": 1.097, + "step": 13836 + }, + { + "epoch": 1.74, + "grad_norm": 45.72690963745117, + "learning_rate": 8.42321047567251e-06, + "loss": 2.284, + "step": 13837 + }, + { + "epoch": 1.74, + "grad_norm": 9.786887168884277, + "learning_rate": 8.422373760615822e-06, + "loss": 0.9054, + "step": 13838 + }, + { + "epoch": 1.74, + "grad_norm": 6.442713737487793, + "learning_rate": 8.421537045559134e-06, + "loss": 0.277, + "step": 13839 + }, + { + "epoch": 1.74, + "grad_norm": 18.807449340820312, + "learning_rate": 8.420700330502448e-06, + "loss": 1.4255, + "step": 13840 + }, + { + "epoch": 1.74, + "grad_norm": 12.640721321105957, + "learning_rate": 8.41986361544576e-06, + "loss": 0.7746, + "step": 13841 + }, + { + "epoch": 1.74, + "grad_norm": 11.13254165649414, + "learning_rate": 8.419026900389074e-06, + "loss": 0.8999, + "step": 13842 + }, + { + "epoch": 1.74, + "grad_norm": 20.162174224853516, + "learning_rate": 8.418190185332385e-06, + "loss": 2.4199, + "step": 13843 + }, + { + "epoch": 1.74, + "grad_norm": 16.63433074951172, + "learning_rate": 8.417353470275699e-06, + "loss": 1.207, + "step": 13844 + }, + { + "epoch": 1.74, + "grad_norm": 29.841510772705078, + "learning_rate": 8.416516755219011e-06, + "loss": 1.2163, + "step": 13845 + }, + { + "epoch": 1.74, + "grad_norm": 15.83189868927002, + "learning_rate": 8.415680040162323e-06, + "loss": 0.5019, + "step": 13846 + }, + { + "epoch": 1.74, + "grad_norm": 6.463802337646484, + "learning_rate": 8.414843325105637e-06, + "loss": 0.3681, + "step": 13847 + }, + { + "epoch": 1.74, + "grad_norm": 10.133244514465332, + "learning_rate": 8.414006610048949e-06, + "loss": 0.632, + "step": 13848 + }, + { + "epoch": 1.74, + "grad_norm": 10.25268268585205, + "learning_rate": 8.41316989499226e-06, + "loss": 0.8419, + "step": 13849 + }, + { + "epoch": 1.74, + "grad_norm": 10.213932037353516, + "learning_rate": 8.412333179935574e-06, + "loss": 0.5327, + "step": 13850 + }, + { + "epoch": 1.74, + "grad_norm": 16.98855972290039, + "learning_rate": 8.411496464878886e-06, + "loss": 0.4425, + "step": 13851 + }, + { + "epoch": 1.74, + "grad_norm": 12.402276039123535, + "learning_rate": 8.410659749822198e-06, + "loss": 0.9086, + "step": 13852 + }, + { + "epoch": 1.74, + "grad_norm": 2.999429225921631, + "learning_rate": 8.40982303476551e-06, + "loss": 0.1145, + "step": 13853 + }, + { + "epoch": 1.74, + "grad_norm": 7.634301662445068, + "learning_rate": 8.408986319708824e-06, + "loss": 0.1823, + "step": 13854 + }, + { + "epoch": 1.74, + "grad_norm": 31.303504943847656, + "learning_rate": 8.408149604652136e-06, + "loss": 1.3208, + "step": 13855 + }, + { + "epoch": 1.74, + "grad_norm": 2.7392351627349854, + "learning_rate": 8.40731288959545e-06, + "loss": 0.1621, + "step": 13856 + }, + { + "epoch": 1.74, + "grad_norm": 9.222172737121582, + "learning_rate": 8.406476174538761e-06, + "loss": 0.4276, + "step": 13857 + }, + { + "epoch": 1.74, + "grad_norm": 10.697710037231445, + "learning_rate": 8.405639459482075e-06, + "loss": 1.2623, + "step": 13858 + }, + { + "epoch": 1.74, + "grad_norm": 16.46355628967285, + "learning_rate": 8.404802744425387e-06, + "loss": 1.2663, + "step": 13859 + }, + { + "epoch": 1.74, + "grad_norm": 19.478023529052734, + "learning_rate": 8.403966029368699e-06, + "loss": 1.607, + "step": 13860 + }, + { + "epoch": 1.74, + "grad_norm": 13.396600723266602, + "learning_rate": 8.403129314312013e-06, + "loss": 0.8841, + "step": 13861 + }, + { + "epoch": 1.74, + "grad_norm": 39.11217498779297, + "learning_rate": 8.402292599255324e-06, + "loss": 2.5331, + "step": 13862 + }, + { + "epoch": 1.74, + "grad_norm": 20.548969268798828, + "learning_rate": 8.401455884198636e-06, + "loss": 0.7516, + "step": 13863 + }, + { + "epoch": 1.74, + "grad_norm": 14.2288179397583, + "learning_rate": 8.400619169141948e-06, + "loss": 2.0292, + "step": 13864 + }, + { + "epoch": 1.74, + "grad_norm": 21.902082443237305, + "learning_rate": 8.399782454085262e-06, + "loss": 1.5894, + "step": 13865 + }, + { + "epoch": 1.74, + "grad_norm": 7.677196979522705, + "learning_rate": 8.398945739028574e-06, + "loss": 0.302, + "step": 13866 + }, + { + "epoch": 1.74, + "grad_norm": 18.952892303466797, + "learning_rate": 8.398109023971886e-06, + "loss": 1.3935, + "step": 13867 + }, + { + "epoch": 1.74, + "grad_norm": 9.475656509399414, + "learning_rate": 8.3972723089152e-06, + "loss": 0.7635, + "step": 13868 + }, + { + "epoch": 1.74, + "grad_norm": 11.825992584228516, + "learning_rate": 8.396435593858512e-06, + "loss": 0.7922, + "step": 13869 + }, + { + "epoch": 1.74, + "grad_norm": 7.463834762573242, + "learning_rate": 8.395598878801825e-06, + "loss": 0.9997, + "step": 13870 + }, + { + "epoch": 1.74, + "grad_norm": 32.768089294433594, + "learning_rate": 8.394762163745137e-06, + "loss": 1.071, + "step": 13871 + }, + { + "epoch": 1.74, + "grad_norm": 19.2761173248291, + "learning_rate": 8.39392544868845e-06, + "loss": 1.2451, + "step": 13872 + }, + { + "epoch": 1.74, + "grad_norm": 17.122081756591797, + "learning_rate": 8.393088733631763e-06, + "loss": 0.5563, + "step": 13873 + }, + { + "epoch": 1.74, + "grad_norm": 15.508161544799805, + "learning_rate": 8.392252018575075e-06, + "loss": 1.5265, + "step": 13874 + }, + { + "epoch": 1.74, + "grad_norm": 9.615347862243652, + "learning_rate": 8.391415303518388e-06, + "loss": 0.7297, + "step": 13875 + }, + { + "epoch": 1.74, + "grad_norm": 36.48480224609375, + "learning_rate": 8.3905785884617e-06, + "loss": 1.2946, + "step": 13876 + }, + { + "epoch": 1.74, + "grad_norm": 16.75336456298828, + "learning_rate": 8.389741873405012e-06, + "loss": 0.4269, + "step": 13877 + }, + { + "epoch": 1.74, + "grad_norm": 10.70810604095459, + "learning_rate": 8.388905158348324e-06, + "loss": 0.6779, + "step": 13878 + }, + { + "epoch": 1.74, + "grad_norm": 52.989200592041016, + "learning_rate": 8.388068443291638e-06, + "loss": 0.5792, + "step": 13879 + }, + { + "epoch": 1.74, + "grad_norm": 17.84124755859375, + "learning_rate": 8.38723172823495e-06, + "loss": 2.1604, + "step": 13880 + }, + { + "epoch": 1.74, + "grad_norm": 15.700440406799316, + "learning_rate": 8.386395013178262e-06, + "loss": 0.8854, + "step": 13881 + }, + { + "epoch": 1.74, + "grad_norm": 5.550549030303955, + "learning_rate": 8.385558298121575e-06, + "loss": 1.4364, + "step": 13882 + }, + { + "epoch": 1.74, + "grad_norm": 5.984732151031494, + "learning_rate": 8.384721583064887e-06, + "loss": 0.7948, + "step": 13883 + }, + { + "epoch": 1.74, + "grad_norm": 13.10971450805664, + "learning_rate": 8.383884868008201e-06, + "loss": 0.768, + "step": 13884 + }, + { + "epoch": 1.74, + "grad_norm": 9.079219818115234, + "learning_rate": 8.383048152951513e-06, + "loss": 0.5529, + "step": 13885 + }, + { + "epoch": 1.74, + "grad_norm": 11.177825927734375, + "learning_rate": 8.382211437894827e-06, + "loss": 0.5724, + "step": 13886 + }, + { + "epoch": 1.74, + "grad_norm": 11.534337043762207, + "learning_rate": 8.381374722838139e-06, + "loss": 0.91, + "step": 13887 + }, + { + "epoch": 1.74, + "grad_norm": 11.55118179321289, + "learning_rate": 8.38053800778145e-06, + "loss": 0.3911, + "step": 13888 + }, + { + "epoch": 1.74, + "grad_norm": 5.919164180755615, + "learning_rate": 8.379701292724764e-06, + "loss": 0.2172, + "step": 13889 + }, + { + "epoch": 1.74, + "grad_norm": 40.55359649658203, + "learning_rate": 8.378864577668076e-06, + "loss": 1.3834, + "step": 13890 + }, + { + "epoch": 1.74, + "grad_norm": 27.290895462036133, + "learning_rate": 8.378027862611388e-06, + "loss": 1.6225, + "step": 13891 + }, + { + "epoch": 1.74, + "grad_norm": 42.276206970214844, + "learning_rate": 8.3771911475547e-06, + "loss": 0.2779, + "step": 13892 + }, + { + "epoch": 1.74, + "grad_norm": 12.790298461914062, + "learning_rate": 8.376354432498014e-06, + "loss": 1.0371, + "step": 13893 + }, + { + "epoch": 1.74, + "grad_norm": 10.467720031738281, + "learning_rate": 8.375517717441326e-06, + "loss": 0.5272, + "step": 13894 + }, + { + "epoch": 1.74, + "grad_norm": 17.65713882446289, + "learning_rate": 8.374681002384638e-06, + "loss": 0.5529, + "step": 13895 + }, + { + "epoch": 1.74, + "grad_norm": 70.17169952392578, + "learning_rate": 8.373844287327951e-06, + "loss": 1.4261, + "step": 13896 + }, + { + "epoch": 1.74, + "grad_norm": 11.62064266204834, + "learning_rate": 8.373007572271263e-06, + "loss": 1.475, + "step": 13897 + }, + { + "epoch": 1.74, + "grad_norm": 9.527636528015137, + "learning_rate": 8.372170857214577e-06, + "loss": 1.585, + "step": 13898 + }, + { + "epoch": 1.74, + "grad_norm": 20.809040069580078, + "learning_rate": 8.371334142157889e-06, + "loss": 0.7872, + "step": 13899 + }, + { + "epoch": 1.74, + "grad_norm": 33.864967346191406, + "learning_rate": 8.370497427101202e-06, + "loss": 2.1348, + "step": 13900 + }, + { + "epoch": 1.74, + "grad_norm": 36.15922927856445, + "learning_rate": 8.369660712044514e-06, + "loss": 1.2589, + "step": 13901 + }, + { + "epoch": 1.74, + "grad_norm": 15.41215705871582, + "learning_rate": 8.368823996987826e-06, + "loss": 1.1921, + "step": 13902 + }, + { + "epoch": 1.74, + "grad_norm": 21.640539169311523, + "learning_rate": 8.36798728193114e-06, + "loss": 1.2825, + "step": 13903 + }, + { + "epoch": 1.74, + "grad_norm": 31.704418182373047, + "learning_rate": 8.367150566874452e-06, + "loss": 1.0447, + "step": 13904 + }, + { + "epoch": 1.75, + "grad_norm": 7.182675361633301, + "learning_rate": 8.366313851817764e-06, + "loss": 0.4019, + "step": 13905 + }, + { + "epoch": 1.75, + "grad_norm": 32.93462371826172, + "learning_rate": 8.365477136761076e-06, + "loss": 2.2954, + "step": 13906 + }, + { + "epoch": 1.75, + "grad_norm": 11.313870429992676, + "learning_rate": 8.36464042170439e-06, + "loss": 1.4137, + "step": 13907 + }, + { + "epoch": 1.75, + "grad_norm": 5.952693939208984, + "learning_rate": 8.363803706647701e-06, + "loss": 0.7722, + "step": 13908 + }, + { + "epoch": 1.75, + "grad_norm": 10.723963737487793, + "learning_rate": 8.362966991591013e-06, + "loss": 2.7077, + "step": 13909 + }, + { + "epoch": 1.75, + "grad_norm": 12.148795127868652, + "learning_rate": 8.362130276534327e-06, + "loss": 2.0539, + "step": 13910 + }, + { + "epoch": 1.75, + "grad_norm": 9.466438293457031, + "learning_rate": 8.361293561477639e-06, + "loss": 0.5725, + "step": 13911 + }, + { + "epoch": 1.75, + "grad_norm": 11.479924201965332, + "learning_rate": 8.360456846420953e-06, + "loss": 0.8503, + "step": 13912 + }, + { + "epoch": 1.75, + "grad_norm": 12.030116081237793, + "learning_rate": 8.359620131364265e-06, + "loss": 1.1705, + "step": 13913 + }, + { + "epoch": 1.75, + "grad_norm": 7.061598300933838, + "learning_rate": 8.358783416307578e-06, + "loss": 0.3001, + "step": 13914 + }, + { + "epoch": 1.75, + "grad_norm": 14.592386245727539, + "learning_rate": 8.35794670125089e-06, + "loss": 0.9844, + "step": 13915 + }, + { + "epoch": 1.75, + "grad_norm": 6.461309909820557, + "learning_rate": 8.357109986194202e-06, + "loss": 0.3957, + "step": 13916 + }, + { + "epoch": 1.75, + "grad_norm": 6.254886627197266, + "learning_rate": 8.356273271137514e-06, + "loss": 0.1697, + "step": 13917 + }, + { + "epoch": 1.75, + "grad_norm": 14.68002986907959, + "learning_rate": 8.355436556080828e-06, + "loss": 1.0483, + "step": 13918 + }, + { + "epoch": 1.75, + "grad_norm": 15.130146980285645, + "learning_rate": 8.35459984102414e-06, + "loss": 1.4724, + "step": 13919 + }, + { + "epoch": 1.75, + "grad_norm": 20.888988494873047, + "learning_rate": 8.353763125967452e-06, + "loss": 1.3916, + "step": 13920 + }, + { + "epoch": 1.75, + "grad_norm": 10.216975212097168, + "learning_rate": 8.352926410910765e-06, + "loss": 0.3461, + "step": 13921 + }, + { + "epoch": 1.75, + "grad_norm": 22.507368087768555, + "learning_rate": 8.352089695854077e-06, + "loss": 0.6662, + "step": 13922 + }, + { + "epoch": 1.75, + "grad_norm": 33.56687545776367, + "learning_rate": 8.35125298079739e-06, + "loss": 0.6495, + "step": 13923 + }, + { + "epoch": 1.75, + "grad_norm": 36.28936767578125, + "learning_rate": 8.350416265740703e-06, + "loss": 1.6898, + "step": 13924 + }, + { + "epoch": 1.75, + "grad_norm": 11.19839096069336, + "learning_rate": 8.349579550684015e-06, + "loss": 1.4511, + "step": 13925 + }, + { + "epoch": 1.75, + "grad_norm": 10.867090225219727, + "learning_rate": 8.348742835627329e-06, + "loss": 1.0426, + "step": 13926 + }, + { + "epoch": 1.75, + "grad_norm": 8.462395668029785, + "learning_rate": 8.34790612057064e-06, + "loss": 0.7899, + "step": 13927 + }, + { + "epoch": 1.75, + "grad_norm": 5.737425804138184, + "learning_rate": 8.347069405513954e-06, + "loss": 1.1286, + "step": 13928 + }, + { + "epoch": 1.75, + "grad_norm": 15.307694435119629, + "learning_rate": 8.346232690457266e-06, + "loss": 0.1565, + "step": 13929 + }, + { + "epoch": 1.75, + "grad_norm": 6.2129364013671875, + "learning_rate": 8.345395975400578e-06, + "loss": 0.1186, + "step": 13930 + }, + { + "epoch": 1.75, + "grad_norm": 26.382848739624023, + "learning_rate": 8.34455926034389e-06, + "loss": 1.5627, + "step": 13931 + }, + { + "epoch": 1.75, + "grad_norm": 7.875923156738281, + "learning_rate": 8.343722545287204e-06, + "loss": 1.8287, + "step": 13932 + }, + { + "epoch": 1.75, + "grad_norm": 20.020475387573242, + "learning_rate": 8.342885830230516e-06, + "loss": 1.3609, + "step": 13933 + }, + { + "epoch": 1.75, + "grad_norm": 5.87630033493042, + "learning_rate": 8.342049115173828e-06, + "loss": 0.3188, + "step": 13934 + }, + { + "epoch": 1.75, + "grad_norm": 32.75434494018555, + "learning_rate": 8.341212400117141e-06, + "loss": 2.3851, + "step": 13935 + }, + { + "epoch": 1.75, + "grad_norm": 7.332067489624023, + "learning_rate": 8.340375685060453e-06, + "loss": 0.6301, + "step": 13936 + }, + { + "epoch": 1.75, + "grad_norm": 13.54127025604248, + "learning_rate": 8.339538970003765e-06, + "loss": 1.4954, + "step": 13937 + }, + { + "epoch": 1.75, + "grad_norm": 73.1351318359375, + "learning_rate": 8.338702254947079e-06, + "loss": 2.0352, + "step": 13938 + }, + { + "epoch": 1.75, + "grad_norm": 13.21261215209961, + "learning_rate": 8.33786553989039e-06, + "loss": 1.9045, + "step": 13939 + }, + { + "epoch": 1.75, + "grad_norm": 57.44038772583008, + "learning_rate": 8.337028824833704e-06, + "loss": 0.7105, + "step": 13940 + }, + { + "epoch": 1.75, + "grad_norm": 9.058891296386719, + "learning_rate": 8.336192109777016e-06, + "loss": 0.5623, + "step": 13941 + }, + { + "epoch": 1.75, + "grad_norm": 31.473182678222656, + "learning_rate": 8.33535539472033e-06, + "loss": 1.6002, + "step": 13942 + }, + { + "epoch": 1.75, + "grad_norm": 13.409120559692383, + "learning_rate": 8.334518679663642e-06, + "loss": 1.8, + "step": 13943 + }, + { + "epoch": 1.75, + "grad_norm": 7.674248218536377, + "learning_rate": 8.333681964606954e-06, + "loss": 0.7707, + "step": 13944 + }, + { + "epoch": 1.75, + "grad_norm": 20.194053649902344, + "learning_rate": 8.332845249550266e-06, + "loss": 0.9419, + "step": 13945 + }, + { + "epoch": 1.75, + "grad_norm": 23.43710708618164, + "learning_rate": 8.332008534493578e-06, + "loss": 0.7634, + "step": 13946 + }, + { + "epoch": 1.75, + "grad_norm": 28.353042602539062, + "learning_rate": 8.331171819436891e-06, + "loss": 1.4436, + "step": 13947 + }, + { + "epoch": 1.75, + "grad_norm": 12.709844589233398, + "learning_rate": 8.330335104380203e-06, + "loss": 0.6189, + "step": 13948 + }, + { + "epoch": 1.75, + "grad_norm": 7.055968761444092, + "learning_rate": 8.329498389323517e-06, + "loss": 0.2982, + "step": 13949 + }, + { + "epoch": 1.75, + "grad_norm": 6.116999626159668, + "learning_rate": 8.328661674266829e-06, + "loss": 0.6182, + "step": 13950 + }, + { + "epoch": 1.75, + "grad_norm": 17.69551658630371, + "learning_rate": 8.327824959210141e-06, + "loss": 1.749, + "step": 13951 + }, + { + "epoch": 1.75, + "grad_norm": 23.674076080322266, + "learning_rate": 8.326988244153455e-06, + "loss": 3.066, + "step": 13952 + }, + { + "epoch": 1.75, + "grad_norm": 22.856245040893555, + "learning_rate": 8.326151529096767e-06, + "loss": 0.8221, + "step": 13953 + }, + { + "epoch": 1.75, + "grad_norm": 22.558551788330078, + "learning_rate": 8.32531481404008e-06, + "loss": 0.754, + "step": 13954 + }, + { + "epoch": 1.75, + "grad_norm": 8.679530143737793, + "learning_rate": 8.324478098983392e-06, + "loss": 0.5943, + "step": 13955 + }, + { + "epoch": 1.75, + "grad_norm": 12.526307106018066, + "learning_rate": 8.323641383926704e-06, + "loss": 1.392, + "step": 13956 + }, + { + "epoch": 1.75, + "grad_norm": 18.145200729370117, + "learning_rate": 8.322804668870018e-06, + "loss": 1.5801, + "step": 13957 + }, + { + "epoch": 1.75, + "grad_norm": 9.041570663452148, + "learning_rate": 8.32196795381333e-06, + "loss": 0.6676, + "step": 13958 + }, + { + "epoch": 1.75, + "grad_norm": 34.781551361083984, + "learning_rate": 8.321131238756642e-06, + "loss": 0.4334, + "step": 13959 + }, + { + "epoch": 1.75, + "grad_norm": 58.627052307128906, + "learning_rate": 8.320294523699954e-06, + "loss": 0.6873, + "step": 13960 + }, + { + "epoch": 1.75, + "grad_norm": 7.583497047424316, + "learning_rate": 8.319457808643267e-06, + "loss": 0.4713, + "step": 13961 + }, + { + "epoch": 1.75, + "grad_norm": 15.208681106567383, + "learning_rate": 8.31862109358658e-06, + "loss": 1.3923, + "step": 13962 + }, + { + "epoch": 1.75, + "grad_norm": 10.86228084564209, + "learning_rate": 8.317784378529893e-06, + "loss": 1.069, + "step": 13963 + }, + { + "epoch": 1.75, + "grad_norm": 10.096314430236816, + "learning_rate": 8.316947663473205e-06, + "loss": 0.3963, + "step": 13964 + }, + { + "epoch": 1.75, + "grad_norm": 19.38264274597168, + "learning_rate": 8.316110948416517e-06, + "loss": 0.9396, + "step": 13965 + }, + { + "epoch": 1.75, + "grad_norm": 11.397652626037598, + "learning_rate": 8.31527423335983e-06, + "loss": 0.5981, + "step": 13966 + }, + { + "epoch": 1.75, + "grad_norm": 3.4790005683898926, + "learning_rate": 8.314437518303142e-06, + "loss": 0.1858, + "step": 13967 + }, + { + "epoch": 1.75, + "grad_norm": 12.452469825744629, + "learning_rate": 8.313600803246456e-06, + "loss": 0.807, + "step": 13968 + }, + { + "epoch": 1.75, + "grad_norm": 12.185338973999023, + "learning_rate": 8.312764088189768e-06, + "loss": 0.303, + "step": 13969 + }, + { + "epoch": 1.75, + "grad_norm": 5.862409591674805, + "learning_rate": 8.31192737313308e-06, + "loss": 0.4866, + "step": 13970 + }, + { + "epoch": 1.75, + "grad_norm": 25.07809829711914, + "learning_rate": 8.311090658076394e-06, + "loss": 1.3827, + "step": 13971 + }, + { + "epoch": 1.75, + "grad_norm": 17.37289810180664, + "learning_rate": 8.310253943019706e-06, + "loss": 0.9669, + "step": 13972 + }, + { + "epoch": 1.75, + "grad_norm": 7.514956951141357, + "learning_rate": 8.309417227963018e-06, + "loss": 0.3129, + "step": 13973 + }, + { + "epoch": 1.75, + "grad_norm": 6.670135498046875, + "learning_rate": 8.30858051290633e-06, + "loss": 0.3078, + "step": 13974 + }, + { + "epoch": 1.75, + "grad_norm": 6.870745658874512, + "learning_rate": 8.307743797849643e-06, + "loss": 0.7646, + "step": 13975 + }, + { + "epoch": 1.75, + "grad_norm": 14.9131498336792, + "learning_rate": 8.306907082792955e-06, + "loss": 0.6213, + "step": 13976 + }, + { + "epoch": 1.75, + "grad_norm": 85.65762329101562, + "learning_rate": 8.306070367736269e-06, + "loss": 3.9704, + "step": 13977 + }, + { + "epoch": 1.75, + "grad_norm": 12.706620216369629, + "learning_rate": 8.30523365267958e-06, + "loss": 1.3252, + "step": 13978 + }, + { + "epoch": 1.75, + "grad_norm": 10.356450080871582, + "learning_rate": 8.304396937622893e-06, + "loss": 1.0579, + "step": 13979 + }, + { + "epoch": 1.75, + "grad_norm": 23.153383255004883, + "learning_rate": 8.303560222566206e-06, + "loss": 2.8506, + "step": 13980 + }, + { + "epoch": 1.75, + "grad_norm": 22.212078094482422, + "learning_rate": 8.302723507509518e-06, + "loss": 0.7168, + "step": 13981 + }, + { + "epoch": 1.75, + "grad_norm": 7.272219181060791, + "learning_rate": 8.301886792452832e-06, + "loss": 0.3777, + "step": 13982 + }, + { + "epoch": 1.75, + "grad_norm": 15.351566314697266, + "learning_rate": 8.301050077396144e-06, + "loss": 2.7114, + "step": 13983 + }, + { + "epoch": 1.75, + "grad_norm": 12.014077186584473, + "learning_rate": 8.300213362339456e-06, + "loss": 2.2495, + "step": 13984 + }, + { + "epoch": 1.76, + "grad_norm": 11.368762969970703, + "learning_rate": 8.299376647282768e-06, + "loss": 0.302, + "step": 13985 + }, + { + "epoch": 1.76, + "grad_norm": 120.40667724609375, + "learning_rate": 8.298539932226081e-06, + "loss": 1.5484, + "step": 13986 + }, + { + "epoch": 1.76, + "grad_norm": 15.013331413269043, + "learning_rate": 8.297703217169393e-06, + "loss": 1.0579, + "step": 13987 + }, + { + "epoch": 1.76, + "grad_norm": 10.324745178222656, + "learning_rate": 8.296866502112705e-06, + "loss": 0.6416, + "step": 13988 + }, + { + "epoch": 1.76, + "grad_norm": 6.499182224273682, + "learning_rate": 8.296029787056019e-06, + "loss": 1.6797, + "step": 13989 + }, + { + "epoch": 1.76, + "grad_norm": 11.683255195617676, + "learning_rate": 8.295193071999331e-06, + "loss": 0.8339, + "step": 13990 + }, + { + "epoch": 1.76, + "grad_norm": 3.60404372215271, + "learning_rate": 8.294356356942645e-06, + "loss": 0.2472, + "step": 13991 + }, + { + "epoch": 1.76, + "grad_norm": 17.4386043548584, + "learning_rate": 8.293519641885957e-06, + "loss": 1.2046, + "step": 13992 + }, + { + "epoch": 1.76, + "grad_norm": 11.467900276184082, + "learning_rate": 8.292682926829268e-06, + "loss": 0.443, + "step": 13993 + }, + { + "epoch": 1.76, + "grad_norm": 21.231168746948242, + "learning_rate": 8.291846211772582e-06, + "loss": 0.9453, + "step": 13994 + }, + { + "epoch": 1.76, + "grad_norm": 30.515880584716797, + "learning_rate": 8.291009496715894e-06, + "loss": 1.3454, + "step": 13995 + }, + { + "epoch": 1.76, + "grad_norm": 30.542156219482422, + "learning_rate": 8.290172781659208e-06, + "loss": 2.3798, + "step": 13996 + }, + { + "epoch": 1.76, + "grad_norm": 43.25813674926758, + "learning_rate": 8.28933606660252e-06, + "loss": 1.7446, + "step": 13997 + }, + { + "epoch": 1.76, + "grad_norm": 93.33450317382812, + "learning_rate": 8.288499351545832e-06, + "loss": 0.9311, + "step": 13998 + }, + { + "epoch": 1.76, + "grad_norm": 23.29090118408203, + "learning_rate": 8.287662636489144e-06, + "loss": 2.2541, + "step": 13999 + }, + { + "epoch": 1.76, + "grad_norm": 13.520791053771973, + "learning_rate": 8.286825921432457e-06, + "loss": 1.5217, + "step": 14000 + }, + { + "epoch": 1.76, + "eval_loss": 0.07692699134349823, + "eval_runtime": 97.7806, + "eval_samples_per_second": 36.224, + "eval_steps_per_second": 36.224, + "step": 14000 + }, + { + "epoch": 1.76, + "grad_norm": 12.751508712768555, + "learning_rate": 8.28598920637577e-06, + "loss": 0.2391, + "step": 14001 + }, + { + "epoch": 1.76, + "grad_norm": 20.14134979248047, + "learning_rate": 8.285152491319081e-06, + "loss": 3.0113, + "step": 14002 + }, + { + "epoch": 1.76, + "grad_norm": 63.07255172729492, + "learning_rate": 8.284315776262395e-06, + "loss": 3.1447, + "step": 14003 + }, + { + "epoch": 1.76, + "grad_norm": 77.31236267089844, + "learning_rate": 8.283479061205707e-06, + "loss": 1.9684, + "step": 14004 + }, + { + "epoch": 1.76, + "grad_norm": 9.814595222473145, + "learning_rate": 8.282642346149019e-06, + "loss": 1.8572, + "step": 14005 + }, + { + "epoch": 1.76, + "grad_norm": 11.436347007751465, + "learning_rate": 8.281805631092332e-06, + "loss": 0.6581, + "step": 14006 + }, + { + "epoch": 1.76, + "grad_norm": 57.64645767211914, + "learning_rate": 8.280968916035644e-06, + "loss": 2.1964, + "step": 14007 + }, + { + "epoch": 1.76, + "grad_norm": 11.93832778930664, + "learning_rate": 8.280132200978958e-06, + "loss": 2.741, + "step": 14008 + }, + { + "epoch": 1.76, + "grad_norm": 29.666269302368164, + "learning_rate": 8.27929548592227e-06, + "loss": 1.2742, + "step": 14009 + }, + { + "epoch": 1.76, + "grad_norm": 2.4997217655181885, + "learning_rate": 8.278458770865584e-06, + "loss": 0.0794, + "step": 14010 + }, + { + "epoch": 1.76, + "grad_norm": 14.185319900512695, + "learning_rate": 8.277622055808896e-06, + "loss": 0.4328, + "step": 14011 + }, + { + "epoch": 1.76, + "grad_norm": 9.742040634155273, + "learning_rate": 8.276785340752207e-06, + "loss": 0.6479, + "step": 14012 + }, + { + "epoch": 1.76, + "grad_norm": 14.843554496765137, + "learning_rate": 8.27594862569552e-06, + "loss": 1.0409, + "step": 14013 + }, + { + "epoch": 1.76, + "grad_norm": 10.190027236938477, + "learning_rate": 8.275111910638833e-06, + "loss": 0.1852, + "step": 14014 + }, + { + "epoch": 1.76, + "grad_norm": 154.2579803466797, + "learning_rate": 8.274275195582145e-06, + "loss": 1.6351, + "step": 14015 + }, + { + "epoch": 1.76, + "grad_norm": 11.802145957946777, + "learning_rate": 8.273438480525457e-06, + "loss": 0.9051, + "step": 14016 + }, + { + "epoch": 1.76, + "grad_norm": 7.853428363800049, + "learning_rate": 8.27260176546877e-06, + "loss": 0.9526, + "step": 14017 + }, + { + "epoch": 1.76, + "grad_norm": 7.507288932800293, + "learning_rate": 8.271765050412083e-06, + "loss": 1.5627, + "step": 14018 + }, + { + "epoch": 1.76, + "grad_norm": 9.447890281677246, + "learning_rate": 8.270928335355395e-06, + "loss": 1.6398, + "step": 14019 + }, + { + "epoch": 1.76, + "grad_norm": 13.446758270263672, + "learning_rate": 8.270091620298708e-06, + "loss": 0.6096, + "step": 14020 + }, + { + "epoch": 1.76, + "grad_norm": 12.474729537963867, + "learning_rate": 8.26925490524202e-06, + "loss": 1.2577, + "step": 14021 + }, + { + "epoch": 1.76, + "grad_norm": 131.98252868652344, + "learning_rate": 8.268418190185334e-06, + "loss": 3.8721, + "step": 14022 + }, + { + "epoch": 1.76, + "grad_norm": 19.97817611694336, + "learning_rate": 8.267581475128646e-06, + "loss": 2.113, + "step": 14023 + }, + { + "epoch": 1.76, + "grad_norm": 8.92810344696045, + "learning_rate": 8.26674476007196e-06, + "loss": 0.4914, + "step": 14024 + }, + { + "epoch": 1.76, + "grad_norm": 11.805947303771973, + "learning_rate": 8.265908045015271e-06, + "loss": 1.5212, + "step": 14025 + }, + { + "epoch": 1.76, + "grad_norm": 13.755889892578125, + "learning_rate": 8.265071329958583e-06, + "loss": 0.5217, + "step": 14026 + }, + { + "epoch": 1.76, + "grad_norm": 21.62193489074707, + "learning_rate": 8.264234614901895e-06, + "loss": 0.7373, + "step": 14027 + }, + { + "epoch": 1.76, + "grad_norm": 63.61070251464844, + "learning_rate": 8.263397899845207e-06, + "loss": 1.6375, + "step": 14028 + }, + { + "epoch": 1.76, + "grad_norm": 2.767998218536377, + "learning_rate": 8.262561184788521e-06, + "loss": 0.022, + "step": 14029 + }, + { + "epoch": 1.76, + "grad_norm": 9.809958457946777, + "learning_rate": 8.261724469731833e-06, + "loss": 1.0422, + "step": 14030 + }, + { + "epoch": 1.76, + "grad_norm": 13.731112480163574, + "learning_rate": 8.260887754675146e-06, + "loss": 1.6344, + "step": 14031 + }, + { + "epoch": 1.76, + "grad_norm": 16.74528694152832, + "learning_rate": 8.260051039618458e-06, + "loss": 1.2197, + "step": 14032 + }, + { + "epoch": 1.76, + "grad_norm": 11.615007400512695, + "learning_rate": 8.25921432456177e-06, + "loss": 0.6503, + "step": 14033 + }, + { + "epoch": 1.76, + "grad_norm": 7.938024520874023, + "learning_rate": 8.258377609505084e-06, + "loss": 0.6201, + "step": 14034 + }, + { + "epoch": 1.76, + "grad_norm": 9.573639869689941, + "learning_rate": 8.257540894448396e-06, + "loss": 1.1605, + "step": 14035 + }, + { + "epoch": 1.76, + "grad_norm": 4.799045562744141, + "learning_rate": 8.25670417939171e-06, + "loss": 1.4988, + "step": 14036 + }, + { + "epoch": 1.76, + "grad_norm": 11.836624145507812, + "learning_rate": 8.255867464335022e-06, + "loss": 0.992, + "step": 14037 + }, + { + "epoch": 1.76, + "grad_norm": 7.944649696350098, + "learning_rate": 8.255030749278334e-06, + "loss": 1.1869, + "step": 14038 + }, + { + "epoch": 1.76, + "grad_norm": 11.460426330566406, + "learning_rate": 8.254194034221647e-06, + "loss": 2.0168, + "step": 14039 + }, + { + "epoch": 1.76, + "grad_norm": 11.50805377960205, + "learning_rate": 8.253357319164959e-06, + "loss": 2.0397, + "step": 14040 + }, + { + "epoch": 1.76, + "grad_norm": 82.7448959350586, + "learning_rate": 8.252520604108271e-06, + "loss": 1.589, + "step": 14041 + }, + { + "epoch": 1.76, + "grad_norm": 37.63884735107422, + "learning_rate": 8.251683889051583e-06, + "loss": 1.0803, + "step": 14042 + }, + { + "epoch": 1.76, + "grad_norm": 41.49068069458008, + "learning_rate": 8.250847173994897e-06, + "loss": 1.602, + "step": 14043 + }, + { + "epoch": 1.76, + "grad_norm": 9.409775733947754, + "learning_rate": 8.250010458938209e-06, + "loss": 0.7266, + "step": 14044 + }, + { + "epoch": 1.76, + "grad_norm": 19.899547576904297, + "learning_rate": 8.249173743881522e-06, + "loss": 1.347, + "step": 14045 + }, + { + "epoch": 1.76, + "grad_norm": 21.121326446533203, + "learning_rate": 8.248337028824834e-06, + "loss": 0.9298, + "step": 14046 + }, + { + "epoch": 1.76, + "grad_norm": 13.431509017944336, + "learning_rate": 8.247500313768146e-06, + "loss": 0.9772, + "step": 14047 + }, + { + "epoch": 1.76, + "grad_norm": 19.138832092285156, + "learning_rate": 8.24666359871146e-06, + "loss": 0.7483, + "step": 14048 + }, + { + "epoch": 1.76, + "grad_norm": 10.172293663024902, + "learning_rate": 8.245826883654772e-06, + "loss": 0.886, + "step": 14049 + }, + { + "epoch": 1.76, + "grad_norm": 15.299717903137207, + "learning_rate": 8.244990168598085e-06, + "loss": 0.2258, + "step": 14050 + }, + { + "epoch": 1.76, + "grad_norm": 6.301414489746094, + "learning_rate": 8.244153453541397e-06, + "loss": 0.4489, + "step": 14051 + }, + { + "epoch": 1.76, + "grad_norm": 11.925097465515137, + "learning_rate": 8.24331673848471e-06, + "loss": 0.3128, + "step": 14052 + }, + { + "epoch": 1.76, + "grad_norm": 12.037699699401855, + "learning_rate": 8.242480023428023e-06, + "loss": 1.4974, + "step": 14053 + }, + { + "epoch": 1.76, + "grad_norm": 14.054902076721191, + "learning_rate": 8.241643308371335e-06, + "loss": 1.1296, + "step": 14054 + }, + { + "epoch": 1.76, + "grad_norm": 75.2170181274414, + "learning_rate": 8.240806593314647e-06, + "loss": 3.0972, + "step": 14055 + }, + { + "epoch": 1.76, + "grad_norm": 25.608165740966797, + "learning_rate": 8.239969878257959e-06, + "loss": 0.7163, + "step": 14056 + }, + { + "epoch": 1.76, + "grad_norm": 3.8469531536102295, + "learning_rate": 8.239133163201273e-06, + "loss": 0.0497, + "step": 14057 + }, + { + "epoch": 1.76, + "grad_norm": 26.958236694335938, + "learning_rate": 8.238296448144584e-06, + "loss": 2.1268, + "step": 14058 + }, + { + "epoch": 1.76, + "grad_norm": 10.09694766998291, + "learning_rate": 8.237459733087898e-06, + "loss": 0.5967, + "step": 14059 + }, + { + "epoch": 1.76, + "grad_norm": 18.695819854736328, + "learning_rate": 8.23662301803121e-06, + "loss": 1.8521, + "step": 14060 + }, + { + "epoch": 1.76, + "grad_norm": 8.077091217041016, + "learning_rate": 8.235786302974522e-06, + "loss": 0.4429, + "step": 14061 + }, + { + "epoch": 1.76, + "grad_norm": 10.94905948638916, + "learning_rate": 8.234949587917836e-06, + "loss": 1.2048, + "step": 14062 + }, + { + "epoch": 1.76, + "grad_norm": 62.44631576538086, + "learning_rate": 8.234112872861148e-06, + "loss": 1.0883, + "step": 14063 + }, + { + "epoch": 1.77, + "grad_norm": 4.850184440612793, + "learning_rate": 8.233276157804461e-06, + "loss": 0.5964, + "step": 14064 + }, + { + "epoch": 1.77, + "grad_norm": 81.4451675415039, + "learning_rate": 8.232439442747773e-06, + "loss": 2.5383, + "step": 14065 + }, + { + "epoch": 1.77, + "grad_norm": 13.646984100341797, + "learning_rate": 8.231602727691085e-06, + "loss": 1.8475, + "step": 14066 + }, + { + "epoch": 1.77, + "grad_norm": 271.2912902832031, + "learning_rate": 8.230766012634397e-06, + "loss": 2.6256, + "step": 14067 + }, + { + "epoch": 1.77, + "grad_norm": 33.16635513305664, + "learning_rate": 8.22992929757771e-06, + "loss": 1.236, + "step": 14068 + }, + { + "epoch": 1.77, + "grad_norm": 15.70523738861084, + "learning_rate": 8.229092582521023e-06, + "loss": 1.4069, + "step": 14069 + }, + { + "epoch": 1.77, + "grad_norm": 28.73826026916504, + "learning_rate": 8.228255867464335e-06, + "loss": 1.5104, + "step": 14070 + }, + { + "epoch": 1.77, + "grad_norm": 12.562192916870117, + "learning_rate": 8.227419152407648e-06, + "loss": 1.5651, + "step": 14071 + }, + { + "epoch": 1.77, + "grad_norm": 7.837984085083008, + "learning_rate": 8.22658243735096e-06, + "loss": 1.742, + "step": 14072 + }, + { + "epoch": 1.77, + "grad_norm": 6.106812000274658, + "learning_rate": 8.225745722294274e-06, + "loss": 0.6934, + "step": 14073 + }, + { + "epoch": 1.77, + "grad_norm": 7.599335193634033, + "learning_rate": 8.224909007237586e-06, + "loss": 0.5154, + "step": 14074 + }, + { + "epoch": 1.77, + "grad_norm": 27.655122756958008, + "learning_rate": 8.224072292180898e-06, + "loss": 1.2087, + "step": 14075 + }, + { + "epoch": 1.77, + "grad_norm": 18.773101806640625, + "learning_rate": 8.223235577124212e-06, + "loss": 2.5761, + "step": 14076 + }, + { + "epoch": 1.77, + "grad_norm": 7.021270275115967, + "learning_rate": 8.222398862067523e-06, + "loss": 0.5751, + "step": 14077 + }, + { + "epoch": 1.77, + "grad_norm": 7.0785064697265625, + "learning_rate": 8.221562147010837e-06, + "loss": 0.3811, + "step": 14078 + }, + { + "epoch": 1.77, + "grad_norm": 8.324564933776855, + "learning_rate": 8.220725431954149e-06, + "loss": 0.7991, + "step": 14079 + }, + { + "epoch": 1.77, + "grad_norm": 30.627126693725586, + "learning_rate": 8.219888716897461e-06, + "loss": 1.1427, + "step": 14080 + }, + { + "epoch": 1.77, + "grad_norm": 9.819294929504395, + "learning_rate": 8.219052001840773e-06, + "loss": 0.6441, + "step": 14081 + }, + { + "epoch": 1.77, + "grad_norm": 10.612582206726074, + "learning_rate": 8.218215286784087e-06, + "loss": 0.8362, + "step": 14082 + }, + { + "epoch": 1.77, + "grad_norm": 9.291300773620605, + "learning_rate": 8.217378571727399e-06, + "loss": 0.3432, + "step": 14083 + }, + { + "epoch": 1.77, + "grad_norm": 29.742712020874023, + "learning_rate": 8.21654185667071e-06, + "loss": 2.0117, + "step": 14084 + }, + { + "epoch": 1.77, + "grad_norm": 19.590259552001953, + "learning_rate": 8.215705141614024e-06, + "loss": 1.6899, + "step": 14085 + }, + { + "epoch": 1.77, + "grad_norm": 23.724468231201172, + "learning_rate": 8.214868426557336e-06, + "loss": 2.7822, + "step": 14086 + }, + { + "epoch": 1.77, + "grad_norm": 78.61354064941406, + "learning_rate": 8.21403171150065e-06, + "loss": 2.2423, + "step": 14087 + }, + { + "epoch": 1.77, + "grad_norm": 30.620384216308594, + "learning_rate": 8.213194996443962e-06, + "loss": 1.3772, + "step": 14088 + }, + { + "epoch": 1.77, + "grad_norm": 27.140432357788086, + "learning_rate": 8.212358281387274e-06, + "loss": 2.3649, + "step": 14089 + }, + { + "epoch": 1.77, + "grad_norm": 13.873764038085938, + "learning_rate": 8.211521566330587e-06, + "loss": 1.0493, + "step": 14090 + }, + { + "epoch": 1.77, + "grad_norm": 5.370067596435547, + "learning_rate": 8.2106848512739e-06, + "loss": 0.2943, + "step": 14091 + }, + { + "epoch": 1.77, + "grad_norm": 54.14467239379883, + "learning_rate": 8.209848136217213e-06, + "loss": 1.8964, + "step": 14092 + }, + { + "epoch": 1.77, + "grad_norm": 4.073047637939453, + "learning_rate": 8.209011421160525e-06, + "loss": 0.1377, + "step": 14093 + }, + { + "epoch": 1.77, + "grad_norm": 12.148030281066895, + "learning_rate": 8.208174706103837e-06, + "loss": 0.8864, + "step": 14094 + }, + { + "epoch": 1.77, + "grad_norm": 6.65673303604126, + "learning_rate": 8.207337991047149e-06, + "loss": 0.3389, + "step": 14095 + }, + { + "epoch": 1.77, + "grad_norm": 6.8014116287231445, + "learning_rate": 8.20650127599046e-06, + "loss": 1.4713, + "step": 14096 + }, + { + "epoch": 1.77, + "grad_norm": 23.743858337402344, + "learning_rate": 8.205664560933774e-06, + "loss": 0.955, + "step": 14097 + }, + { + "epoch": 1.77, + "grad_norm": 5.458376884460449, + "learning_rate": 8.204827845877086e-06, + "loss": 0.3562, + "step": 14098 + }, + { + "epoch": 1.77, + "grad_norm": 23.411649703979492, + "learning_rate": 8.2039911308204e-06, + "loss": 2.3615, + "step": 14099 + }, + { + "epoch": 1.77, + "grad_norm": 16.24994468688965, + "learning_rate": 8.203154415763712e-06, + "loss": 0.3418, + "step": 14100 + }, + { + "epoch": 1.77, + "grad_norm": 18.841440200805664, + "learning_rate": 8.202317700707026e-06, + "loss": 2.1813, + "step": 14101 + }, + { + "epoch": 1.77, + "grad_norm": 21.16748046875, + "learning_rate": 8.201480985650338e-06, + "loss": 0.3784, + "step": 14102 + }, + { + "epoch": 1.77, + "grad_norm": 7.720735549926758, + "learning_rate": 8.20064427059365e-06, + "loss": 1.1331, + "step": 14103 + }, + { + "epoch": 1.77, + "grad_norm": 6.038744926452637, + "learning_rate": 8.199807555536963e-06, + "loss": 0.2231, + "step": 14104 + }, + { + "epoch": 1.77, + "grad_norm": 26.808597564697266, + "learning_rate": 8.198970840480275e-06, + "loss": 2.4855, + "step": 14105 + }, + { + "epoch": 1.77, + "grad_norm": 16.5147705078125, + "learning_rate": 8.198134125423589e-06, + "loss": 1.165, + "step": 14106 + }, + { + "epoch": 1.77, + "grad_norm": 26.96607208251953, + "learning_rate": 8.1972974103669e-06, + "loss": 1.0483, + "step": 14107 + }, + { + "epoch": 1.77, + "grad_norm": 9.280531883239746, + "learning_rate": 8.196460695310213e-06, + "loss": 0.786, + "step": 14108 + }, + { + "epoch": 1.77, + "grad_norm": 41.95281219482422, + "learning_rate": 8.195623980253525e-06, + "loss": 1.6602, + "step": 14109 + }, + { + "epoch": 1.77, + "grad_norm": 7.45953369140625, + "learning_rate": 8.194787265196837e-06, + "loss": 0.4899, + "step": 14110 + }, + { + "epoch": 1.77, + "grad_norm": 4.454230308532715, + "learning_rate": 8.19395055014015e-06, + "loss": 0.1959, + "step": 14111 + }, + { + "epoch": 1.77, + "grad_norm": 7.201144695281982, + "learning_rate": 8.193113835083462e-06, + "loss": 1.1016, + "step": 14112 + }, + { + "epoch": 1.77, + "grad_norm": 34.14670181274414, + "learning_rate": 8.192277120026776e-06, + "loss": 2.4579, + "step": 14113 + }, + { + "epoch": 1.77, + "grad_norm": 13.853291511535645, + "learning_rate": 8.191440404970088e-06, + "loss": 0.8053, + "step": 14114 + }, + { + "epoch": 1.77, + "grad_norm": 8.906975746154785, + "learning_rate": 8.190603689913401e-06, + "loss": 1.3813, + "step": 14115 + }, + { + "epoch": 1.77, + "grad_norm": 6.100425720214844, + "learning_rate": 8.189766974856713e-06, + "loss": 0.8159, + "step": 14116 + }, + { + "epoch": 1.77, + "grad_norm": 16.280963897705078, + "learning_rate": 8.188930259800025e-06, + "loss": 0.8809, + "step": 14117 + }, + { + "epoch": 1.77, + "grad_norm": 14.504768371582031, + "learning_rate": 8.188093544743339e-06, + "loss": 2.6587, + "step": 14118 + }, + { + "epoch": 1.77, + "grad_norm": 34.045616149902344, + "learning_rate": 8.187256829686651e-06, + "loss": 3.184, + "step": 14119 + }, + { + "epoch": 1.77, + "grad_norm": 20.449115753173828, + "learning_rate": 8.186420114629963e-06, + "loss": 1.1062, + "step": 14120 + }, + { + "epoch": 1.77, + "grad_norm": 13.727039337158203, + "learning_rate": 8.185583399573277e-06, + "loss": 0.9284, + "step": 14121 + }, + { + "epoch": 1.77, + "grad_norm": 10.568921089172363, + "learning_rate": 8.184746684516589e-06, + "loss": 0.9385, + "step": 14122 + }, + { + "epoch": 1.77, + "grad_norm": 19.89391326904297, + "learning_rate": 8.1839099694599e-06, + "loss": 0.916, + "step": 14123 + }, + { + "epoch": 1.77, + "grad_norm": 10.833699226379395, + "learning_rate": 8.183073254403212e-06, + "loss": 0.8083, + "step": 14124 + }, + { + "epoch": 1.77, + "grad_norm": 22.74962615966797, + "learning_rate": 8.182236539346526e-06, + "loss": 2.1172, + "step": 14125 + }, + { + "epoch": 1.77, + "grad_norm": 13.44655704498291, + "learning_rate": 8.181399824289838e-06, + "loss": 1.4098, + "step": 14126 + }, + { + "epoch": 1.77, + "grad_norm": 13.826528549194336, + "learning_rate": 8.180563109233152e-06, + "loss": 2.3645, + "step": 14127 + }, + { + "epoch": 1.77, + "grad_norm": 10.453386306762695, + "learning_rate": 8.179726394176464e-06, + "loss": 1.8492, + "step": 14128 + }, + { + "epoch": 1.77, + "grad_norm": 33.9998664855957, + "learning_rate": 8.178889679119777e-06, + "loss": 1.2318, + "step": 14129 + }, + { + "epoch": 1.77, + "grad_norm": 7.9178619384765625, + "learning_rate": 8.17805296406309e-06, + "loss": 2.0647, + "step": 14130 + }, + { + "epoch": 1.77, + "grad_norm": 267.4306640625, + "learning_rate": 8.177216249006401e-06, + "loss": 0.9789, + "step": 14131 + }, + { + "epoch": 1.77, + "grad_norm": 15.695507049560547, + "learning_rate": 8.176379533949715e-06, + "loss": 1.3647, + "step": 14132 + }, + { + "epoch": 1.77, + "grad_norm": 12.526225090026855, + "learning_rate": 8.175542818893027e-06, + "loss": 1.476, + "step": 14133 + }, + { + "epoch": 1.77, + "grad_norm": 18.775936126708984, + "learning_rate": 8.174706103836339e-06, + "loss": 1.061, + "step": 14134 + }, + { + "epoch": 1.77, + "grad_norm": 3.695633888244629, + "learning_rate": 8.173869388779652e-06, + "loss": 0.1435, + "step": 14135 + }, + { + "epoch": 1.77, + "grad_norm": 11.461366653442383, + "learning_rate": 8.173032673722964e-06, + "loss": 1.7876, + "step": 14136 + }, + { + "epoch": 1.77, + "grad_norm": 16.700559616088867, + "learning_rate": 8.172195958666276e-06, + "loss": 1.9508, + "step": 14137 + }, + { + "epoch": 1.77, + "grad_norm": 21.389442443847656, + "learning_rate": 8.171359243609588e-06, + "loss": 3.045, + "step": 14138 + }, + { + "epoch": 1.77, + "grad_norm": 12.076715469360352, + "learning_rate": 8.170522528552902e-06, + "loss": 1.1381, + "step": 14139 + }, + { + "epoch": 1.77, + "grad_norm": 32.985836029052734, + "learning_rate": 8.169685813496214e-06, + "loss": 1.6389, + "step": 14140 + }, + { + "epoch": 1.77, + "grad_norm": 8.656893730163574, + "learning_rate": 8.168849098439528e-06, + "loss": 0.5599, + "step": 14141 + }, + { + "epoch": 1.77, + "grad_norm": 10.285635948181152, + "learning_rate": 8.16801238338284e-06, + "loss": 0.6236, + "step": 14142 + }, + { + "epoch": 1.77, + "grad_norm": 13.626985549926758, + "learning_rate": 8.167175668326153e-06, + "loss": 2.7633, + "step": 14143 + }, + { + "epoch": 1.78, + "grad_norm": 14.445494651794434, + "learning_rate": 8.166338953269465e-06, + "loss": 1.8344, + "step": 14144 + }, + { + "epoch": 1.78, + "grad_norm": 12.0984468460083, + "learning_rate": 8.165502238212777e-06, + "loss": 1.9759, + "step": 14145 + }, + { + "epoch": 1.78, + "grad_norm": 33.258758544921875, + "learning_rate": 8.16466552315609e-06, + "loss": 2.6521, + "step": 14146 + }, + { + "epoch": 1.78, + "grad_norm": 17.95985984802246, + "learning_rate": 8.163828808099403e-06, + "loss": 0.6462, + "step": 14147 + }, + { + "epoch": 1.78, + "grad_norm": 11.247654914855957, + "learning_rate": 8.162992093042715e-06, + "loss": 0.7743, + "step": 14148 + }, + { + "epoch": 1.78, + "grad_norm": 19.912494659423828, + "learning_rate": 8.162155377986027e-06, + "loss": 0.9983, + "step": 14149 + }, + { + "epoch": 1.78, + "grad_norm": 5.2008585929870605, + "learning_rate": 8.16131866292934e-06, + "loss": 0.5633, + "step": 14150 + }, + { + "epoch": 1.78, + "grad_norm": 7.434240341186523, + "learning_rate": 8.160481947872652e-06, + "loss": 0.2281, + "step": 14151 + }, + { + "epoch": 1.78, + "grad_norm": 47.61603546142578, + "learning_rate": 8.159645232815964e-06, + "loss": 2.083, + "step": 14152 + }, + { + "epoch": 1.78, + "grad_norm": 11.545709609985352, + "learning_rate": 8.158808517759278e-06, + "loss": 1.0948, + "step": 14153 + }, + { + "epoch": 1.78, + "grad_norm": 32.54110336303711, + "learning_rate": 8.15797180270259e-06, + "loss": 2.0602, + "step": 14154 + }, + { + "epoch": 1.78, + "grad_norm": 88.15934753417969, + "learning_rate": 8.157135087645903e-06, + "loss": 2.3074, + "step": 14155 + }, + { + "epoch": 1.78, + "grad_norm": 3.6791977882385254, + "learning_rate": 8.156298372589215e-06, + "loss": 0.4666, + "step": 14156 + }, + { + "epoch": 1.78, + "grad_norm": 14.77776050567627, + "learning_rate": 8.155461657532529e-06, + "loss": 0.6486, + "step": 14157 + }, + { + "epoch": 1.78, + "grad_norm": 12.394466400146484, + "learning_rate": 8.154624942475841e-06, + "loss": 1.6881, + "step": 14158 + }, + { + "epoch": 1.78, + "grad_norm": 23.44948387145996, + "learning_rate": 8.153788227419153e-06, + "loss": 1.8498, + "step": 14159 + }, + { + "epoch": 1.78, + "grad_norm": 219.22438049316406, + "learning_rate": 8.152951512362467e-06, + "loss": 1.3467, + "step": 14160 + }, + { + "epoch": 1.78, + "grad_norm": 5.641712188720703, + "learning_rate": 8.152114797305779e-06, + "loss": 0.3743, + "step": 14161 + }, + { + "epoch": 1.78, + "grad_norm": 16.614871978759766, + "learning_rate": 8.15127808224909e-06, + "loss": 1.1639, + "step": 14162 + }, + { + "epoch": 1.78, + "grad_norm": 8.148021697998047, + "learning_rate": 8.150441367192402e-06, + "loss": 0.7238, + "step": 14163 + }, + { + "epoch": 1.78, + "grad_norm": 5.253996849060059, + "learning_rate": 8.149604652135716e-06, + "loss": 0.2262, + "step": 14164 + }, + { + "epoch": 1.78, + "grad_norm": 27.66922378540039, + "learning_rate": 8.148767937079028e-06, + "loss": 1.6873, + "step": 14165 + }, + { + "epoch": 1.78, + "grad_norm": 22.25579071044922, + "learning_rate": 8.14793122202234e-06, + "loss": 1.5399, + "step": 14166 + }, + { + "epoch": 1.78, + "grad_norm": 13.656011581420898, + "learning_rate": 8.147094506965654e-06, + "loss": 1.4157, + "step": 14167 + }, + { + "epoch": 1.78, + "grad_norm": 15.185012817382812, + "learning_rate": 8.146257791908966e-06, + "loss": 0.71, + "step": 14168 + }, + { + "epoch": 1.78, + "grad_norm": 10.431121826171875, + "learning_rate": 8.14542107685228e-06, + "loss": 2.5234, + "step": 14169 + }, + { + "epoch": 1.78, + "grad_norm": 7.757891654968262, + "learning_rate": 8.144584361795591e-06, + "loss": 0.9268, + "step": 14170 + }, + { + "epoch": 1.78, + "grad_norm": 18.855506896972656, + "learning_rate": 8.143747646738905e-06, + "loss": 0.881, + "step": 14171 + }, + { + "epoch": 1.78, + "grad_norm": 18.44350242614746, + "learning_rate": 8.142910931682217e-06, + "loss": 1.6093, + "step": 14172 + }, + { + "epoch": 1.78, + "grad_norm": 8.91901969909668, + "learning_rate": 8.142074216625529e-06, + "loss": 0.8963, + "step": 14173 + }, + { + "epoch": 1.78, + "grad_norm": 18.291046142578125, + "learning_rate": 8.141237501568842e-06, + "loss": 1.1103, + "step": 14174 + }, + { + "epoch": 1.78, + "grad_norm": 1.663735032081604, + "learning_rate": 8.140400786512154e-06, + "loss": 0.1063, + "step": 14175 + }, + { + "epoch": 1.78, + "grad_norm": 10.795211791992188, + "learning_rate": 8.139564071455466e-06, + "loss": 1.5014, + "step": 14176 + }, + { + "epoch": 1.78, + "grad_norm": 26.331180572509766, + "learning_rate": 8.138727356398778e-06, + "loss": 1.9751, + "step": 14177 + }, + { + "epoch": 1.78, + "grad_norm": 15.649175643920898, + "learning_rate": 8.137890641342092e-06, + "loss": 0.6444, + "step": 14178 + }, + { + "epoch": 1.78, + "grad_norm": 8.898287773132324, + "learning_rate": 8.137053926285404e-06, + "loss": 0.8392, + "step": 14179 + }, + { + "epoch": 1.78, + "grad_norm": 25.118547439575195, + "learning_rate": 8.136217211228716e-06, + "loss": 1.2187, + "step": 14180 + }, + { + "epoch": 1.78, + "grad_norm": 31.6370849609375, + "learning_rate": 8.13538049617203e-06, + "loss": 1.7968, + "step": 14181 + }, + { + "epoch": 1.78, + "grad_norm": 11.394192695617676, + "learning_rate": 8.134543781115341e-06, + "loss": 1.9994, + "step": 14182 + }, + { + "epoch": 1.78, + "grad_norm": 5.50026273727417, + "learning_rate": 8.133707066058655e-06, + "loss": 0.2194, + "step": 14183 + }, + { + "epoch": 1.78, + "grad_norm": 5.361149311065674, + "learning_rate": 8.132870351001967e-06, + "loss": 0.5353, + "step": 14184 + }, + { + "epoch": 1.78, + "grad_norm": 46.48427963256836, + "learning_rate": 8.132033635945279e-06, + "loss": 0.948, + "step": 14185 + }, + { + "epoch": 1.78, + "grad_norm": 5.385955810546875, + "learning_rate": 8.131196920888593e-06, + "loss": 1.5825, + "step": 14186 + }, + { + "epoch": 1.78, + "grad_norm": 90.32432556152344, + "learning_rate": 8.130360205831905e-06, + "loss": 0.8186, + "step": 14187 + }, + { + "epoch": 1.78, + "grad_norm": 25.5366268157959, + "learning_rate": 8.129523490775218e-06, + "loss": 2.5684, + "step": 14188 + }, + { + "epoch": 1.78, + "grad_norm": 43.078800201416016, + "learning_rate": 8.12868677571853e-06, + "loss": 1.8272, + "step": 14189 + }, + { + "epoch": 1.78, + "grad_norm": 7.625771522521973, + "learning_rate": 8.127850060661842e-06, + "loss": 3.1224, + "step": 14190 + }, + { + "epoch": 1.78, + "grad_norm": 21.902969360351562, + "learning_rate": 8.127013345605154e-06, + "loss": 0.4124, + "step": 14191 + }, + { + "epoch": 1.78, + "grad_norm": 13.544557571411133, + "learning_rate": 8.126176630548466e-06, + "loss": 1.4524, + "step": 14192 + }, + { + "epoch": 1.78, + "grad_norm": 62.5804557800293, + "learning_rate": 8.12533991549178e-06, + "loss": 2.0876, + "step": 14193 + }, + { + "epoch": 1.78, + "grad_norm": 5.881214618682861, + "learning_rate": 8.124503200435092e-06, + "loss": 0.6644, + "step": 14194 + }, + { + "epoch": 1.78, + "grad_norm": 13.090020179748535, + "learning_rate": 8.123666485378405e-06, + "loss": 0.4593, + "step": 14195 + }, + { + "epoch": 1.78, + "grad_norm": 12.378315925598145, + "learning_rate": 8.122829770321717e-06, + "loss": 1.5352, + "step": 14196 + }, + { + "epoch": 1.78, + "grad_norm": 15.287260055541992, + "learning_rate": 8.121993055265031e-06, + "loss": 1.8672, + "step": 14197 + }, + { + "epoch": 1.78, + "grad_norm": 17.9168701171875, + "learning_rate": 8.121156340208343e-06, + "loss": 2.1657, + "step": 14198 + }, + { + "epoch": 1.78, + "grad_norm": 179.5956573486328, + "learning_rate": 8.120319625151655e-06, + "loss": 0.6685, + "step": 14199 + }, + { + "epoch": 1.78, + "grad_norm": 7.373868465423584, + "learning_rate": 8.119482910094968e-06, + "loss": 0.5316, + "step": 14200 + }, + { + "epoch": 1.78, + "grad_norm": 18.164833068847656, + "learning_rate": 8.11864619503828e-06, + "loss": 1.935, + "step": 14201 + }, + { + "epoch": 1.78, + "grad_norm": 9.35062026977539, + "learning_rate": 8.117809479981592e-06, + "loss": 0.9188, + "step": 14202 + }, + { + "epoch": 1.78, + "grad_norm": 6.372532844543457, + "learning_rate": 8.116972764924906e-06, + "loss": 0.3663, + "step": 14203 + }, + { + "epoch": 1.78, + "grad_norm": 15.599231719970703, + "learning_rate": 8.116136049868218e-06, + "loss": 0.9538, + "step": 14204 + }, + { + "epoch": 1.78, + "grad_norm": 12.862131118774414, + "learning_rate": 8.11529933481153e-06, + "loss": 1.077, + "step": 14205 + }, + { + "epoch": 1.78, + "grad_norm": 9.15827751159668, + "learning_rate": 8.114462619754842e-06, + "loss": 1.4036, + "step": 14206 + }, + { + "epoch": 1.78, + "grad_norm": 36.890323638916016, + "learning_rate": 8.113625904698156e-06, + "loss": 0.778, + "step": 14207 + }, + { + "epoch": 1.78, + "grad_norm": 14.244501113891602, + "learning_rate": 8.112789189641467e-06, + "loss": 0.4864, + "step": 14208 + }, + { + "epoch": 1.78, + "grad_norm": 9.773188591003418, + "learning_rate": 8.111952474584781e-06, + "loss": 0.7433, + "step": 14209 + }, + { + "epoch": 1.78, + "grad_norm": 13.432384490966797, + "learning_rate": 8.111115759528093e-06, + "loss": 1.1146, + "step": 14210 + }, + { + "epoch": 1.78, + "grad_norm": 10.874054908752441, + "learning_rate": 8.110279044471407e-06, + "loss": 0.5435, + "step": 14211 + }, + { + "epoch": 1.78, + "grad_norm": 4.427059650421143, + "learning_rate": 8.109442329414719e-06, + "loss": 0.1933, + "step": 14212 + }, + { + "epoch": 1.78, + "grad_norm": 9.961760520935059, + "learning_rate": 8.10860561435803e-06, + "loss": 2.4765, + "step": 14213 + }, + { + "epoch": 1.78, + "grad_norm": 11.333853721618652, + "learning_rate": 8.107768899301344e-06, + "loss": 0.7581, + "step": 14214 + }, + { + "epoch": 1.78, + "grad_norm": 15.988113403320312, + "learning_rate": 8.106932184244656e-06, + "loss": 0.8706, + "step": 14215 + }, + { + "epoch": 1.78, + "grad_norm": 11.896188735961914, + "learning_rate": 8.106095469187968e-06, + "loss": 0.7623, + "step": 14216 + }, + { + "epoch": 1.78, + "grad_norm": 21.775907516479492, + "learning_rate": 8.105258754131282e-06, + "loss": 1.1781, + "step": 14217 + }, + { + "epoch": 1.78, + "grad_norm": 20.36774444580078, + "learning_rate": 8.104422039074594e-06, + "loss": 1.7795, + "step": 14218 + }, + { + "epoch": 1.78, + "grad_norm": 9.5056734085083, + "learning_rate": 8.103585324017906e-06, + "loss": 0.9233, + "step": 14219 + }, + { + "epoch": 1.78, + "grad_norm": 130.77703857421875, + "learning_rate": 8.102748608961218e-06, + "loss": 2.7693, + "step": 14220 + }, + { + "epoch": 1.78, + "grad_norm": 13.03814697265625, + "learning_rate": 8.101911893904531e-06, + "loss": 0.494, + "step": 14221 + }, + { + "epoch": 1.78, + "grad_norm": 11.015776634216309, + "learning_rate": 8.101075178847843e-06, + "loss": 0.4717, + "step": 14222 + }, + { + "epoch": 1.78, + "grad_norm": 21.49491310119629, + "learning_rate": 8.100238463791157e-06, + "loss": 2.0776, + "step": 14223 + }, + { + "epoch": 1.79, + "grad_norm": 5.668808460235596, + "learning_rate": 8.099401748734469e-06, + "loss": 0.5059, + "step": 14224 + }, + { + "epoch": 1.79, + "grad_norm": 6.428109169006348, + "learning_rate": 8.098565033677783e-06, + "loss": 1.5735, + "step": 14225 + }, + { + "epoch": 1.79, + "grad_norm": 5.883421897888184, + "learning_rate": 8.097728318621095e-06, + "loss": 0.8716, + "step": 14226 + }, + { + "epoch": 1.79, + "grad_norm": 7.014955997467041, + "learning_rate": 8.096891603564406e-06, + "loss": 0.6923, + "step": 14227 + }, + { + "epoch": 1.79, + "grad_norm": 31.45018196105957, + "learning_rate": 8.09605488850772e-06, + "loss": 0.9072, + "step": 14228 + }, + { + "epoch": 1.79, + "grad_norm": 51.637786865234375, + "learning_rate": 8.095218173451032e-06, + "loss": 3.0075, + "step": 14229 + }, + { + "epoch": 1.79, + "grad_norm": 3.055673837661743, + "learning_rate": 8.094381458394344e-06, + "loss": 0.1078, + "step": 14230 + }, + { + "epoch": 1.79, + "grad_norm": 6.919978141784668, + "learning_rate": 8.093544743337656e-06, + "loss": 0.4284, + "step": 14231 + }, + { + "epoch": 1.79, + "grad_norm": 26.354515075683594, + "learning_rate": 8.09270802828097e-06, + "loss": 1.4861, + "step": 14232 + }, + { + "epoch": 1.79, + "grad_norm": 38.69197463989258, + "learning_rate": 8.091871313224282e-06, + "loss": 4.0157, + "step": 14233 + }, + { + "epoch": 1.79, + "grad_norm": 11.464272499084473, + "learning_rate": 8.091034598167594e-06, + "loss": 1.4697, + "step": 14234 + }, + { + "epoch": 1.79, + "grad_norm": 4.368750095367432, + "learning_rate": 8.090197883110907e-06, + "loss": 0.1549, + "step": 14235 + }, + { + "epoch": 1.79, + "grad_norm": 5.186683654785156, + "learning_rate": 8.08936116805422e-06, + "loss": 0.2221, + "step": 14236 + }, + { + "epoch": 1.79, + "grad_norm": 12.998003005981445, + "learning_rate": 8.088524452997533e-06, + "loss": 1.4524, + "step": 14237 + }, + { + "epoch": 1.79, + "grad_norm": 108.32467651367188, + "learning_rate": 8.087687737940845e-06, + "loss": 1.4167, + "step": 14238 + }, + { + "epoch": 1.79, + "grad_norm": 65.82601928710938, + "learning_rate": 8.086851022884158e-06, + "loss": 1.1568, + "step": 14239 + }, + { + "epoch": 1.79, + "grad_norm": 9.281325340270996, + "learning_rate": 8.08601430782747e-06, + "loss": 0.4164, + "step": 14240 + }, + { + "epoch": 1.79, + "grad_norm": 9.825818061828613, + "learning_rate": 8.085177592770782e-06, + "loss": 0.3728, + "step": 14241 + }, + { + "epoch": 1.79, + "grad_norm": 11.437962532043457, + "learning_rate": 8.084340877714096e-06, + "loss": 0.3512, + "step": 14242 + }, + { + "epoch": 1.79, + "grad_norm": 18.751483917236328, + "learning_rate": 8.083504162657408e-06, + "loss": 2.0616, + "step": 14243 + }, + { + "epoch": 1.79, + "grad_norm": 28.451984405517578, + "learning_rate": 8.08266744760072e-06, + "loss": 1.5825, + "step": 14244 + }, + { + "epoch": 1.79, + "grad_norm": 4.15352725982666, + "learning_rate": 8.081830732544032e-06, + "loss": 0.2291, + "step": 14245 + }, + { + "epoch": 1.79, + "grad_norm": 47.6756591796875, + "learning_rate": 8.080994017487345e-06, + "loss": 1.9295, + "step": 14246 + }, + { + "epoch": 1.79, + "grad_norm": 15.957259178161621, + "learning_rate": 8.080157302430657e-06, + "loss": 1.4346, + "step": 14247 + }, + { + "epoch": 1.79, + "grad_norm": 9.788030624389648, + "learning_rate": 8.07932058737397e-06, + "loss": 0.4594, + "step": 14248 + }, + { + "epoch": 1.79, + "grad_norm": 12.14069938659668, + "learning_rate": 8.078483872317283e-06, + "loss": 2.1253, + "step": 14249 + }, + { + "epoch": 1.79, + "grad_norm": 10.096846580505371, + "learning_rate": 8.077647157260595e-06, + "loss": 0.5818, + "step": 14250 + }, + { + "epoch": 1.79, + "grad_norm": 24.04420280456543, + "learning_rate": 8.076810442203909e-06, + "loss": 2.1495, + "step": 14251 + }, + { + "epoch": 1.79, + "grad_norm": 7.906283378601074, + "learning_rate": 8.07597372714722e-06, + "loss": 1.853, + "step": 14252 + }, + { + "epoch": 1.79, + "grad_norm": 6.8881516456604, + "learning_rate": 8.075137012090534e-06, + "loss": 0.4135, + "step": 14253 + }, + { + "epoch": 1.79, + "grad_norm": 33.854827880859375, + "learning_rate": 8.074300297033846e-06, + "loss": 1.7228, + "step": 14254 + }, + { + "epoch": 1.79, + "grad_norm": 12.448065757751465, + "learning_rate": 8.073463581977158e-06, + "loss": 1.63, + "step": 14255 + }, + { + "epoch": 1.79, + "grad_norm": 8.010289192199707, + "learning_rate": 8.072626866920472e-06, + "loss": 0.3541, + "step": 14256 + }, + { + "epoch": 1.79, + "grad_norm": 18.160022735595703, + "learning_rate": 8.071790151863784e-06, + "loss": 1.7952, + "step": 14257 + }, + { + "epoch": 1.79, + "grad_norm": 27.748252868652344, + "learning_rate": 8.070953436807096e-06, + "loss": 0.6974, + "step": 14258 + }, + { + "epoch": 1.79, + "grad_norm": 17.5703125, + "learning_rate": 8.070116721750408e-06, + "loss": 0.9125, + "step": 14259 + }, + { + "epoch": 1.79, + "grad_norm": 18.15052032470703, + "learning_rate": 8.069280006693721e-06, + "loss": 0.5414, + "step": 14260 + }, + { + "epoch": 1.79, + "grad_norm": 10.297247886657715, + "learning_rate": 8.068443291637033e-06, + "loss": 2.8713, + "step": 14261 + }, + { + "epoch": 1.79, + "grad_norm": 10.959202766418457, + "learning_rate": 8.067606576580345e-06, + "loss": 0.9516, + "step": 14262 + }, + { + "epoch": 1.79, + "grad_norm": 15.244864463806152, + "learning_rate": 8.066769861523659e-06, + "loss": 0.725, + "step": 14263 + }, + { + "epoch": 1.79, + "grad_norm": 13.854104042053223, + "learning_rate": 8.06593314646697e-06, + "loss": 0.5473, + "step": 14264 + }, + { + "epoch": 1.79, + "grad_norm": 113.05213165283203, + "learning_rate": 8.065096431410284e-06, + "loss": 1.9104, + "step": 14265 + }, + { + "epoch": 1.79, + "grad_norm": 8.714113235473633, + "learning_rate": 8.064259716353596e-06, + "loss": 1.2544, + "step": 14266 + }, + { + "epoch": 1.79, + "grad_norm": 9.027963638305664, + "learning_rate": 8.06342300129691e-06, + "loss": 1.4344, + "step": 14267 + }, + { + "epoch": 1.79, + "grad_norm": 13.1775541305542, + "learning_rate": 8.062586286240222e-06, + "loss": 1.0498, + "step": 14268 + }, + { + "epoch": 1.79, + "grad_norm": 8.681795120239258, + "learning_rate": 8.061749571183534e-06, + "loss": 0.1476, + "step": 14269 + }, + { + "epoch": 1.79, + "grad_norm": 37.30206298828125, + "learning_rate": 8.060912856126848e-06, + "loss": 2.6026, + "step": 14270 + }, + { + "epoch": 1.79, + "grad_norm": 18.547700881958008, + "learning_rate": 8.06007614107016e-06, + "loss": 1.3664, + "step": 14271 + }, + { + "epoch": 1.79, + "grad_norm": 28.999473571777344, + "learning_rate": 8.059239426013472e-06, + "loss": 3.6555, + "step": 14272 + }, + { + "epoch": 1.79, + "grad_norm": 14.13399600982666, + "learning_rate": 8.058402710956784e-06, + "loss": 0.6258, + "step": 14273 + }, + { + "epoch": 1.79, + "grad_norm": 19.898513793945312, + "learning_rate": 8.057565995900097e-06, + "loss": 2.7374, + "step": 14274 + }, + { + "epoch": 1.79, + "grad_norm": 22.894289016723633, + "learning_rate": 8.056729280843409e-06, + "loss": 1.3485, + "step": 14275 + }, + { + "epoch": 1.79, + "grad_norm": 9.624289512634277, + "learning_rate": 8.055892565786721e-06, + "loss": 0.9094, + "step": 14276 + }, + { + "epoch": 1.79, + "grad_norm": 14.443469047546387, + "learning_rate": 8.055055850730035e-06, + "loss": 0.8546, + "step": 14277 + }, + { + "epoch": 1.79, + "grad_norm": 18.679956436157227, + "learning_rate": 8.054219135673347e-06, + "loss": 1.6593, + "step": 14278 + }, + { + "epoch": 1.79, + "grad_norm": 13.102360725402832, + "learning_rate": 8.05338242061666e-06, + "loss": 0.5293, + "step": 14279 + }, + { + "epoch": 1.79, + "grad_norm": 9.42329216003418, + "learning_rate": 8.052545705559972e-06, + "loss": 1.0723, + "step": 14280 + }, + { + "epoch": 1.79, + "grad_norm": 6.173603057861328, + "learning_rate": 8.051708990503286e-06, + "loss": 0.0611, + "step": 14281 + }, + { + "epoch": 1.79, + "grad_norm": 13.688335418701172, + "learning_rate": 8.050872275446598e-06, + "loss": 0.5864, + "step": 14282 + }, + { + "epoch": 1.79, + "grad_norm": 11.294967651367188, + "learning_rate": 8.05003556038991e-06, + "loss": 1.3249, + "step": 14283 + }, + { + "epoch": 1.79, + "grad_norm": 11.166751861572266, + "learning_rate": 8.049198845333222e-06, + "loss": 1.4536, + "step": 14284 + }, + { + "epoch": 1.79, + "grad_norm": 19.163232803344727, + "learning_rate": 8.048362130276535e-06, + "loss": 1.6504, + "step": 14285 + }, + { + "epoch": 1.79, + "grad_norm": 13.887179374694824, + "learning_rate": 8.047525415219847e-06, + "loss": 1.0023, + "step": 14286 + }, + { + "epoch": 1.79, + "grad_norm": 13.513861656188965, + "learning_rate": 8.04668870016316e-06, + "loss": 2.4072, + "step": 14287 + }, + { + "epoch": 1.79, + "grad_norm": 38.558311462402344, + "learning_rate": 8.045851985106473e-06, + "loss": 1.6424, + "step": 14288 + }, + { + "epoch": 1.79, + "grad_norm": 10.030594825744629, + "learning_rate": 8.045015270049785e-06, + "loss": 0.9199, + "step": 14289 + }, + { + "epoch": 1.79, + "grad_norm": 18.584436416625977, + "learning_rate": 8.044178554993097e-06, + "loss": 1.5709, + "step": 14290 + }, + { + "epoch": 1.79, + "grad_norm": 7.418522357940674, + "learning_rate": 8.04334183993641e-06, + "loss": 0.5635, + "step": 14291 + }, + { + "epoch": 1.79, + "grad_norm": 19.099185943603516, + "learning_rate": 8.042505124879723e-06, + "loss": 0.1956, + "step": 14292 + }, + { + "epoch": 1.79, + "grad_norm": 7.259555339813232, + "learning_rate": 8.041668409823036e-06, + "loss": 0.8623, + "step": 14293 + }, + { + "epoch": 1.79, + "grad_norm": 7.537904739379883, + "learning_rate": 8.040831694766348e-06, + "loss": 0.4155, + "step": 14294 + }, + { + "epoch": 1.79, + "grad_norm": 5.026714324951172, + "learning_rate": 8.039994979709662e-06, + "loss": 0.2345, + "step": 14295 + }, + { + "epoch": 1.79, + "grad_norm": 15.554062843322754, + "learning_rate": 8.039158264652974e-06, + "loss": 1.1563, + "step": 14296 + }, + { + "epoch": 1.79, + "grad_norm": 17.680042266845703, + "learning_rate": 8.038321549596286e-06, + "loss": 1.4585, + "step": 14297 + }, + { + "epoch": 1.79, + "grad_norm": 32.36421203613281, + "learning_rate": 8.037484834539598e-06, + "loss": 1.1143, + "step": 14298 + }, + { + "epoch": 1.79, + "grad_norm": 87.64347839355469, + "learning_rate": 8.036648119482911e-06, + "loss": 2.7058, + "step": 14299 + }, + { + "epoch": 1.79, + "grad_norm": 11.312725067138672, + "learning_rate": 8.035811404426223e-06, + "loss": 0.7953, + "step": 14300 + }, + { + "epoch": 1.79, + "grad_norm": 8.867121696472168, + "learning_rate": 8.034974689369535e-06, + "loss": 0.476, + "step": 14301 + }, + { + "epoch": 1.79, + "grad_norm": 10.188661575317383, + "learning_rate": 8.034137974312849e-06, + "loss": 0.5231, + "step": 14302 + }, + { + "epoch": 1.79, + "grad_norm": 25.78326988220215, + "learning_rate": 8.03330125925616e-06, + "loss": 2.0026, + "step": 14303 + }, + { + "epoch": 1.8, + "grad_norm": 3.5009584426879883, + "learning_rate": 8.032464544199473e-06, + "loss": 0.2596, + "step": 14304 + }, + { + "epoch": 1.8, + "grad_norm": 11.701454162597656, + "learning_rate": 8.031627829142786e-06, + "loss": 1.0258, + "step": 14305 + }, + { + "epoch": 1.8, + "grad_norm": 33.025184631347656, + "learning_rate": 8.030791114086098e-06, + "loss": 1.0877, + "step": 14306 + }, + { + "epoch": 1.8, + "grad_norm": 6.063713550567627, + "learning_rate": 8.029954399029412e-06, + "loss": 0.3886, + "step": 14307 + }, + { + "epoch": 1.8, + "grad_norm": 10.470044136047363, + "learning_rate": 8.029117683972724e-06, + "loss": 0.4611, + "step": 14308 + }, + { + "epoch": 1.8, + "grad_norm": 20.49445343017578, + "learning_rate": 8.028280968916038e-06, + "loss": 1.836, + "step": 14309 + }, + { + "epoch": 1.8, + "grad_norm": 13.73381233215332, + "learning_rate": 8.02744425385935e-06, + "loss": 0.8764, + "step": 14310 + }, + { + "epoch": 1.8, + "grad_norm": 20.117389678955078, + "learning_rate": 8.026607538802662e-06, + "loss": 1.4659, + "step": 14311 + }, + { + "epoch": 1.8, + "grad_norm": 12.779775619506836, + "learning_rate": 8.025770823745973e-06, + "loss": 0.4024, + "step": 14312 + }, + { + "epoch": 1.8, + "grad_norm": 10.237881660461426, + "learning_rate": 8.024934108689285e-06, + "loss": 2.0096, + "step": 14313 + }, + { + "epoch": 1.8, + "grad_norm": 10.170612335205078, + "learning_rate": 8.024097393632599e-06, + "loss": 1.0357, + "step": 14314 + }, + { + "epoch": 1.8, + "grad_norm": 15.791288375854492, + "learning_rate": 8.023260678575911e-06, + "loss": 0.7219, + "step": 14315 + }, + { + "epoch": 1.8, + "grad_norm": 12.148241996765137, + "learning_rate": 8.022423963519225e-06, + "loss": 1.5481, + "step": 14316 + }, + { + "epoch": 1.8, + "grad_norm": 6.0537261962890625, + "learning_rate": 8.021587248462537e-06, + "loss": 1.1468, + "step": 14317 + }, + { + "epoch": 1.8, + "grad_norm": 10.948807716369629, + "learning_rate": 8.020750533405849e-06, + "loss": 0.6462, + "step": 14318 + }, + { + "epoch": 1.8, + "grad_norm": 9.998846054077148, + "learning_rate": 8.019913818349162e-06, + "loss": 1.608, + "step": 14319 + }, + { + "epoch": 1.8, + "grad_norm": 7.889427661895752, + "learning_rate": 8.019077103292474e-06, + "loss": 0.7336, + "step": 14320 + }, + { + "epoch": 1.8, + "grad_norm": 20.464832305908203, + "learning_rate": 8.018240388235788e-06, + "loss": 1.2012, + "step": 14321 + }, + { + "epoch": 1.8, + "grad_norm": 2.512769937515259, + "learning_rate": 8.0174036731791e-06, + "loss": 0.1633, + "step": 14322 + }, + { + "epoch": 1.8, + "grad_norm": 10.887861251831055, + "learning_rate": 8.016566958122412e-06, + "loss": 1.2358, + "step": 14323 + }, + { + "epoch": 1.8, + "grad_norm": 13.0720796585083, + "learning_rate": 8.015730243065725e-06, + "loss": 0.688, + "step": 14324 + }, + { + "epoch": 1.8, + "grad_norm": 14.510822296142578, + "learning_rate": 8.014893528009037e-06, + "loss": 2.0057, + "step": 14325 + }, + { + "epoch": 1.8, + "grad_norm": 8.325578689575195, + "learning_rate": 8.01405681295235e-06, + "loss": 0.8386, + "step": 14326 + }, + { + "epoch": 1.8, + "grad_norm": 2.9541542530059814, + "learning_rate": 8.013220097895661e-06, + "loss": 0.1813, + "step": 14327 + }, + { + "epoch": 1.8, + "grad_norm": 13.947324752807617, + "learning_rate": 8.012383382838975e-06, + "loss": 0.8859, + "step": 14328 + }, + { + "epoch": 1.8, + "grad_norm": 7.45787239074707, + "learning_rate": 8.011546667782287e-06, + "loss": 2.1223, + "step": 14329 + }, + { + "epoch": 1.8, + "grad_norm": 20.63152503967285, + "learning_rate": 8.0107099527256e-06, + "loss": 0.9686, + "step": 14330 + }, + { + "epoch": 1.8, + "grad_norm": 14.866303443908691, + "learning_rate": 8.009873237668912e-06, + "loss": 2.4096, + "step": 14331 + }, + { + "epoch": 1.8, + "grad_norm": 8.556191444396973, + "learning_rate": 8.009036522612224e-06, + "loss": 1.2394, + "step": 14332 + }, + { + "epoch": 1.8, + "grad_norm": 11.858455657958984, + "learning_rate": 8.008199807555538e-06, + "loss": 1.675, + "step": 14333 + }, + { + "epoch": 1.8, + "grad_norm": 7.185389041900635, + "learning_rate": 8.00736309249885e-06, + "loss": 2.8998, + "step": 14334 + }, + { + "epoch": 1.8, + "grad_norm": 9.578290939331055, + "learning_rate": 8.006526377442164e-06, + "loss": 1.0241, + "step": 14335 + }, + { + "epoch": 1.8, + "grad_norm": 13.962796211242676, + "learning_rate": 8.005689662385476e-06, + "loss": 1.2446, + "step": 14336 + }, + { + "epoch": 1.8, + "grad_norm": 11.544517517089844, + "learning_rate": 8.004852947328788e-06, + "loss": 2.1161, + "step": 14337 + }, + { + "epoch": 1.8, + "grad_norm": 7.659705638885498, + "learning_rate": 8.004016232272101e-06, + "loss": 1.0362, + "step": 14338 + }, + { + "epoch": 1.8, + "grad_norm": 9.644447326660156, + "learning_rate": 8.003179517215413e-06, + "loss": 0.4681, + "step": 14339 + }, + { + "epoch": 1.8, + "grad_norm": 28.88718032836914, + "learning_rate": 8.002342802158725e-06, + "loss": 2.6071, + "step": 14340 + }, + { + "epoch": 1.8, + "grad_norm": 13.294441223144531, + "learning_rate": 8.001506087102037e-06, + "loss": 1.8365, + "step": 14341 + }, + { + "epoch": 1.8, + "grad_norm": 17.310266494750977, + "learning_rate": 8.00066937204535e-06, + "loss": 0.9588, + "step": 14342 + }, + { + "epoch": 1.8, + "grad_norm": 4.964139938354492, + "learning_rate": 7.999832656988663e-06, + "loss": 1.5954, + "step": 14343 + }, + { + "epoch": 1.8, + "grad_norm": 9.747537612915039, + "learning_rate": 7.998995941931976e-06, + "loss": 2.5247, + "step": 14344 + }, + { + "epoch": 1.8, + "grad_norm": 5.901177406311035, + "learning_rate": 7.998159226875288e-06, + "loss": 0.1717, + "step": 14345 + }, + { + "epoch": 1.8, + "grad_norm": 6.909663200378418, + "learning_rate": 7.9973225118186e-06, + "loss": 0.453, + "step": 14346 + }, + { + "epoch": 1.8, + "grad_norm": 14.984963417053223, + "learning_rate": 7.996485796761914e-06, + "loss": 0.9857, + "step": 14347 + }, + { + "epoch": 1.8, + "grad_norm": 5.209060192108154, + "learning_rate": 7.995649081705226e-06, + "loss": 1.797, + "step": 14348 + }, + { + "epoch": 1.8, + "grad_norm": 61.35810852050781, + "learning_rate": 7.99481236664854e-06, + "loss": 2.0787, + "step": 14349 + }, + { + "epoch": 1.8, + "grad_norm": 9.355850219726562, + "learning_rate": 7.993975651591851e-06, + "loss": 0.2956, + "step": 14350 + }, + { + "epoch": 1.8, + "grad_norm": 9.559131622314453, + "learning_rate": 7.993138936535163e-06, + "loss": 0.6137, + "step": 14351 + }, + { + "epoch": 1.8, + "grad_norm": 25.988943099975586, + "learning_rate": 7.992302221478477e-06, + "loss": 1.1929, + "step": 14352 + }, + { + "epoch": 1.8, + "grad_norm": 29.633085250854492, + "learning_rate": 7.991465506421789e-06, + "loss": 1.7604, + "step": 14353 + }, + { + "epoch": 1.8, + "grad_norm": 13.114699363708496, + "learning_rate": 7.990628791365101e-06, + "loss": 1.2965, + "step": 14354 + }, + { + "epoch": 1.8, + "grad_norm": 8.263970375061035, + "learning_rate": 7.989792076308413e-06, + "loss": 0.4648, + "step": 14355 + }, + { + "epoch": 1.8, + "grad_norm": 13.104530334472656, + "learning_rate": 7.988955361251727e-06, + "loss": 1.2807, + "step": 14356 + }, + { + "epoch": 1.8, + "grad_norm": 5.3938703536987305, + "learning_rate": 7.988118646195039e-06, + "loss": 1.6159, + "step": 14357 + }, + { + "epoch": 1.8, + "grad_norm": 40.70235824584961, + "learning_rate": 7.987281931138352e-06, + "loss": 1.9411, + "step": 14358 + }, + { + "epoch": 1.8, + "grad_norm": 23.1292667388916, + "learning_rate": 7.986445216081664e-06, + "loss": 1.7476, + "step": 14359 + }, + { + "epoch": 1.8, + "grad_norm": 27.007932662963867, + "learning_rate": 7.985608501024976e-06, + "loss": 1.8868, + "step": 14360 + }, + { + "epoch": 1.8, + "grad_norm": 14.89322566986084, + "learning_rate": 7.98477178596829e-06, + "loss": 0.9967, + "step": 14361 + }, + { + "epoch": 1.8, + "grad_norm": 28.900524139404297, + "learning_rate": 7.983935070911602e-06, + "loss": 2.4159, + "step": 14362 + }, + { + "epoch": 1.8, + "grad_norm": 6.286021709442139, + "learning_rate": 7.983098355854915e-06, + "loss": 0.9095, + "step": 14363 + }, + { + "epoch": 1.8, + "grad_norm": 11.280611991882324, + "learning_rate": 7.982261640798227e-06, + "loss": 0.294, + "step": 14364 + }, + { + "epoch": 1.8, + "grad_norm": 14.967876434326172, + "learning_rate": 7.98142492574154e-06, + "loss": 1.035, + "step": 14365 + }, + { + "epoch": 1.8, + "grad_norm": 84.54759979248047, + "learning_rate": 7.980588210684851e-06, + "loss": 3.8704, + "step": 14366 + }, + { + "epoch": 1.8, + "grad_norm": 79.22369384765625, + "learning_rate": 7.979751495628165e-06, + "loss": 1.0864, + "step": 14367 + }, + { + "epoch": 1.8, + "grad_norm": 8.267379760742188, + "learning_rate": 7.978914780571477e-06, + "loss": 1.2849, + "step": 14368 + }, + { + "epoch": 1.8, + "grad_norm": 18.904296875, + "learning_rate": 7.978078065514789e-06, + "loss": 0.6259, + "step": 14369 + }, + { + "epoch": 1.8, + "grad_norm": 16.8665771484375, + "learning_rate": 7.977241350458102e-06, + "loss": 1.8917, + "step": 14370 + }, + { + "epoch": 1.8, + "grad_norm": 31.569623947143555, + "learning_rate": 7.976404635401414e-06, + "loss": 1.4306, + "step": 14371 + }, + { + "epoch": 1.8, + "grad_norm": 58.07877731323242, + "learning_rate": 7.975567920344726e-06, + "loss": 3.0194, + "step": 14372 + }, + { + "epoch": 1.8, + "grad_norm": 34.71226501464844, + "learning_rate": 7.97473120528804e-06, + "loss": 1.6203, + "step": 14373 + }, + { + "epoch": 1.8, + "grad_norm": 10.487889289855957, + "learning_rate": 7.973894490231352e-06, + "loss": 0.83, + "step": 14374 + }, + { + "epoch": 1.8, + "grad_norm": 13.767682075500488, + "learning_rate": 7.973057775174666e-06, + "loss": 0.6044, + "step": 14375 + }, + { + "epoch": 1.8, + "grad_norm": 14.913378715515137, + "learning_rate": 7.972221060117978e-06, + "loss": 1.2755, + "step": 14376 + }, + { + "epoch": 1.8, + "grad_norm": 9.47493839263916, + "learning_rate": 7.971384345061291e-06, + "loss": 1.0999, + "step": 14377 + }, + { + "epoch": 1.8, + "grad_norm": 13.59195613861084, + "learning_rate": 7.970547630004603e-06, + "loss": 2.0894, + "step": 14378 + }, + { + "epoch": 1.8, + "grad_norm": 14.368037223815918, + "learning_rate": 7.969710914947915e-06, + "loss": 1.3127, + "step": 14379 + }, + { + "epoch": 1.8, + "grad_norm": 36.372535705566406, + "learning_rate": 7.968874199891227e-06, + "loss": 2.1036, + "step": 14380 + }, + { + "epoch": 1.8, + "grad_norm": 17.263235092163086, + "learning_rate": 7.96803748483454e-06, + "loss": 1.8845, + "step": 14381 + }, + { + "epoch": 1.8, + "grad_norm": 32.459693908691406, + "learning_rate": 7.967200769777853e-06, + "loss": 1.5034, + "step": 14382 + }, + { + "epoch": 1.81, + "grad_norm": 4.841275215148926, + "learning_rate": 7.966364054721165e-06, + "loss": 1.1678, + "step": 14383 + }, + { + "epoch": 1.81, + "grad_norm": 5.018244743347168, + "learning_rate": 7.965527339664478e-06, + "loss": 0.2264, + "step": 14384 + }, + { + "epoch": 1.81, + "grad_norm": 8.65173053741455, + "learning_rate": 7.96469062460779e-06, + "loss": 0.8554, + "step": 14385 + }, + { + "epoch": 1.81, + "grad_norm": 9.671903610229492, + "learning_rate": 7.963853909551102e-06, + "loss": 1.9722, + "step": 14386 + }, + { + "epoch": 1.81, + "grad_norm": 15.357810020446777, + "learning_rate": 7.963017194494416e-06, + "loss": 0.979, + "step": 14387 + }, + { + "epoch": 1.81, + "grad_norm": 9.76091480255127, + "learning_rate": 7.962180479437728e-06, + "loss": 1.3369, + "step": 14388 + }, + { + "epoch": 1.81, + "grad_norm": 9.63212776184082, + "learning_rate": 7.961343764381041e-06, + "loss": 0.4692, + "step": 14389 + }, + { + "epoch": 1.81, + "grad_norm": 21.49897575378418, + "learning_rate": 7.960507049324353e-06, + "loss": 0.9434, + "step": 14390 + }, + { + "epoch": 1.81, + "grad_norm": 6.799454689025879, + "learning_rate": 7.959670334267667e-06, + "loss": 0.3093, + "step": 14391 + }, + { + "epoch": 1.81, + "grad_norm": 30.406591415405273, + "learning_rate": 7.958833619210979e-06, + "loss": 1.7786, + "step": 14392 + }, + { + "epoch": 1.81, + "grad_norm": 14.645602226257324, + "learning_rate": 7.957996904154291e-06, + "loss": 1.3068, + "step": 14393 + }, + { + "epoch": 1.81, + "grad_norm": 5.332977771759033, + "learning_rate": 7.957160189097603e-06, + "loss": 1.8079, + "step": 14394 + }, + { + "epoch": 1.81, + "grad_norm": 13.62096881866455, + "learning_rate": 7.956323474040915e-06, + "loss": 1.9015, + "step": 14395 + }, + { + "epoch": 1.81, + "grad_norm": 15.009598731994629, + "learning_rate": 7.955486758984228e-06, + "loss": 1.5518, + "step": 14396 + }, + { + "epoch": 1.81, + "grad_norm": 21.20696449279785, + "learning_rate": 7.95465004392754e-06, + "loss": 1.5061, + "step": 14397 + }, + { + "epoch": 1.81, + "grad_norm": 21.663663864135742, + "learning_rate": 7.953813328870854e-06, + "loss": 1.268, + "step": 14398 + }, + { + "epoch": 1.81, + "grad_norm": 16.903648376464844, + "learning_rate": 7.952976613814166e-06, + "loss": 2.0583, + "step": 14399 + }, + { + "epoch": 1.81, + "grad_norm": 60.051414489746094, + "learning_rate": 7.952139898757478e-06, + "loss": 0.7644, + "step": 14400 + }, + { + "epoch": 1.81, + "eval_loss": 0.07709412276744843, + "eval_runtime": 98.0422, + "eval_samples_per_second": 36.127, + "eval_steps_per_second": 36.127, + "step": 14400 + }, + { + "epoch": 1.81, + "grad_norm": 10.575669288635254, + "learning_rate": 7.951303183700792e-06, + "loss": 3.3803, + "step": 14401 + }, + { + "epoch": 1.81, + "grad_norm": 15.85393238067627, + "learning_rate": 7.950466468644104e-06, + "loss": 0.9647, + "step": 14402 + }, + { + "epoch": 1.81, + "grad_norm": 6.105818748474121, + "learning_rate": 7.949629753587417e-06, + "loss": 1.2717, + "step": 14403 + }, + { + "epoch": 1.81, + "grad_norm": 21.075420379638672, + "learning_rate": 7.94879303853073e-06, + "loss": 2.2108, + "step": 14404 + }, + { + "epoch": 1.81, + "grad_norm": 7.1770453453063965, + "learning_rate": 7.947956323474041e-06, + "loss": 1.3509, + "step": 14405 + }, + { + "epoch": 1.81, + "grad_norm": 12.135576248168945, + "learning_rate": 7.947119608417355e-06, + "loss": 2.5509, + "step": 14406 + }, + { + "epoch": 1.81, + "grad_norm": 8.19250774383545, + "learning_rate": 7.946282893360667e-06, + "loss": 1.0113, + "step": 14407 + }, + { + "epoch": 1.81, + "grad_norm": 1.2168388366699219, + "learning_rate": 7.945446178303979e-06, + "loss": 0.0406, + "step": 14408 + }, + { + "epoch": 1.81, + "grad_norm": 34.41596984863281, + "learning_rate": 7.94460946324729e-06, + "loss": 3.8037, + "step": 14409 + }, + { + "epoch": 1.81, + "grad_norm": 14.644426345825195, + "learning_rate": 7.943772748190604e-06, + "loss": 1.299, + "step": 14410 + }, + { + "epoch": 1.81, + "grad_norm": 25.0706844329834, + "learning_rate": 7.942936033133916e-06, + "loss": 1.041, + "step": 14411 + }, + { + "epoch": 1.81, + "grad_norm": 8.17798900604248, + "learning_rate": 7.94209931807723e-06, + "loss": 0.5255, + "step": 14412 + }, + { + "epoch": 1.81, + "grad_norm": 20.191364288330078, + "learning_rate": 7.941262603020542e-06, + "loss": 0.7884, + "step": 14413 + }, + { + "epoch": 1.81, + "grad_norm": 8.380793571472168, + "learning_rate": 7.940425887963854e-06, + "loss": 0.6298, + "step": 14414 + }, + { + "epoch": 1.81, + "grad_norm": 5.755029678344727, + "learning_rate": 7.939589172907167e-06, + "loss": 0.4435, + "step": 14415 + }, + { + "epoch": 1.81, + "grad_norm": 8.83551025390625, + "learning_rate": 7.93875245785048e-06, + "loss": 0.4982, + "step": 14416 + }, + { + "epoch": 1.81, + "grad_norm": 8.85175609588623, + "learning_rate": 7.937915742793793e-06, + "loss": 0.5442, + "step": 14417 + }, + { + "epoch": 1.81, + "grad_norm": 13.807879447937012, + "learning_rate": 7.937079027737105e-06, + "loss": 1.4413, + "step": 14418 + }, + { + "epoch": 1.81, + "grad_norm": 12.331297874450684, + "learning_rate": 7.936242312680417e-06, + "loss": 0.4959, + "step": 14419 + }, + { + "epoch": 1.81, + "grad_norm": 208.26344299316406, + "learning_rate": 7.93540559762373e-06, + "loss": 0.5404, + "step": 14420 + }, + { + "epoch": 1.81, + "grad_norm": 23.06793212890625, + "learning_rate": 7.934568882567043e-06, + "loss": 1.0801, + "step": 14421 + }, + { + "epoch": 1.81, + "grad_norm": 11.011923789978027, + "learning_rate": 7.933732167510355e-06, + "loss": 1.3406, + "step": 14422 + }, + { + "epoch": 1.81, + "grad_norm": 9.207767486572266, + "learning_rate": 7.932895452453667e-06, + "loss": 0.8914, + "step": 14423 + }, + { + "epoch": 1.81, + "grad_norm": 2.7527880668640137, + "learning_rate": 7.93205873739698e-06, + "loss": 0.029, + "step": 14424 + }, + { + "epoch": 1.81, + "grad_norm": 11.621235847473145, + "learning_rate": 7.931222022340292e-06, + "loss": 0.2108, + "step": 14425 + }, + { + "epoch": 1.81, + "grad_norm": 11.2488431930542, + "learning_rate": 7.930385307283606e-06, + "loss": 0.9488, + "step": 14426 + }, + { + "epoch": 1.81, + "grad_norm": 54.50273132324219, + "learning_rate": 7.929548592226918e-06, + "loss": 3.4363, + "step": 14427 + }, + { + "epoch": 1.81, + "grad_norm": 527.145263671875, + "learning_rate": 7.92871187717023e-06, + "loss": 1.9903, + "step": 14428 + }, + { + "epoch": 1.81, + "grad_norm": 17.940338134765625, + "learning_rate": 7.927875162113543e-06, + "loss": 2.1123, + "step": 14429 + }, + { + "epoch": 1.81, + "grad_norm": 10.652819633483887, + "learning_rate": 7.927038447056855e-06, + "loss": 0.8332, + "step": 14430 + }, + { + "epoch": 1.81, + "grad_norm": 9.21216106414795, + "learning_rate": 7.926201732000169e-06, + "loss": 0.2088, + "step": 14431 + }, + { + "epoch": 1.81, + "grad_norm": 71.3212661743164, + "learning_rate": 7.925365016943481e-06, + "loss": 2.5172, + "step": 14432 + }, + { + "epoch": 1.81, + "grad_norm": 8.186037063598633, + "learning_rate": 7.924528301886793e-06, + "loss": 0.3602, + "step": 14433 + }, + { + "epoch": 1.81, + "grad_norm": 29.011112213134766, + "learning_rate": 7.923691586830105e-06, + "loss": 1.2023, + "step": 14434 + }, + { + "epoch": 1.81, + "grad_norm": 18.202524185180664, + "learning_rate": 7.922854871773418e-06, + "loss": 1.8014, + "step": 14435 + }, + { + "epoch": 1.81, + "grad_norm": 20.9152774810791, + "learning_rate": 7.92201815671673e-06, + "loss": 1.1115, + "step": 14436 + }, + { + "epoch": 1.81, + "grad_norm": 11.133270263671875, + "learning_rate": 7.921181441660042e-06, + "loss": 0.6653, + "step": 14437 + }, + { + "epoch": 1.81, + "grad_norm": 8.92380428314209, + "learning_rate": 7.920344726603356e-06, + "loss": 0.3128, + "step": 14438 + }, + { + "epoch": 1.81, + "grad_norm": 13.854774475097656, + "learning_rate": 7.919508011546668e-06, + "loss": 1.9934, + "step": 14439 + }, + { + "epoch": 1.81, + "grad_norm": 18.137493133544922, + "learning_rate": 7.918671296489982e-06, + "loss": 0.723, + "step": 14440 + }, + { + "epoch": 1.81, + "grad_norm": 13.559948921203613, + "learning_rate": 7.917834581433294e-06, + "loss": 1.102, + "step": 14441 + }, + { + "epoch": 1.81, + "grad_norm": 6.407212257385254, + "learning_rate": 7.916997866376606e-06, + "loss": 0.3254, + "step": 14442 + }, + { + "epoch": 1.81, + "grad_norm": 10.264758110046387, + "learning_rate": 7.91616115131992e-06, + "loss": 1.9795, + "step": 14443 + }, + { + "epoch": 1.81, + "grad_norm": 71.19867706298828, + "learning_rate": 7.915324436263231e-06, + "loss": 0.6712, + "step": 14444 + }, + { + "epoch": 1.81, + "grad_norm": 9.976835250854492, + "learning_rate": 7.914487721206545e-06, + "loss": 0.8841, + "step": 14445 + }, + { + "epoch": 1.81, + "grad_norm": 19.097244262695312, + "learning_rate": 7.913651006149857e-06, + "loss": 1.1689, + "step": 14446 + }, + { + "epoch": 1.81, + "grad_norm": 5.357875823974609, + "learning_rate": 7.912814291093169e-06, + "loss": 0.1382, + "step": 14447 + }, + { + "epoch": 1.81, + "grad_norm": 42.744598388671875, + "learning_rate": 7.91197757603648e-06, + "loss": 2.3061, + "step": 14448 + }, + { + "epoch": 1.81, + "grad_norm": 16.11562728881836, + "learning_rate": 7.911140860979794e-06, + "loss": 0.6837, + "step": 14449 + }, + { + "epoch": 1.81, + "grad_norm": 75.79932403564453, + "learning_rate": 7.910304145923106e-06, + "loss": 0.8448, + "step": 14450 + }, + { + "epoch": 1.81, + "grad_norm": 11.990781784057617, + "learning_rate": 7.909467430866418e-06, + "loss": 0.762, + "step": 14451 + }, + { + "epoch": 1.81, + "grad_norm": 75.82522583007812, + "learning_rate": 7.908630715809732e-06, + "loss": 1.7107, + "step": 14452 + }, + { + "epoch": 1.81, + "grad_norm": 42.937198638916016, + "learning_rate": 7.907794000753044e-06, + "loss": 1.0458, + "step": 14453 + }, + { + "epoch": 1.81, + "grad_norm": 5.65687370300293, + "learning_rate": 7.906957285696357e-06, + "loss": 0.1425, + "step": 14454 + }, + { + "epoch": 1.81, + "grad_norm": 16.94680404663086, + "learning_rate": 7.90612057063967e-06, + "loss": 0.5582, + "step": 14455 + }, + { + "epoch": 1.81, + "grad_norm": 21.313919067382812, + "learning_rate": 7.905283855582981e-06, + "loss": 1.1394, + "step": 14456 + }, + { + "epoch": 1.81, + "grad_norm": 31.395153045654297, + "learning_rate": 7.904447140526295e-06, + "loss": 1.1894, + "step": 14457 + }, + { + "epoch": 1.81, + "grad_norm": 14.642378807067871, + "learning_rate": 7.903610425469607e-06, + "loss": 0.9938, + "step": 14458 + }, + { + "epoch": 1.81, + "grad_norm": 6.3422017097473145, + "learning_rate": 7.90277371041292e-06, + "loss": 0.7284, + "step": 14459 + }, + { + "epoch": 1.81, + "grad_norm": 9.563097953796387, + "learning_rate": 7.901936995356233e-06, + "loss": 0.4289, + "step": 14460 + }, + { + "epoch": 1.81, + "grad_norm": 31.980749130249023, + "learning_rate": 7.901100280299545e-06, + "loss": 2.886, + "step": 14461 + }, + { + "epoch": 1.81, + "grad_norm": 16.99601936340332, + "learning_rate": 7.900263565242856e-06, + "loss": 0.4885, + "step": 14462 + }, + { + "epoch": 1.82, + "grad_norm": 21.302080154418945, + "learning_rate": 7.89942685018617e-06, + "loss": 0.6471, + "step": 14463 + }, + { + "epoch": 1.82, + "grad_norm": 15.44736099243164, + "learning_rate": 7.898590135129482e-06, + "loss": 1.4289, + "step": 14464 + }, + { + "epoch": 1.82, + "grad_norm": 30.569433212280273, + "learning_rate": 7.897753420072794e-06, + "loss": 0.611, + "step": 14465 + }, + { + "epoch": 1.82, + "grad_norm": 22.321868896484375, + "learning_rate": 7.896916705016108e-06, + "loss": 1.15, + "step": 14466 + }, + { + "epoch": 1.82, + "grad_norm": 10.689000129699707, + "learning_rate": 7.89607998995942e-06, + "loss": 0.9393, + "step": 14467 + }, + { + "epoch": 1.82, + "grad_norm": 112.66310119628906, + "learning_rate": 7.895243274902733e-06, + "loss": 1.1678, + "step": 14468 + }, + { + "epoch": 1.82, + "grad_norm": 53.48061752319336, + "learning_rate": 7.894406559846045e-06, + "loss": 2.8504, + "step": 14469 + }, + { + "epoch": 1.82, + "grad_norm": 17.995603561401367, + "learning_rate": 7.893569844789357e-06, + "loss": 2.6686, + "step": 14470 + }, + { + "epoch": 1.82, + "grad_norm": 9.140344619750977, + "learning_rate": 7.89273312973267e-06, + "loss": 0.9577, + "step": 14471 + }, + { + "epoch": 1.82, + "grad_norm": 12.35569953918457, + "learning_rate": 7.891896414675983e-06, + "loss": 1.2718, + "step": 14472 + }, + { + "epoch": 1.82, + "grad_norm": 18.828767776489258, + "learning_rate": 7.891059699619296e-06, + "loss": 3.5721, + "step": 14473 + }, + { + "epoch": 1.82, + "grad_norm": 8.44033432006836, + "learning_rate": 7.890222984562608e-06, + "loss": 0.3851, + "step": 14474 + }, + { + "epoch": 1.82, + "grad_norm": 8.504138946533203, + "learning_rate": 7.88938626950592e-06, + "loss": 0.5076, + "step": 14475 + }, + { + "epoch": 1.82, + "grad_norm": 7.0276408195495605, + "learning_rate": 7.888549554449232e-06, + "loss": 0.4118, + "step": 14476 + }, + { + "epoch": 1.82, + "grad_norm": 11.683658599853516, + "learning_rate": 7.887712839392544e-06, + "loss": 1.0484, + "step": 14477 + }, + { + "epoch": 1.82, + "grad_norm": 89.66896057128906, + "learning_rate": 7.886876124335858e-06, + "loss": 1.621, + "step": 14478 + }, + { + "epoch": 1.82, + "grad_norm": 12.588370323181152, + "learning_rate": 7.88603940927917e-06, + "loss": 0.4962, + "step": 14479 + }, + { + "epoch": 1.82, + "grad_norm": 17.480802536010742, + "learning_rate": 7.885202694222484e-06, + "loss": 0.9598, + "step": 14480 + }, + { + "epoch": 1.82, + "grad_norm": 20.175277709960938, + "learning_rate": 7.884365979165795e-06, + "loss": 1.4753, + "step": 14481 + }, + { + "epoch": 1.82, + "grad_norm": 1.6198923587799072, + "learning_rate": 7.883529264109109e-06, + "loss": 0.0245, + "step": 14482 + }, + { + "epoch": 1.82, + "grad_norm": 10.016743659973145, + "learning_rate": 7.882692549052421e-06, + "loss": 0.7644, + "step": 14483 + }, + { + "epoch": 1.82, + "grad_norm": 7.49308443069458, + "learning_rate": 7.881855833995733e-06, + "loss": 0.7991, + "step": 14484 + }, + { + "epoch": 1.82, + "grad_norm": 13.122968673706055, + "learning_rate": 7.881019118939047e-06, + "loss": 1.0511, + "step": 14485 + }, + { + "epoch": 1.82, + "grad_norm": 9.601958274841309, + "learning_rate": 7.880182403882359e-06, + "loss": 0.6712, + "step": 14486 + }, + { + "epoch": 1.82, + "grad_norm": 13.161881446838379, + "learning_rate": 7.87934568882567e-06, + "loss": 0.1473, + "step": 14487 + }, + { + "epoch": 1.82, + "grad_norm": 14.766838073730469, + "learning_rate": 7.878508973768984e-06, + "loss": 0.6535, + "step": 14488 + }, + { + "epoch": 1.82, + "grad_norm": 35.4913330078125, + "learning_rate": 7.877672258712296e-06, + "loss": 0.6545, + "step": 14489 + }, + { + "epoch": 1.82, + "grad_norm": 35.55754852294922, + "learning_rate": 7.876835543655608e-06, + "loss": 2.9493, + "step": 14490 + }, + { + "epoch": 1.82, + "grad_norm": 19.41988754272461, + "learning_rate": 7.87599882859892e-06, + "loss": 1.2309, + "step": 14491 + }, + { + "epoch": 1.82, + "grad_norm": 13.2708740234375, + "learning_rate": 7.875162113542234e-06, + "loss": 0.794, + "step": 14492 + }, + { + "epoch": 1.82, + "grad_norm": 22.65144157409668, + "learning_rate": 7.874325398485546e-06, + "loss": 0.7917, + "step": 14493 + }, + { + "epoch": 1.82, + "grad_norm": 47.246463775634766, + "learning_rate": 7.87348868342886e-06, + "loss": 2.1365, + "step": 14494 + }, + { + "epoch": 1.82, + "grad_norm": 10.207687377929688, + "learning_rate": 7.872651968372171e-06, + "loss": 1.2046, + "step": 14495 + }, + { + "epoch": 1.82, + "grad_norm": 27.179967880249023, + "learning_rate": 7.871815253315485e-06, + "loss": 2.4935, + "step": 14496 + }, + { + "epoch": 1.82, + "grad_norm": 15.798068046569824, + "learning_rate": 7.870978538258797e-06, + "loss": 0.8742, + "step": 14497 + }, + { + "epoch": 1.82, + "grad_norm": 14.868792533874512, + "learning_rate": 7.870141823202109e-06, + "loss": 1.1622, + "step": 14498 + }, + { + "epoch": 1.82, + "grad_norm": 23.81890296936035, + "learning_rate": 7.869305108145423e-06, + "loss": 2.0811, + "step": 14499 + }, + { + "epoch": 1.82, + "grad_norm": 10.45630168914795, + "learning_rate": 7.868468393088734e-06, + "loss": 0.8939, + "step": 14500 + }, + { + "epoch": 1.82, + "grad_norm": 8.123615264892578, + "learning_rate": 7.867631678032046e-06, + "loss": 1.2297, + "step": 14501 + }, + { + "epoch": 1.82, + "grad_norm": 13.614934921264648, + "learning_rate": 7.86679496297536e-06, + "loss": 1.1254, + "step": 14502 + }, + { + "epoch": 1.82, + "grad_norm": 15.482418060302734, + "learning_rate": 7.865958247918672e-06, + "loss": 1.7316, + "step": 14503 + }, + { + "epoch": 1.82, + "grad_norm": 9.831905364990234, + "learning_rate": 7.865121532861984e-06, + "loss": 1.8212, + "step": 14504 + }, + { + "epoch": 1.82, + "grad_norm": 6.973489284515381, + "learning_rate": 7.864284817805296e-06, + "loss": 0.3571, + "step": 14505 + }, + { + "epoch": 1.82, + "grad_norm": 18.5034236907959, + "learning_rate": 7.86344810274861e-06, + "loss": 2.5581, + "step": 14506 + }, + { + "epoch": 1.82, + "grad_norm": 5.724293231964111, + "learning_rate": 7.862611387691922e-06, + "loss": 0.198, + "step": 14507 + }, + { + "epoch": 1.82, + "grad_norm": 16.614166259765625, + "learning_rate": 7.861774672635235e-06, + "loss": 1.0352, + "step": 14508 + }, + { + "epoch": 1.82, + "grad_norm": 44.372352600097656, + "learning_rate": 7.860937957578547e-06, + "loss": 2.7569, + "step": 14509 + }, + { + "epoch": 1.82, + "grad_norm": 46.56011962890625, + "learning_rate": 7.86010124252186e-06, + "loss": 1.5423, + "step": 14510 + }, + { + "epoch": 1.82, + "grad_norm": 8.329614639282227, + "learning_rate": 7.859264527465173e-06, + "loss": 1.6975, + "step": 14511 + }, + { + "epoch": 1.82, + "grad_norm": 11.13457202911377, + "learning_rate": 7.858427812408485e-06, + "loss": 1.9166, + "step": 14512 + }, + { + "epoch": 1.82, + "grad_norm": 24.75688934326172, + "learning_rate": 7.857591097351798e-06, + "loss": 0.908, + "step": 14513 + }, + { + "epoch": 1.82, + "grad_norm": 9.076860427856445, + "learning_rate": 7.85675438229511e-06, + "loss": 1.9261, + "step": 14514 + }, + { + "epoch": 1.82, + "grad_norm": 6.307610034942627, + "learning_rate": 7.855917667238422e-06, + "loss": 0.2568, + "step": 14515 + }, + { + "epoch": 1.82, + "grad_norm": 8.085229873657227, + "learning_rate": 7.855080952181734e-06, + "loss": 0.8695, + "step": 14516 + }, + { + "epoch": 1.82, + "grad_norm": 21.385927200317383, + "learning_rate": 7.854244237125048e-06, + "loss": 1.48, + "step": 14517 + }, + { + "epoch": 1.82, + "grad_norm": 17.968355178833008, + "learning_rate": 7.85340752206836e-06, + "loss": 0.8138, + "step": 14518 + }, + { + "epoch": 1.82, + "grad_norm": 7.746437072753906, + "learning_rate": 7.852570807011672e-06, + "loss": 0.8136, + "step": 14519 + }, + { + "epoch": 1.82, + "grad_norm": 15.064603805541992, + "learning_rate": 7.851734091954985e-06, + "loss": 0.8686, + "step": 14520 + }, + { + "epoch": 1.82, + "grad_norm": 11.113859176635742, + "learning_rate": 7.850897376898297e-06, + "loss": 0.6047, + "step": 14521 + }, + { + "epoch": 1.82, + "grad_norm": 78.57706451416016, + "learning_rate": 7.850060661841611e-06, + "loss": 0.8011, + "step": 14522 + }, + { + "epoch": 1.82, + "grad_norm": 13.151373863220215, + "learning_rate": 7.849223946784923e-06, + "loss": 0.7002, + "step": 14523 + }, + { + "epoch": 1.82, + "grad_norm": 12.312122344970703, + "learning_rate": 7.848387231728237e-06, + "loss": 0.9362, + "step": 14524 + }, + { + "epoch": 1.82, + "grad_norm": 17.823299407958984, + "learning_rate": 7.847550516671549e-06, + "loss": 0.4392, + "step": 14525 + }, + { + "epoch": 1.82, + "grad_norm": 27.160539627075195, + "learning_rate": 7.84671380161486e-06, + "loss": 1.3058, + "step": 14526 + }, + { + "epoch": 1.82, + "grad_norm": 4.870408535003662, + "learning_rate": 7.845877086558174e-06, + "loss": 0.2979, + "step": 14527 + }, + { + "epoch": 1.82, + "grad_norm": 42.98374557495117, + "learning_rate": 7.845040371501486e-06, + "loss": 1.6995, + "step": 14528 + }, + { + "epoch": 1.82, + "grad_norm": 25.020404815673828, + "learning_rate": 7.844203656444798e-06, + "loss": 2.2122, + "step": 14529 + }, + { + "epoch": 1.82, + "grad_norm": 27.879436492919922, + "learning_rate": 7.84336694138811e-06, + "loss": 1.0222, + "step": 14530 + }, + { + "epoch": 1.82, + "grad_norm": 3.8494162559509277, + "learning_rate": 7.842530226331424e-06, + "loss": 0.392, + "step": 14531 + }, + { + "epoch": 1.82, + "grad_norm": 18.847829818725586, + "learning_rate": 7.841693511274736e-06, + "loss": 3.4313, + "step": 14532 + }, + { + "epoch": 1.82, + "grad_norm": 18.116872787475586, + "learning_rate": 7.840856796218048e-06, + "loss": 2.1904, + "step": 14533 + }, + { + "epoch": 1.82, + "grad_norm": 18.75259017944336, + "learning_rate": 7.840020081161361e-06, + "loss": 0.673, + "step": 14534 + }, + { + "epoch": 1.82, + "grad_norm": 7.266660213470459, + "learning_rate": 7.839183366104673e-06, + "loss": 1.7115, + "step": 14535 + }, + { + "epoch": 1.82, + "grad_norm": 5.787752628326416, + "learning_rate": 7.838346651047987e-06, + "loss": 0.3172, + "step": 14536 + }, + { + "epoch": 1.82, + "grad_norm": 7.596453666687012, + "learning_rate": 7.837509935991299e-06, + "loss": 0.4714, + "step": 14537 + }, + { + "epoch": 1.82, + "grad_norm": 10.35500717163086, + "learning_rate": 7.836673220934612e-06, + "loss": 0.7654, + "step": 14538 + }, + { + "epoch": 1.82, + "grad_norm": 22.067216873168945, + "learning_rate": 7.835836505877924e-06, + "loss": 3.4565, + "step": 14539 + }, + { + "epoch": 1.82, + "grad_norm": 18.70140266418457, + "learning_rate": 7.834999790821236e-06, + "loss": 1.6778, + "step": 14540 + }, + { + "epoch": 1.82, + "grad_norm": 11.594501495361328, + "learning_rate": 7.83416307576455e-06, + "loss": 0.8165, + "step": 14541 + }, + { + "epoch": 1.82, + "grad_norm": 39.64685821533203, + "learning_rate": 7.833326360707862e-06, + "loss": 3.063, + "step": 14542 + }, + { + "epoch": 1.83, + "grad_norm": 10.162467002868652, + "learning_rate": 7.832489645651174e-06, + "loss": 0.5666, + "step": 14543 + }, + { + "epoch": 1.83, + "grad_norm": 105.2118148803711, + "learning_rate": 7.831652930594486e-06, + "loss": 1.9304, + "step": 14544 + }, + { + "epoch": 1.83, + "grad_norm": 3.5929176807403564, + "learning_rate": 7.830816215537798e-06, + "loss": 0.1045, + "step": 14545 + }, + { + "epoch": 1.83, + "grad_norm": 6.858730316162109, + "learning_rate": 7.829979500481111e-06, + "loss": 0.2439, + "step": 14546 + }, + { + "epoch": 1.83, + "grad_norm": 5.487667560577393, + "learning_rate": 7.829142785424423e-06, + "loss": 0.2675, + "step": 14547 + }, + { + "epoch": 1.83, + "grad_norm": 24.481765747070312, + "learning_rate": 7.828306070367737e-06, + "loss": 1.3063, + "step": 14548 + }, + { + "epoch": 1.83, + "grad_norm": 15.480400085449219, + "learning_rate": 7.827469355311049e-06, + "loss": 2.4106, + "step": 14549 + }, + { + "epoch": 1.83, + "grad_norm": 10.6516752243042, + "learning_rate": 7.826632640254363e-06, + "loss": 1.8167, + "step": 14550 + }, + { + "epoch": 1.83, + "grad_norm": 32.08213424682617, + "learning_rate": 7.825795925197675e-06, + "loss": 1.9974, + "step": 14551 + }, + { + "epoch": 1.83, + "grad_norm": 16.49408531188965, + "learning_rate": 7.824959210140987e-06, + "loss": 1.6478, + "step": 14552 + }, + { + "epoch": 1.83, + "grad_norm": 46.75410842895508, + "learning_rate": 7.8241224950843e-06, + "loss": 0.192, + "step": 14553 + }, + { + "epoch": 1.83, + "grad_norm": 22.896223068237305, + "learning_rate": 7.823285780027612e-06, + "loss": 1.2882, + "step": 14554 + }, + { + "epoch": 1.83, + "grad_norm": 14.967391967773438, + "learning_rate": 7.822449064970926e-06, + "loss": 1.7651, + "step": 14555 + }, + { + "epoch": 1.83, + "grad_norm": 20.13538932800293, + "learning_rate": 7.821612349914238e-06, + "loss": 2.5185, + "step": 14556 + }, + { + "epoch": 1.83, + "grad_norm": 7.627354621887207, + "learning_rate": 7.82077563485755e-06, + "loss": 1.418, + "step": 14557 + }, + { + "epoch": 1.83, + "grad_norm": 16.376483917236328, + "learning_rate": 7.819938919800862e-06, + "loss": 0.2539, + "step": 14558 + }, + { + "epoch": 1.83, + "grad_norm": 19.46470832824707, + "learning_rate": 7.819102204744174e-06, + "loss": 0.8282, + "step": 14559 + }, + { + "epoch": 1.83, + "grad_norm": 10.967860221862793, + "learning_rate": 7.818265489687487e-06, + "loss": 3.0796, + "step": 14560 + }, + { + "epoch": 1.83, + "grad_norm": 29.03691864013672, + "learning_rate": 7.8174287746308e-06, + "loss": 1.3847, + "step": 14561 + }, + { + "epoch": 1.83, + "grad_norm": 24.95924949645996, + "learning_rate": 7.816592059574113e-06, + "loss": 0.65, + "step": 14562 + }, + { + "epoch": 1.83, + "grad_norm": 20.359928131103516, + "learning_rate": 7.815755344517425e-06, + "loss": 1.2683, + "step": 14563 + }, + { + "epoch": 1.83, + "grad_norm": 11.161372184753418, + "learning_rate": 7.814918629460739e-06, + "loss": 0.7836, + "step": 14564 + }, + { + "epoch": 1.83, + "grad_norm": 23.24300193786621, + "learning_rate": 7.81408191440405e-06, + "loss": 1.2312, + "step": 14565 + }, + { + "epoch": 1.83, + "grad_norm": 29.766010284423828, + "learning_rate": 7.813245199347362e-06, + "loss": 1.2391, + "step": 14566 + }, + { + "epoch": 1.83, + "grad_norm": 17.175212860107422, + "learning_rate": 7.812408484290676e-06, + "loss": 0.8, + "step": 14567 + }, + { + "epoch": 1.83, + "grad_norm": 43.0667724609375, + "learning_rate": 7.811571769233988e-06, + "loss": 3.6364, + "step": 14568 + }, + { + "epoch": 1.83, + "grad_norm": 46.05887985229492, + "learning_rate": 7.8107350541773e-06, + "loss": 1.5329, + "step": 14569 + }, + { + "epoch": 1.83, + "grad_norm": 29.1179256439209, + "learning_rate": 7.809898339120614e-06, + "loss": 1.6498, + "step": 14570 + }, + { + "epoch": 1.83, + "grad_norm": 12.148987770080566, + "learning_rate": 7.809061624063926e-06, + "loss": 1.7415, + "step": 14571 + }, + { + "epoch": 1.83, + "grad_norm": 23.32232666015625, + "learning_rate": 7.808224909007238e-06, + "loss": 1.8076, + "step": 14572 + }, + { + "epoch": 1.83, + "grad_norm": 43.80778121948242, + "learning_rate": 7.80738819395055e-06, + "loss": 2.2583, + "step": 14573 + }, + { + "epoch": 1.83, + "grad_norm": 7.639410972595215, + "learning_rate": 7.806551478893863e-06, + "loss": 1.4383, + "step": 14574 + }, + { + "epoch": 1.83, + "grad_norm": 15.752108573913574, + "learning_rate": 7.805714763837175e-06, + "loss": 2.1638, + "step": 14575 + }, + { + "epoch": 1.83, + "grad_norm": 24.567739486694336, + "learning_rate": 7.804878048780489e-06, + "loss": 1.9584, + "step": 14576 + }, + { + "epoch": 1.83, + "grad_norm": 6.122366428375244, + "learning_rate": 7.8040413337238e-06, + "loss": 0.6865, + "step": 14577 + }, + { + "epoch": 1.83, + "grad_norm": 8.692014694213867, + "learning_rate": 7.803204618667114e-06, + "loss": 1.5606, + "step": 14578 + }, + { + "epoch": 1.83, + "grad_norm": 16.712642669677734, + "learning_rate": 7.802367903610426e-06, + "loss": 0.8948, + "step": 14579 + }, + { + "epoch": 1.83, + "grad_norm": 10.310211181640625, + "learning_rate": 7.801531188553738e-06, + "loss": 0.6789, + "step": 14580 + }, + { + "epoch": 1.83, + "grad_norm": 15.829083442687988, + "learning_rate": 7.800694473497052e-06, + "loss": 0.4798, + "step": 14581 + }, + { + "epoch": 1.83, + "grad_norm": 12.211121559143066, + "learning_rate": 7.799857758440364e-06, + "loss": 1.0991, + "step": 14582 + }, + { + "epoch": 1.83, + "grad_norm": 49.79775619506836, + "learning_rate": 7.799021043383676e-06, + "loss": 1.4883, + "step": 14583 + }, + { + "epoch": 1.83, + "grad_norm": 17.090513229370117, + "learning_rate": 7.79818432832699e-06, + "loss": 1.6109, + "step": 14584 + }, + { + "epoch": 1.83, + "grad_norm": 25.796220779418945, + "learning_rate": 7.797347613270301e-06, + "loss": 0.9772, + "step": 14585 + }, + { + "epoch": 1.83, + "grad_norm": 4.717153072357178, + "learning_rate": 7.796510898213613e-06, + "loss": 0.1331, + "step": 14586 + }, + { + "epoch": 1.83, + "grad_norm": 5.1751017570495605, + "learning_rate": 7.795674183156925e-06, + "loss": 0.1403, + "step": 14587 + }, + { + "epoch": 1.83, + "grad_norm": 94.42440032958984, + "learning_rate": 7.794837468100239e-06, + "loss": 1.7024, + "step": 14588 + }, + { + "epoch": 1.83, + "grad_norm": 11.68253231048584, + "learning_rate": 7.794000753043551e-06, + "loss": 0.4139, + "step": 14589 + }, + { + "epoch": 1.83, + "grad_norm": 9.50575065612793, + "learning_rate": 7.793164037986865e-06, + "loss": 1.5483, + "step": 14590 + }, + { + "epoch": 1.83, + "grad_norm": 66.21617126464844, + "learning_rate": 7.792327322930177e-06, + "loss": 1.3414, + "step": 14591 + }, + { + "epoch": 1.83, + "grad_norm": 8.727616310119629, + "learning_rate": 7.79149060787349e-06, + "loss": 0.9599, + "step": 14592 + }, + { + "epoch": 1.83, + "grad_norm": 27.565093994140625, + "learning_rate": 7.790653892816802e-06, + "loss": 1.7694, + "step": 14593 + }, + { + "epoch": 1.83, + "grad_norm": 6.212733268737793, + "learning_rate": 7.789817177760114e-06, + "loss": 0.6293, + "step": 14594 + }, + { + "epoch": 1.83, + "grad_norm": 13.79759407043457, + "learning_rate": 7.788980462703428e-06, + "loss": 0.9985, + "step": 14595 + }, + { + "epoch": 1.83, + "grad_norm": 4.914252281188965, + "learning_rate": 7.78814374764674e-06, + "loss": 0.1219, + "step": 14596 + }, + { + "epoch": 1.83, + "grad_norm": 27.71527099609375, + "learning_rate": 7.787307032590052e-06, + "loss": 2.2464, + "step": 14597 + }, + { + "epoch": 1.83, + "grad_norm": 28.01056671142578, + "learning_rate": 7.786470317533364e-06, + "loss": 0.8147, + "step": 14598 + }, + { + "epoch": 1.83, + "grad_norm": 29.393339157104492, + "learning_rate": 7.785633602476677e-06, + "loss": 1.8519, + "step": 14599 + }, + { + "epoch": 1.83, + "grad_norm": 13.588300704956055, + "learning_rate": 7.78479688741999e-06, + "loss": 1.0838, + "step": 14600 + }, + { + "epoch": 1.83, + "grad_norm": 26.342918395996094, + "learning_rate": 7.783960172363301e-06, + "loss": 1.3437, + "step": 14601 + }, + { + "epoch": 1.83, + "grad_norm": 21.386323928833008, + "learning_rate": 7.783123457306615e-06, + "loss": 1.8337, + "step": 14602 + }, + { + "epoch": 1.83, + "grad_norm": 18.504051208496094, + "learning_rate": 7.782286742249927e-06, + "loss": 2.125, + "step": 14603 + }, + { + "epoch": 1.83, + "grad_norm": 17.427810668945312, + "learning_rate": 7.78145002719324e-06, + "loss": 0.9518, + "step": 14604 + }, + { + "epoch": 1.83, + "grad_norm": 46.07899856567383, + "learning_rate": 7.780613312136552e-06, + "loss": 1.5458, + "step": 14605 + }, + { + "epoch": 1.83, + "grad_norm": 15.297252655029297, + "learning_rate": 7.779776597079866e-06, + "loss": 1.9646, + "step": 14606 + }, + { + "epoch": 1.83, + "grad_norm": 27.815570831298828, + "learning_rate": 7.778939882023178e-06, + "loss": 2.0745, + "step": 14607 + }, + { + "epoch": 1.83, + "grad_norm": 5.807898998260498, + "learning_rate": 7.77810316696649e-06, + "loss": 0.5374, + "step": 14608 + }, + { + "epoch": 1.83, + "grad_norm": 22.669795989990234, + "learning_rate": 7.777266451909804e-06, + "loss": 1.9899, + "step": 14609 + }, + { + "epoch": 1.83, + "grad_norm": 11.786237716674805, + "learning_rate": 7.776429736853116e-06, + "loss": 1.2184, + "step": 14610 + }, + { + "epoch": 1.83, + "grad_norm": 6.201798915863037, + "learning_rate": 7.775593021796428e-06, + "loss": 0.2181, + "step": 14611 + }, + { + "epoch": 1.83, + "grad_norm": 10.68503189086914, + "learning_rate": 7.77475630673974e-06, + "loss": 1.588, + "step": 14612 + }, + { + "epoch": 1.83, + "grad_norm": 13.119375228881836, + "learning_rate": 7.773919591683053e-06, + "loss": 0.7835, + "step": 14613 + }, + { + "epoch": 1.83, + "grad_norm": 7.963985443115234, + "learning_rate": 7.773082876626365e-06, + "loss": 0.2345, + "step": 14614 + }, + { + "epoch": 1.83, + "grad_norm": 26.554868698120117, + "learning_rate": 7.772246161569677e-06, + "loss": 2.4617, + "step": 14615 + }, + { + "epoch": 1.83, + "grad_norm": 19.229042053222656, + "learning_rate": 7.77140944651299e-06, + "loss": 1.583, + "step": 14616 + }, + { + "epoch": 1.83, + "grad_norm": 24.596824645996094, + "learning_rate": 7.770572731456303e-06, + "loss": 0.5835, + "step": 14617 + }, + { + "epoch": 1.83, + "grad_norm": 14.019936561584473, + "learning_rate": 7.769736016399616e-06, + "loss": 1.1474, + "step": 14618 + }, + { + "epoch": 1.83, + "grad_norm": 14.718634605407715, + "learning_rate": 7.768899301342928e-06, + "loss": 3.4, + "step": 14619 + }, + { + "epoch": 1.83, + "grad_norm": 10.23465347290039, + "learning_rate": 7.768062586286242e-06, + "loss": 1.4429, + "step": 14620 + }, + { + "epoch": 1.83, + "grad_norm": 54.38119888305664, + "learning_rate": 7.767225871229554e-06, + "loss": 3.5724, + "step": 14621 + }, + { + "epoch": 1.84, + "grad_norm": 8.59969711303711, + "learning_rate": 7.766389156172866e-06, + "loss": 0.463, + "step": 14622 + }, + { + "epoch": 1.84, + "grad_norm": 16.999919891357422, + "learning_rate": 7.76555244111618e-06, + "loss": 0.4796, + "step": 14623 + }, + { + "epoch": 1.84, + "grad_norm": 8.698204040527344, + "learning_rate": 7.764715726059491e-06, + "loss": 1.8318, + "step": 14624 + }, + { + "epoch": 1.84, + "grad_norm": 46.9920768737793, + "learning_rate": 7.763879011002803e-06, + "loss": 1.7167, + "step": 14625 + }, + { + "epoch": 1.84, + "grad_norm": 15.677593231201172, + "learning_rate": 7.763042295946115e-06, + "loss": 0.9368, + "step": 14626 + }, + { + "epoch": 1.84, + "grad_norm": 10.155533790588379, + "learning_rate": 7.762205580889429e-06, + "loss": 1.7626, + "step": 14627 + }, + { + "epoch": 1.84, + "grad_norm": 11.302998542785645, + "learning_rate": 7.761368865832741e-06, + "loss": 2.3986, + "step": 14628 + }, + { + "epoch": 1.84, + "grad_norm": 29.012435913085938, + "learning_rate": 7.760532150776053e-06, + "loss": 2.3208, + "step": 14629 + }, + { + "epoch": 1.84, + "grad_norm": 50.5739631652832, + "learning_rate": 7.759695435719367e-06, + "loss": 2.657, + "step": 14630 + }, + { + "epoch": 1.84, + "grad_norm": 22.324176788330078, + "learning_rate": 7.758858720662678e-06, + "loss": 1.8601, + "step": 14631 + }, + { + "epoch": 1.84, + "grad_norm": 37.7600212097168, + "learning_rate": 7.758022005605992e-06, + "loss": 2.0259, + "step": 14632 + }, + { + "epoch": 1.84, + "grad_norm": 22.62678337097168, + "learning_rate": 7.757185290549304e-06, + "loss": 1.1992, + "step": 14633 + }, + { + "epoch": 1.84, + "grad_norm": 7.476841449737549, + "learning_rate": 7.756348575492618e-06, + "loss": 0.6818, + "step": 14634 + }, + { + "epoch": 1.84, + "grad_norm": 16.30000114440918, + "learning_rate": 7.75551186043593e-06, + "loss": 1.5676, + "step": 14635 + }, + { + "epoch": 1.84, + "grad_norm": 5.904687404632568, + "learning_rate": 7.754675145379242e-06, + "loss": 1.7429, + "step": 14636 + }, + { + "epoch": 1.84, + "grad_norm": 6.143351078033447, + "learning_rate": 7.753838430322555e-06, + "loss": 1.8704, + "step": 14637 + }, + { + "epoch": 1.84, + "grad_norm": 8.69153118133545, + "learning_rate": 7.753001715265867e-06, + "loss": 0.5042, + "step": 14638 + }, + { + "epoch": 1.84, + "grad_norm": 12.439741134643555, + "learning_rate": 7.75216500020918e-06, + "loss": 0.3078, + "step": 14639 + }, + { + "epoch": 1.84, + "grad_norm": 5.229491233825684, + "learning_rate": 7.751328285152491e-06, + "loss": 1.3007, + "step": 14640 + }, + { + "epoch": 1.84, + "grad_norm": 19.291664123535156, + "learning_rate": 7.750491570095805e-06, + "loss": 0.5928, + "step": 14641 + }, + { + "epoch": 1.84, + "grad_norm": 11.47659683227539, + "learning_rate": 7.749654855039117e-06, + "loss": 1.8454, + "step": 14642 + }, + { + "epoch": 1.84, + "grad_norm": 11.25069808959961, + "learning_rate": 7.748818139982429e-06, + "loss": 0.8562, + "step": 14643 + }, + { + "epoch": 1.84, + "grad_norm": 7.662071228027344, + "learning_rate": 7.747981424925742e-06, + "loss": 0.5109, + "step": 14644 + }, + { + "epoch": 1.84, + "grad_norm": 21.549152374267578, + "learning_rate": 7.747144709869054e-06, + "loss": 0.6217, + "step": 14645 + }, + { + "epoch": 1.84, + "grad_norm": 9.104865074157715, + "learning_rate": 7.746307994812368e-06, + "loss": 1.246, + "step": 14646 + }, + { + "epoch": 1.84, + "grad_norm": 7.59805154800415, + "learning_rate": 7.74547127975568e-06, + "loss": 0.4916, + "step": 14647 + }, + { + "epoch": 1.84, + "grad_norm": 8.185526847839355, + "learning_rate": 7.744634564698994e-06, + "loss": 1.2289, + "step": 14648 + }, + { + "epoch": 1.84, + "grad_norm": 13.667840957641602, + "learning_rate": 7.743797849642306e-06, + "loss": 1.3791, + "step": 14649 + }, + { + "epoch": 1.84, + "grad_norm": 14.437935829162598, + "learning_rate": 7.742961134585617e-06, + "loss": 1.9665, + "step": 14650 + }, + { + "epoch": 1.84, + "grad_norm": 5.515650749206543, + "learning_rate": 7.74212441952893e-06, + "loss": 0.9934, + "step": 14651 + }, + { + "epoch": 1.84, + "grad_norm": 9.91360855102539, + "learning_rate": 7.741287704472243e-06, + "loss": 1.5728, + "step": 14652 + }, + { + "epoch": 1.84, + "grad_norm": 13.074822425842285, + "learning_rate": 7.740450989415555e-06, + "loss": 0.7348, + "step": 14653 + }, + { + "epoch": 1.84, + "grad_norm": 26.621301651000977, + "learning_rate": 7.739614274358867e-06, + "loss": 2.1105, + "step": 14654 + }, + { + "epoch": 1.84, + "grad_norm": 10.325733184814453, + "learning_rate": 7.73877755930218e-06, + "loss": 0.3978, + "step": 14655 + }, + { + "epoch": 1.84, + "grad_norm": 6.7510576248168945, + "learning_rate": 7.737940844245493e-06, + "loss": 1.862, + "step": 14656 + }, + { + "epoch": 1.84, + "grad_norm": 8.192282676696777, + "learning_rate": 7.737104129188805e-06, + "loss": 0.5877, + "step": 14657 + }, + { + "epoch": 1.84, + "grad_norm": 8.965519905090332, + "learning_rate": 7.736267414132118e-06, + "loss": 0.5455, + "step": 14658 + }, + { + "epoch": 1.84, + "grad_norm": 15.416940689086914, + "learning_rate": 7.73543069907543e-06, + "loss": 3.6079, + "step": 14659 + }, + { + "epoch": 1.84, + "grad_norm": 27.16056251525879, + "learning_rate": 7.734593984018744e-06, + "loss": 1.0839, + "step": 14660 + }, + { + "epoch": 1.84, + "grad_norm": 16.25208854675293, + "learning_rate": 7.733757268962056e-06, + "loss": 0.6543, + "step": 14661 + }, + { + "epoch": 1.84, + "grad_norm": 19.351118087768555, + "learning_rate": 7.73292055390537e-06, + "loss": 3.4736, + "step": 14662 + }, + { + "epoch": 1.84, + "grad_norm": 7.313667297363281, + "learning_rate": 7.732083838848681e-06, + "loss": 0.3627, + "step": 14663 + }, + { + "epoch": 1.84, + "grad_norm": 34.64920425415039, + "learning_rate": 7.731247123791993e-06, + "loss": 2.8451, + "step": 14664 + }, + { + "epoch": 1.84, + "grad_norm": 156.4160614013672, + "learning_rate": 7.730410408735305e-06, + "loss": 1.3522, + "step": 14665 + }, + { + "epoch": 1.84, + "grad_norm": 28.074020385742188, + "learning_rate": 7.729573693678619e-06, + "loss": 1.6769, + "step": 14666 + }, + { + "epoch": 1.84, + "grad_norm": 20.513277053833008, + "learning_rate": 7.728736978621931e-06, + "loss": 0.4861, + "step": 14667 + }, + { + "epoch": 1.84, + "grad_norm": 22.602537155151367, + "learning_rate": 7.727900263565243e-06, + "loss": 2.1944, + "step": 14668 + }, + { + "epoch": 1.84, + "grad_norm": 15.476371765136719, + "learning_rate": 7.727063548508556e-06, + "loss": 1.178, + "step": 14669 + }, + { + "epoch": 1.84, + "grad_norm": 13.422551155090332, + "learning_rate": 7.726226833451868e-06, + "loss": 0.803, + "step": 14670 + }, + { + "epoch": 1.84, + "grad_norm": 16.891130447387695, + "learning_rate": 7.72539011839518e-06, + "loss": 3.0551, + "step": 14671 + }, + { + "epoch": 1.84, + "grad_norm": 20.250980377197266, + "learning_rate": 7.724553403338494e-06, + "loss": 0.2396, + "step": 14672 + }, + { + "epoch": 1.84, + "grad_norm": 11.72658634185791, + "learning_rate": 7.723716688281806e-06, + "loss": 0.6455, + "step": 14673 + }, + { + "epoch": 1.84, + "grad_norm": 6.41309118270874, + "learning_rate": 7.72287997322512e-06, + "loss": 0.7244, + "step": 14674 + }, + { + "epoch": 1.84, + "grad_norm": 79.53043365478516, + "learning_rate": 7.722043258168432e-06, + "loss": 1.1777, + "step": 14675 + }, + { + "epoch": 1.84, + "grad_norm": 70.87897491455078, + "learning_rate": 7.721206543111745e-06, + "loss": 1.7493, + "step": 14676 + }, + { + "epoch": 1.84, + "grad_norm": 21.449668884277344, + "learning_rate": 7.720369828055057e-06, + "loss": 1.5234, + "step": 14677 + }, + { + "epoch": 1.84, + "grad_norm": 88.9782485961914, + "learning_rate": 7.719533112998369e-06, + "loss": 3.8735, + "step": 14678 + }, + { + "epoch": 1.84, + "grad_norm": 9.213892936706543, + "learning_rate": 7.718696397941681e-06, + "loss": 0.8011, + "step": 14679 + }, + { + "epoch": 1.84, + "grad_norm": 625.9251708984375, + "learning_rate": 7.717859682884993e-06, + "loss": 0.577, + "step": 14680 + }, + { + "epoch": 1.84, + "grad_norm": 4.340551376342773, + "learning_rate": 7.717022967828307e-06, + "loss": 0.2504, + "step": 14681 + }, + { + "epoch": 1.84, + "grad_norm": 1.4213769435882568, + "learning_rate": 7.716186252771619e-06, + "loss": 0.0401, + "step": 14682 + }, + { + "epoch": 1.84, + "grad_norm": 67.31068420410156, + "learning_rate": 7.715349537714932e-06, + "loss": 3.0705, + "step": 14683 + }, + { + "epoch": 1.84, + "grad_norm": 12.602607727050781, + "learning_rate": 7.714512822658244e-06, + "loss": 0.7183, + "step": 14684 + }, + { + "epoch": 1.84, + "grad_norm": 21.028234481811523, + "learning_rate": 7.713676107601556e-06, + "loss": 1.0286, + "step": 14685 + }, + { + "epoch": 1.84, + "grad_norm": 8.835209846496582, + "learning_rate": 7.71283939254487e-06, + "loss": 1.2344, + "step": 14686 + }, + { + "epoch": 1.84, + "grad_norm": 5.893700122833252, + "learning_rate": 7.712002677488182e-06, + "loss": 1.1592, + "step": 14687 + }, + { + "epoch": 1.84, + "grad_norm": 11.222153663635254, + "learning_rate": 7.711165962431495e-06, + "loss": 0.6997, + "step": 14688 + }, + { + "epoch": 1.84, + "grad_norm": 10.17320442199707, + "learning_rate": 7.710329247374807e-06, + "loss": 0.9973, + "step": 14689 + }, + { + "epoch": 1.84, + "grad_norm": 19.59346580505371, + "learning_rate": 7.70949253231812e-06, + "loss": 0.9439, + "step": 14690 + }, + { + "epoch": 1.84, + "grad_norm": 6.132411956787109, + "learning_rate": 7.708655817261433e-06, + "loss": 0.5027, + "step": 14691 + }, + { + "epoch": 1.84, + "grad_norm": 9.631757736206055, + "learning_rate": 7.707819102204745e-06, + "loss": 0.4147, + "step": 14692 + }, + { + "epoch": 1.84, + "grad_norm": 12.173003196716309, + "learning_rate": 7.706982387148057e-06, + "loss": 2.1582, + "step": 14693 + }, + { + "epoch": 1.84, + "grad_norm": 104.57372283935547, + "learning_rate": 7.706145672091369e-06, + "loss": 2.0988, + "step": 14694 + }, + { + "epoch": 1.84, + "grad_norm": 42.085662841796875, + "learning_rate": 7.705308957034683e-06, + "loss": 1.8397, + "step": 14695 + }, + { + "epoch": 1.84, + "grad_norm": 8.223543167114258, + "learning_rate": 7.704472241977994e-06, + "loss": 0.3691, + "step": 14696 + }, + { + "epoch": 1.84, + "grad_norm": 39.029788970947266, + "learning_rate": 7.703635526921308e-06, + "loss": 1.5826, + "step": 14697 + }, + { + "epoch": 1.84, + "grad_norm": 20.499826431274414, + "learning_rate": 7.70279881186462e-06, + "loss": 1.1586, + "step": 14698 + }, + { + "epoch": 1.84, + "grad_norm": 28.13475799560547, + "learning_rate": 7.701962096807932e-06, + "loss": 0.3709, + "step": 14699 + }, + { + "epoch": 1.84, + "grad_norm": 16.042461395263672, + "learning_rate": 7.701125381751246e-06, + "loss": 0.8329, + "step": 14700 + }, + { + "epoch": 1.84, + "grad_norm": 12.51712703704834, + "learning_rate": 7.700288666694558e-06, + "loss": 0.8813, + "step": 14701 + }, + { + "epoch": 1.85, + "grad_norm": 29.742050170898438, + "learning_rate": 7.699451951637871e-06, + "loss": 2.4081, + "step": 14702 + }, + { + "epoch": 1.85, + "grad_norm": 44.979732513427734, + "learning_rate": 7.698615236581183e-06, + "loss": 0.6644, + "step": 14703 + }, + { + "epoch": 1.85, + "grad_norm": 12.372272491455078, + "learning_rate": 7.697778521524495e-06, + "loss": 1.7375, + "step": 14704 + }, + { + "epoch": 1.85, + "grad_norm": 22.812719345092773, + "learning_rate": 7.696941806467809e-06, + "loss": 1.7477, + "step": 14705 + }, + { + "epoch": 1.85, + "grad_norm": 36.66621780395508, + "learning_rate": 7.69610509141112e-06, + "loss": 2.3893, + "step": 14706 + }, + { + "epoch": 1.85, + "grad_norm": 11.402688026428223, + "learning_rate": 7.695268376354433e-06, + "loss": 1.1545, + "step": 14707 + }, + { + "epoch": 1.85, + "grad_norm": 6.952476978302002, + "learning_rate": 7.694431661297745e-06, + "loss": 0.3191, + "step": 14708 + }, + { + "epoch": 1.85, + "grad_norm": 68.07209014892578, + "learning_rate": 7.693594946241058e-06, + "loss": 1.3018, + "step": 14709 + }, + { + "epoch": 1.85, + "grad_norm": 17.782407760620117, + "learning_rate": 7.69275823118437e-06, + "loss": 1.2765, + "step": 14710 + }, + { + "epoch": 1.85, + "grad_norm": 64.7061538696289, + "learning_rate": 7.691921516127684e-06, + "loss": 1.2451, + "step": 14711 + }, + { + "epoch": 1.85, + "grad_norm": 9.56583023071289, + "learning_rate": 7.691084801070996e-06, + "loss": 0.5217, + "step": 14712 + }, + { + "epoch": 1.85, + "grad_norm": 43.400611877441406, + "learning_rate": 7.690248086014308e-06, + "loss": 1.5767, + "step": 14713 + }, + { + "epoch": 1.85, + "grad_norm": 12.612919807434082, + "learning_rate": 7.689411370957622e-06, + "loss": 1.3096, + "step": 14714 + }, + { + "epoch": 1.85, + "grad_norm": 13.8457612991333, + "learning_rate": 7.688574655900933e-06, + "loss": 1.7108, + "step": 14715 + }, + { + "epoch": 1.85, + "grad_norm": 16.337207794189453, + "learning_rate": 7.687737940844247e-06, + "loss": 0.9688, + "step": 14716 + }, + { + "epoch": 1.85, + "grad_norm": 68.52069091796875, + "learning_rate": 7.686901225787559e-06, + "loss": 1.7413, + "step": 14717 + }, + { + "epoch": 1.85, + "grad_norm": 12.928807258605957, + "learning_rate": 7.686064510730871e-06, + "loss": 0.5263, + "step": 14718 + }, + { + "epoch": 1.85, + "grad_norm": 12.465153694152832, + "learning_rate": 7.685227795674185e-06, + "loss": 0.6653, + "step": 14719 + }, + { + "epoch": 1.85, + "grad_norm": 10.703567504882812, + "learning_rate": 7.684391080617497e-06, + "loss": 1.7437, + "step": 14720 + }, + { + "epoch": 1.85, + "grad_norm": 16.249418258666992, + "learning_rate": 7.683554365560809e-06, + "loss": 1.6232, + "step": 14721 + }, + { + "epoch": 1.85, + "grad_norm": 12.612680435180664, + "learning_rate": 7.68271765050412e-06, + "loss": 1.3508, + "step": 14722 + }, + { + "epoch": 1.85, + "grad_norm": 17.31722068786621, + "learning_rate": 7.681880935447434e-06, + "loss": 1.5709, + "step": 14723 + }, + { + "epoch": 1.85, + "grad_norm": 12.659685134887695, + "learning_rate": 7.681044220390746e-06, + "loss": 1.0214, + "step": 14724 + }, + { + "epoch": 1.85, + "grad_norm": 69.76463317871094, + "learning_rate": 7.680207505334058e-06, + "loss": 2.1385, + "step": 14725 + }, + { + "epoch": 1.85, + "grad_norm": 2.5032670497894287, + "learning_rate": 7.679370790277372e-06, + "loss": 0.0833, + "step": 14726 + }, + { + "epoch": 1.85, + "grad_norm": 17.35916519165039, + "learning_rate": 7.678534075220684e-06, + "loss": 1.9738, + "step": 14727 + }, + { + "epoch": 1.85, + "grad_norm": 20.854421615600586, + "learning_rate": 7.677697360163997e-06, + "loss": 1.0489, + "step": 14728 + }, + { + "epoch": 1.85, + "grad_norm": 15.953429222106934, + "learning_rate": 7.67686064510731e-06, + "loss": 1.6066, + "step": 14729 + }, + { + "epoch": 1.85, + "grad_norm": 9.436120986938477, + "learning_rate": 7.676023930050623e-06, + "loss": 1.6007, + "step": 14730 + }, + { + "epoch": 1.85, + "grad_norm": 17.540395736694336, + "learning_rate": 7.675187214993935e-06, + "loss": 1.9825, + "step": 14731 + }, + { + "epoch": 1.85, + "grad_norm": 36.389583587646484, + "learning_rate": 7.674350499937247e-06, + "loss": 1.3227, + "step": 14732 + }, + { + "epoch": 1.85, + "grad_norm": 10.731135368347168, + "learning_rate": 7.673513784880559e-06, + "loss": 0.6461, + "step": 14733 + }, + { + "epoch": 1.85, + "grad_norm": 68.90753936767578, + "learning_rate": 7.672677069823872e-06, + "loss": 1.2232, + "step": 14734 + }, + { + "epoch": 1.85, + "grad_norm": 22.537322998046875, + "learning_rate": 7.671840354767184e-06, + "loss": 2.7363, + "step": 14735 + }, + { + "epoch": 1.85, + "grad_norm": 12.679781913757324, + "learning_rate": 7.671003639710496e-06, + "loss": 0.39, + "step": 14736 + }, + { + "epoch": 1.85, + "grad_norm": 18.542882919311523, + "learning_rate": 7.67016692465381e-06, + "loss": 0.7766, + "step": 14737 + }, + { + "epoch": 1.85, + "grad_norm": 13.160587310791016, + "learning_rate": 7.669330209597122e-06, + "loss": 0.9405, + "step": 14738 + }, + { + "epoch": 1.85, + "grad_norm": 21.700645446777344, + "learning_rate": 7.668493494540434e-06, + "loss": 0.4254, + "step": 14739 + }, + { + "epoch": 1.85, + "grad_norm": 10.037639617919922, + "learning_rate": 7.667656779483748e-06, + "loss": 1.0496, + "step": 14740 + }, + { + "epoch": 1.85, + "grad_norm": 55.460689544677734, + "learning_rate": 7.66682006442706e-06, + "loss": 1.3667, + "step": 14741 + }, + { + "epoch": 1.85, + "grad_norm": 11.1635160446167, + "learning_rate": 7.665983349370373e-06, + "loss": 0.4446, + "step": 14742 + }, + { + "epoch": 1.85, + "grad_norm": 7.231917858123779, + "learning_rate": 7.665146634313685e-06, + "loss": 0.2691, + "step": 14743 + }, + { + "epoch": 1.85, + "grad_norm": 11.715499877929688, + "learning_rate": 7.664309919256999e-06, + "loss": 0.9348, + "step": 14744 + }, + { + "epoch": 1.85, + "grad_norm": 18.822839736938477, + "learning_rate": 7.66347320420031e-06, + "loss": 1.1269, + "step": 14745 + }, + { + "epoch": 1.85, + "grad_norm": 19.825674057006836, + "learning_rate": 7.662636489143623e-06, + "loss": 1.0912, + "step": 14746 + }, + { + "epoch": 1.85, + "grad_norm": 11.040472030639648, + "learning_rate": 7.661799774086935e-06, + "loss": 1.2, + "step": 14747 + }, + { + "epoch": 1.85, + "grad_norm": 23.976301193237305, + "learning_rate": 7.660963059030248e-06, + "loss": 0.807, + "step": 14748 + }, + { + "epoch": 1.85, + "grad_norm": 64.25426483154297, + "learning_rate": 7.66012634397356e-06, + "loss": 2.6122, + "step": 14749 + }, + { + "epoch": 1.85, + "grad_norm": 9.116251945495605, + "learning_rate": 7.659289628916872e-06, + "loss": 0.7184, + "step": 14750 + }, + { + "epoch": 1.85, + "grad_norm": 15.621927261352539, + "learning_rate": 7.658452913860186e-06, + "loss": 1.1001, + "step": 14751 + }, + { + "epoch": 1.85, + "grad_norm": 5.6078104972839355, + "learning_rate": 7.657616198803498e-06, + "loss": 1.6181, + "step": 14752 + }, + { + "epoch": 1.85, + "grad_norm": 5.419527530670166, + "learning_rate": 7.65677948374681e-06, + "loss": 0.1343, + "step": 14753 + }, + { + "epoch": 1.85, + "grad_norm": 18.74810028076172, + "learning_rate": 7.655942768690123e-06, + "loss": 1.0452, + "step": 14754 + }, + { + "epoch": 1.85, + "grad_norm": 11.465399742126465, + "learning_rate": 7.655106053633435e-06, + "loss": 0.6569, + "step": 14755 + }, + { + "epoch": 1.85, + "grad_norm": 15.211071014404297, + "learning_rate": 7.654269338576749e-06, + "loss": 0.6556, + "step": 14756 + }, + { + "epoch": 1.85, + "grad_norm": 13.307765007019043, + "learning_rate": 7.653432623520061e-06, + "loss": 0.6282, + "step": 14757 + }, + { + "epoch": 1.85, + "grad_norm": 6.284524917602539, + "learning_rate": 7.652595908463375e-06, + "loss": 0.4561, + "step": 14758 + }, + { + "epoch": 1.85, + "grad_norm": 8.023004531860352, + "learning_rate": 7.651759193406687e-06, + "loss": 0.5253, + "step": 14759 + }, + { + "epoch": 1.85, + "grad_norm": 20.6444149017334, + "learning_rate": 7.650922478349999e-06, + "loss": 0.6476, + "step": 14760 + }, + { + "epoch": 1.85, + "grad_norm": 6.272350311279297, + "learning_rate": 7.65008576329331e-06, + "loss": 0.4519, + "step": 14761 + }, + { + "epoch": 1.85, + "grad_norm": 19.79265785217285, + "learning_rate": 7.649249048236622e-06, + "loss": 1.0469, + "step": 14762 + }, + { + "epoch": 1.85, + "grad_norm": 12.268784523010254, + "learning_rate": 7.648412333179936e-06, + "loss": 0.7808, + "step": 14763 + }, + { + "epoch": 1.85, + "grad_norm": 11.571566581726074, + "learning_rate": 7.647575618123248e-06, + "loss": 0.6196, + "step": 14764 + }, + { + "epoch": 1.85, + "grad_norm": 42.86050033569336, + "learning_rate": 7.646738903066562e-06, + "loss": 2.3287, + "step": 14765 + }, + { + "epoch": 1.85, + "grad_norm": 36.44502258300781, + "learning_rate": 7.645902188009874e-06, + "loss": 2.2601, + "step": 14766 + }, + { + "epoch": 1.85, + "grad_norm": 20.920602798461914, + "learning_rate": 7.645065472953186e-06, + "loss": 1.1987, + "step": 14767 + }, + { + "epoch": 1.85, + "grad_norm": 15.131333351135254, + "learning_rate": 7.6442287578965e-06, + "loss": 0.5331, + "step": 14768 + }, + { + "epoch": 1.85, + "grad_norm": 60.51508712768555, + "learning_rate": 7.643392042839811e-06, + "loss": 0.4155, + "step": 14769 + }, + { + "epoch": 1.85, + "grad_norm": 61.69184875488281, + "learning_rate": 7.642555327783125e-06, + "loss": 1.8619, + "step": 14770 + }, + { + "epoch": 1.85, + "grad_norm": 14.843707084655762, + "learning_rate": 7.641718612726437e-06, + "loss": 0.518, + "step": 14771 + }, + { + "epoch": 1.85, + "grad_norm": 38.67082214355469, + "learning_rate": 7.640881897669749e-06, + "loss": 1.871, + "step": 14772 + }, + { + "epoch": 1.85, + "grad_norm": 31.577425003051758, + "learning_rate": 7.640045182613062e-06, + "loss": 1.8729, + "step": 14773 + }, + { + "epoch": 1.85, + "grad_norm": 34.70793914794922, + "learning_rate": 7.639208467556374e-06, + "loss": 1.9818, + "step": 14774 + }, + { + "epoch": 1.85, + "grad_norm": 9.932486534118652, + "learning_rate": 7.638371752499686e-06, + "loss": 1.055, + "step": 14775 + }, + { + "epoch": 1.85, + "grad_norm": 4.367156505584717, + "learning_rate": 7.637535037442998e-06, + "loss": 0.2814, + "step": 14776 + }, + { + "epoch": 1.85, + "grad_norm": 26.68075180053711, + "learning_rate": 7.636698322386312e-06, + "loss": 1.1387, + "step": 14777 + }, + { + "epoch": 1.85, + "grad_norm": 11.871581077575684, + "learning_rate": 7.635861607329624e-06, + "loss": 1.7331, + "step": 14778 + }, + { + "epoch": 1.85, + "grad_norm": 10.713318824768066, + "learning_rate": 7.635024892272938e-06, + "loss": 1.1925, + "step": 14779 + }, + { + "epoch": 1.85, + "grad_norm": 17.177345275878906, + "learning_rate": 7.63418817721625e-06, + "loss": 0.8947, + "step": 14780 + }, + { + "epoch": 1.85, + "grad_norm": 20.661283493041992, + "learning_rate": 7.633351462159561e-06, + "loss": 2.3745, + "step": 14781 + }, + { + "epoch": 1.86, + "grad_norm": 17.319683074951172, + "learning_rate": 7.632514747102875e-06, + "loss": 0.6588, + "step": 14782 + }, + { + "epoch": 1.86, + "grad_norm": 29.432462692260742, + "learning_rate": 7.631678032046187e-06, + "loss": 2.2025, + "step": 14783 + }, + { + "epoch": 1.86, + "grad_norm": 28.53501319885254, + "learning_rate": 7.6308413169895e-06, + "loss": 1.0598, + "step": 14784 + }, + { + "epoch": 1.86, + "grad_norm": 25.438735961914062, + "learning_rate": 7.630004601932813e-06, + "loss": 0.5378, + "step": 14785 + }, + { + "epoch": 1.86, + "grad_norm": 53.4843864440918, + "learning_rate": 7.6291678868761255e-06, + "loss": 1.4765, + "step": 14786 + }, + { + "epoch": 1.86, + "grad_norm": 4.252384185791016, + "learning_rate": 7.6283311718194374e-06, + "loss": 0.461, + "step": 14787 + }, + { + "epoch": 1.86, + "grad_norm": 7.987371444702148, + "learning_rate": 7.627494456762749e-06, + "loss": 0.4523, + "step": 14788 + }, + { + "epoch": 1.86, + "grad_norm": 21.252737045288086, + "learning_rate": 7.626657741706063e-06, + "loss": 1.0041, + "step": 14789 + }, + { + "epoch": 1.86, + "grad_norm": 12.794703483581543, + "learning_rate": 7.625821026649375e-06, + "loss": 0.7371, + "step": 14790 + }, + { + "epoch": 1.86, + "grad_norm": 11.99533462524414, + "learning_rate": 7.624984311592688e-06, + "loss": 1.334, + "step": 14791 + }, + { + "epoch": 1.86, + "grad_norm": 8.046095848083496, + "learning_rate": 7.624147596536e-06, + "loss": 0.7383, + "step": 14792 + }, + { + "epoch": 1.86, + "grad_norm": 3.4559438228607178, + "learning_rate": 7.623310881479313e-06, + "loss": 0.1237, + "step": 14793 + }, + { + "epoch": 1.86, + "grad_norm": 12.042539596557617, + "learning_rate": 7.622474166422625e-06, + "loss": 0.5694, + "step": 14794 + }, + { + "epoch": 1.86, + "grad_norm": 94.61004638671875, + "learning_rate": 7.621637451365937e-06, + "loss": 1.7079, + "step": 14795 + }, + { + "epoch": 1.86, + "grad_norm": 9.512558937072754, + "learning_rate": 7.620800736309251e-06, + "loss": 0.2887, + "step": 14796 + }, + { + "epoch": 1.86, + "grad_norm": 73.2804183959961, + "learning_rate": 7.619964021252563e-06, + "loss": 2.4657, + "step": 14797 + }, + { + "epoch": 1.86, + "grad_norm": 16.300159454345703, + "learning_rate": 7.619127306195876e-06, + "loss": 0.6152, + "step": 14798 + }, + { + "epoch": 1.86, + "grad_norm": 20.082332611083984, + "learning_rate": 7.618290591139188e-06, + "loss": 1.8323, + "step": 14799 + }, + { + "epoch": 1.86, + "grad_norm": 9.368802070617676, + "learning_rate": 7.617453876082501e-06, + "loss": 1.3725, + "step": 14800 + }, + { + "epoch": 1.86, + "eval_loss": 0.07997239381074905, + "eval_runtime": 97.6038, + "eval_samples_per_second": 36.29, + "eval_steps_per_second": 36.29, + "step": 14800 + }, + { + "epoch": 1.86, + "grad_norm": 246.64675903320312, + "learning_rate": 7.616617161025813e-06, + "loss": 1.6255, + "step": 14801 + }, + { + "epoch": 1.86, + "grad_norm": 6.828473091125488, + "learning_rate": 7.615780445969125e-06, + "loss": 1.6192, + "step": 14802 + }, + { + "epoch": 1.86, + "grad_norm": 12.192270278930664, + "learning_rate": 7.614943730912439e-06, + "loss": 1.0769, + "step": 14803 + }, + { + "epoch": 1.86, + "grad_norm": 15.74098014831543, + "learning_rate": 7.614107015855751e-06, + "loss": 1.4209, + "step": 14804 + }, + { + "epoch": 1.86, + "grad_norm": 12.044049263000488, + "learning_rate": 7.613270300799064e-06, + "loss": 1.6556, + "step": 14805 + }, + { + "epoch": 1.86, + "grad_norm": 14.061810493469238, + "learning_rate": 7.612433585742376e-06, + "loss": 0.8869, + "step": 14806 + }, + { + "epoch": 1.86, + "grad_norm": 15.723001480102539, + "learning_rate": 7.611596870685689e-06, + "loss": 0.9136, + "step": 14807 + }, + { + "epoch": 1.86, + "grad_norm": 16.836034774780273, + "learning_rate": 7.610760155629001e-06, + "loss": 1.0389, + "step": 14808 + }, + { + "epoch": 1.86, + "grad_norm": 8.526412963867188, + "learning_rate": 7.609923440572313e-06, + "loss": 0.8741, + "step": 14809 + }, + { + "epoch": 1.86, + "grad_norm": 40.982303619384766, + "learning_rate": 7.609086725515627e-06, + "loss": 1.6724, + "step": 14810 + }, + { + "epoch": 1.86, + "grad_norm": 48.365840911865234, + "learning_rate": 7.608250010458939e-06, + "loss": 0.829, + "step": 14811 + }, + { + "epoch": 1.86, + "grad_norm": 13.846466064453125, + "learning_rate": 7.6074132954022516e-06, + "loss": 1.6373, + "step": 14812 + }, + { + "epoch": 1.86, + "grad_norm": 10.954010963439941, + "learning_rate": 7.6065765803455635e-06, + "loss": 1.8855, + "step": 14813 + }, + { + "epoch": 1.86, + "grad_norm": 13.028228759765625, + "learning_rate": 7.605739865288877e-06, + "loss": 1.7035, + "step": 14814 + }, + { + "epoch": 1.86, + "grad_norm": 9.14072036743164, + "learning_rate": 7.604903150232189e-06, + "loss": 0.2946, + "step": 14815 + }, + { + "epoch": 1.86, + "grad_norm": 21.410184860229492, + "learning_rate": 7.604066435175501e-06, + "loss": 0.5266, + "step": 14816 + }, + { + "epoch": 1.86, + "grad_norm": 9.846100807189941, + "learning_rate": 7.603229720118815e-06, + "loss": 0.5662, + "step": 14817 + }, + { + "epoch": 1.86, + "grad_norm": 9.277616500854492, + "learning_rate": 7.602393005062127e-06, + "loss": 0.3525, + "step": 14818 + }, + { + "epoch": 1.86, + "grad_norm": 13.545755386352539, + "learning_rate": 7.6015562900054395e-06, + "loss": 1.0353, + "step": 14819 + }, + { + "epoch": 1.86, + "grad_norm": 11.16395092010498, + "learning_rate": 7.6007195749487514e-06, + "loss": 0.9289, + "step": 14820 + }, + { + "epoch": 1.86, + "grad_norm": 8.918290138244629, + "learning_rate": 7.599882859892065e-06, + "loss": 2.2482, + "step": 14821 + }, + { + "epoch": 1.86, + "grad_norm": 8.14470100402832, + "learning_rate": 7.599046144835377e-06, + "loss": 0.9612, + "step": 14822 + }, + { + "epoch": 1.86, + "grad_norm": 35.48596954345703, + "learning_rate": 7.598209429778689e-06, + "loss": 1.5181, + "step": 14823 + }, + { + "epoch": 1.86, + "grad_norm": 15.159586906433105, + "learning_rate": 7.597372714722003e-06, + "loss": 2.5833, + "step": 14824 + }, + { + "epoch": 1.86, + "grad_norm": 5.813267230987549, + "learning_rate": 7.596535999665315e-06, + "loss": 0.232, + "step": 14825 + }, + { + "epoch": 1.86, + "grad_norm": 18.843538284301758, + "learning_rate": 7.595699284608627e-06, + "loss": 0.7198, + "step": 14826 + }, + { + "epoch": 1.86, + "grad_norm": 13.109396934509277, + "learning_rate": 7.594862569551939e-06, + "loss": 0.8969, + "step": 14827 + }, + { + "epoch": 1.86, + "grad_norm": 7.410354137420654, + "learning_rate": 7.594025854495253e-06, + "loss": 1.4, + "step": 14828 + }, + { + "epoch": 1.86, + "grad_norm": 10.625185012817383, + "learning_rate": 7.593189139438565e-06, + "loss": 0.6854, + "step": 14829 + }, + { + "epoch": 1.86, + "grad_norm": 8.607718467712402, + "learning_rate": 7.592352424381877e-06, + "loss": 0.724, + "step": 14830 + }, + { + "epoch": 1.86, + "grad_norm": 23.821897506713867, + "learning_rate": 7.59151570932519e-06, + "loss": 1.3364, + "step": 14831 + }, + { + "epoch": 1.86, + "grad_norm": 22.2536563873291, + "learning_rate": 7.5906789942685025e-06, + "loss": 1.5433, + "step": 14832 + }, + { + "epoch": 1.86, + "grad_norm": 17.018207550048828, + "learning_rate": 7.589842279211815e-06, + "loss": 1.29, + "step": 14833 + }, + { + "epoch": 1.86, + "grad_norm": 36.413246154785156, + "learning_rate": 7.589005564155127e-06, + "loss": 2.2479, + "step": 14834 + }, + { + "epoch": 1.86, + "grad_norm": 13.229951858520508, + "learning_rate": 7.588168849098441e-06, + "loss": 1.2976, + "step": 14835 + }, + { + "epoch": 1.86, + "grad_norm": 34.4698371887207, + "learning_rate": 7.587332134041753e-06, + "loss": 2.5385, + "step": 14836 + }, + { + "epoch": 1.86, + "grad_norm": 9.363869667053223, + "learning_rate": 7.586495418985065e-06, + "loss": 1.5394, + "step": 14837 + }, + { + "epoch": 1.86, + "grad_norm": 11.833475112915039, + "learning_rate": 7.585658703928378e-06, + "loss": 2.091, + "step": 14838 + }, + { + "epoch": 1.86, + "grad_norm": 12.779874801635742, + "learning_rate": 7.5848219888716904e-06, + "loss": 0.6732, + "step": 14839 + }, + { + "epoch": 1.86, + "grad_norm": 24.601621627807617, + "learning_rate": 7.583985273815003e-06, + "loss": 0.7512, + "step": 14840 + }, + { + "epoch": 1.86, + "grad_norm": 8.069605827331543, + "learning_rate": 7.583148558758315e-06, + "loss": 0.4151, + "step": 14841 + }, + { + "epoch": 1.86, + "grad_norm": 24.539474487304688, + "learning_rate": 7.582311843701629e-06, + "loss": 0.7814, + "step": 14842 + }, + { + "epoch": 1.86, + "grad_norm": 25.05148696899414, + "learning_rate": 7.581475128644941e-06, + "loss": 2.6374, + "step": 14843 + }, + { + "epoch": 1.86, + "grad_norm": 20.171566009521484, + "learning_rate": 7.580638413588253e-06, + "loss": 1.0501, + "step": 14844 + }, + { + "epoch": 1.86, + "grad_norm": 20.1003475189209, + "learning_rate": 7.5798016985315655e-06, + "loss": 1.4832, + "step": 14845 + }, + { + "epoch": 1.86, + "grad_norm": 6.150067329406738, + "learning_rate": 7.578964983474878e-06, + "loss": 0.4083, + "step": 14846 + }, + { + "epoch": 1.86, + "grad_norm": 29.662729263305664, + "learning_rate": 7.578128268418191e-06, + "loss": 3.1539, + "step": 14847 + }, + { + "epoch": 1.86, + "grad_norm": 4.108884811401367, + "learning_rate": 7.577291553361503e-06, + "loss": 0.3868, + "step": 14848 + }, + { + "epoch": 1.86, + "grad_norm": 11.957849502563477, + "learning_rate": 7.576454838304817e-06, + "loss": 0.7825, + "step": 14849 + }, + { + "epoch": 1.86, + "grad_norm": 24.528156280517578, + "learning_rate": 7.575618123248129e-06, + "loss": 2.3116, + "step": 14850 + }, + { + "epoch": 1.86, + "grad_norm": 30.471540451049805, + "learning_rate": 7.574781408191441e-06, + "loss": 1.2928, + "step": 14851 + }, + { + "epoch": 1.86, + "grad_norm": 25.57601547241211, + "learning_rate": 7.5739446931347535e-06, + "loss": 2.7023, + "step": 14852 + }, + { + "epoch": 1.86, + "grad_norm": 9.855649948120117, + "learning_rate": 7.573107978078066e-06, + "loss": 0.5518, + "step": 14853 + }, + { + "epoch": 1.86, + "grad_norm": 22.2115535736084, + "learning_rate": 7.572271263021379e-06, + "loss": 1.4069, + "step": 14854 + }, + { + "epoch": 1.86, + "grad_norm": 18.855382919311523, + "learning_rate": 7.571434547964691e-06, + "loss": 0.9062, + "step": 14855 + }, + { + "epoch": 1.86, + "grad_norm": 12.688556671142578, + "learning_rate": 7.570597832908005e-06, + "loss": 0.486, + "step": 14856 + }, + { + "epoch": 1.86, + "grad_norm": 18.32562255859375, + "learning_rate": 7.569761117851317e-06, + "loss": 0.9634, + "step": 14857 + }, + { + "epoch": 1.86, + "grad_norm": 16.361801147460938, + "learning_rate": 7.568924402794629e-06, + "loss": 1.1458, + "step": 14858 + }, + { + "epoch": 1.86, + "grad_norm": 4.865877628326416, + "learning_rate": 7.568087687737941e-06, + "loss": 0.247, + "step": 14859 + }, + { + "epoch": 1.86, + "grad_norm": 22.873950958251953, + "learning_rate": 7.567250972681253e-06, + "loss": 0.5436, + "step": 14860 + }, + { + "epoch": 1.87, + "grad_norm": 7.281028747558594, + "learning_rate": 7.566414257624567e-06, + "loss": 1.5352, + "step": 14861 + }, + { + "epoch": 1.87, + "grad_norm": 65.8516845703125, + "learning_rate": 7.565577542567879e-06, + "loss": 0.5066, + "step": 14862 + }, + { + "epoch": 1.87, + "grad_norm": 21.525156021118164, + "learning_rate": 7.564740827511193e-06, + "loss": 0.531, + "step": 14863 + }, + { + "epoch": 1.87, + "grad_norm": 14.380889892578125, + "learning_rate": 7.5639041124545045e-06, + "loss": 1.3095, + "step": 14864 + }, + { + "epoch": 1.87, + "grad_norm": 13.246685981750488, + "learning_rate": 7.5630673973978165e-06, + "loss": 0.7462, + "step": 14865 + }, + { + "epoch": 1.87, + "grad_norm": 14.796537399291992, + "learning_rate": 7.562230682341129e-06, + "loss": 0.4701, + "step": 14866 + }, + { + "epoch": 1.87, + "grad_norm": 11.796875953674316, + "learning_rate": 7.561393967284441e-06, + "loss": 1.0644, + "step": 14867 + }, + { + "epoch": 1.87, + "grad_norm": 6.4520769119262695, + "learning_rate": 7.560557252227755e-06, + "loss": 0.2443, + "step": 14868 + }, + { + "epoch": 1.87, + "grad_norm": 19.781051635742188, + "learning_rate": 7.559720537171067e-06, + "loss": 0.8506, + "step": 14869 + }, + { + "epoch": 1.87, + "grad_norm": 65.25953674316406, + "learning_rate": 7.5588838221143805e-06, + "loss": 1.7756, + "step": 14870 + }, + { + "epoch": 1.87, + "grad_norm": 9.863962173461914, + "learning_rate": 7.5580471070576925e-06, + "loss": 0.4296, + "step": 14871 + }, + { + "epoch": 1.87, + "grad_norm": 25.320886611938477, + "learning_rate": 7.557210392001004e-06, + "loss": 1.2524, + "step": 14872 + }, + { + "epoch": 1.87, + "grad_norm": 5.225039958953857, + "learning_rate": 7.556373676944317e-06, + "loss": 0.2312, + "step": 14873 + }, + { + "epoch": 1.87, + "grad_norm": 24.97355842590332, + "learning_rate": 7.555536961887629e-06, + "loss": 2.759, + "step": 14874 + }, + { + "epoch": 1.87, + "grad_norm": 26.371421813964844, + "learning_rate": 7.554700246830943e-06, + "loss": 1.3567, + "step": 14875 + }, + { + "epoch": 1.87, + "grad_norm": 10.538215637207031, + "learning_rate": 7.553863531774255e-06, + "loss": 0.6993, + "step": 14876 + }, + { + "epoch": 1.87, + "grad_norm": 67.82234954833984, + "learning_rate": 7.5530268167175676e-06, + "loss": 0.7644, + "step": 14877 + }, + { + "epoch": 1.87, + "grad_norm": 37.28622817993164, + "learning_rate": 7.55219010166088e-06, + "loss": 1.1606, + "step": 14878 + }, + { + "epoch": 1.87, + "grad_norm": 19.709482192993164, + "learning_rate": 7.551353386604192e-06, + "loss": 1.4561, + "step": 14879 + }, + { + "epoch": 1.87, + "grad_norm": 11.090790748596191, + "learning_rate": 7.550516671547505e-06, + "loss": 1.7676, + "step": 14880 + }, + { + "epoch": 1.87, + "grad_norm": 4.8185553550720215, + "learning_rate": 7.549679956490817e-06, + "loss": 0.1238, + "step": 14881 + }, + { + "epoch": 1.87, + "grad_norm": 7.536447048187256, + "learning_rate": 7.548843241434131e-06, + "loss": 0.1998, + "step": 14882 + }, + { + "epoch": 1.87, + "grad_norm": 9.016610145568848, + "learning_rate": 7.548006526377443e-06, + "loss": 1.1862, + "step": 14883 + }, + { + "epoch": 1.87, + "grad_norm": 7.89788293838501, + "learning_rate": 7.5471698113207555e-06, + "loss": 0.4358, + "step": 14884 + }, + { + "epoch": 1.87, + "grad_norm": 65.45915985107422, + "learning_rate": 7.546333096264068e-06, + "loss": 2.5424, + "step": 14885 + }, + { + "epoch": 1.87, + "grad_norm": 52.495174407958984, + "learning_rate": 7.54549638120738e-06, + "loss": 2.0097, + "step": 14886 + }, + { + "epoch": 1.87, + "grad_norm": 20.664783477783203, + "learning_rate": 7.544659666150693e-06, + "loss": 2.3188, + "step": 14887 + }, + { + "epoch": 1.87, + "grad_norm": 8.102899551391602, + "learning_rate": 7.543822951094005e-06, + "loss": 0.3336, + "step": 14888 + }, + { + "epoch": 1.87, + "grad_norm": 7.261075496673584, + "learning_rate": 7.542986236037319e-06, + "loss": 0.4018, + "step": 14889 + }, + { + "epoch": 1.87, + "grad_norm": 10.724687576293945, + "learning_rate": 7.542149520980631e-06, + "loss": 1.7554, + "step": 14890 + }, + { + "epoch": 1.87, + "grad_norm": 17.16265106201172, + "learning_rate": 7.541312805923943e-06, + "loss": 1.215, + "step": 14891 + }, + { + "epoch": 1.87, + "grad_norm": 6.207948684692383, + "learning_rate": 7.540476090867256e-06, + "loss": 0.3008, + "step": 14892 + }, + { + "epoch": 1.87, + "grad_norm": 35.358238220214844, + "learning_rate": 7.539639375810568e-06, + "loss": 1.6874, + "step": 14893 + }, + { + "epoch": 1.87, + "grad_norm": 70.42288208007812, + "learning_rate": 7.538802660753881e-06, + "loss": 0.8292, + "step": 14894 + }, + { + "epoch": 1.87, + "grad_norm": 29.4013671875, + "learning_rate": 7.537965945697193e-06, + "loss": 0.617, + "step": 14895 + }, + { + "epoch": 1.87, + "grad_norm": 11.098967552185059, + "learning_rate": 7.5371292306405066e-06, + "loss": 1.0755, + "step": 14896 + }, + { + "epoch": 1.87, + "grad_norm": 31.135793685913086, + "learning_rate": 7.5362925155838185e-06, + "loss": 1.7696, + "step": 14897 + }, + { + "epoch": 1.87, + "grad_norm": 17.028921127319336, + "learning_rate": 7.5354558005271305e-06, + "loss": 1.865, + "step": 14898 + }, + { + "epoch": 1.87, + "grad_norm": 23.53152847290039, + "learning_rate": 7.534619085470444e-06, + "loss": 1.4702, + "step": 14899 + }, + { + "epoch": 1.87, + "grad_norm": 6.146441459655762, + "learning_rate": 7.533782370413756e-06, + "loss": 1.9333, + "step": 14900 + }, + { + "epoch": 1.87, + "grad_norm": 10.910538673400879, + "learning_rate": 7.532945655357069e-06, + "loss": 1.1973, + "step": 14901 + }, + { + "epoch": 1.87, + "grad_norm": 45.51909637451172, + "learning_rate": 7.532108940300381e-06, + "loss": 2.0465, + "step": 14902 + }, + { + "epoch": 1.87, + "grad_norm": 14.263692855834961, + "learning_rate": 7.5312722252436945e-06, + "loss": 0.4382, + "step": 14903 + }, + { + "epoch": 1.87, + "grad_norm": 15.351764678955078, + "learning_rate": 7.5304355101870064e-06, + "loss": 2.218, + "step": 14904 + }, + { + "epoch": 1.87, + "grad_norm": 6.3323845863342285, + "learning_rate": 7.529598795130318e-06, + "loss": 0.2431, + "step": 14905 + }, + { + "epoch": 1.87, + "grad_norm": 17.696409225463867, + "learning_rate": 7.528762080073632e-06, + "loss": 0.4567, + "step": 14906 + }, + { + "epoch": 1.87, + "grad_norm": 30.03977394104004, + "learning_rate": 7.527925365016944e-06, + "loss": 1.8367, + "step": 14907 + }, + { + "epoch": 1.87, + "grad_norm": 13.985361099243164, + "learning_rate": 7.527088649960257e-06, + "loss": 2.163, + "step": 14908 + }, + { + "epoch": 1.87, + "grad_norm": 20.119407653808594, + "learning_rate": 7.526251934903569e-06, + "loss": 1.9892, + "step": 14909 + }, + { + "epoch": 1.87, + "grad_norm": 14.874587059020996, + "learning_rate": 7.525415219846882e-06, + "loss": 0.9434, + "step": 14910 + }, + { + "epoch": 1.87, + "grad_norm": 19.314739227294922, + "learning_rate": 7.524578504790194e-06, + "loss": 1.0257, + "step": 14911 + }, + { + "epoch": 1.87, + "grad_norm": 16.053003311157227, + "learning_rate": 7.523741789733506e-06, + "loss": 1.4353, + "step": 14912 + }, + { + "epoch": 1.87, + "grad_norm": 7.82081937789917, + "learning_rate": 7.522905074676819e-06, + "loss": 0.7296, + "step": 14913 + }, + { + "epoch": 1.87, + "grad_norm": 5.054506301879883, + "learning_rate": 7.522068359620132e-06, + "loss": 1.1525, + "step": 14914 + }, + { + "epoch": 1.87, + "grad_norm": 16.969287872314453, + "learning_rate": 7.521231644563445e-06, + "loss": 3.0622, + "step": 14915 + }, + { + "epoch": 1.87, + "grad_norm": 15.36079216003418, + "learning_rate": 7.520394929506757e-06, + "loss": 0.9112, + "step": 14916 + }, + { + "epoch": 1.87, + "grad_norm": 38.61628341674805, + "learning_rate": 7.51955821445007e-06, + "loss": 0.5906, + "step": 14917 + }, + { + "epoch": 1.87, + "grad_norm": 2.0408730506896973, + "learning_rate": 7.518721499393382e-06, + "loss": 0.3868, + "step": 14918 + }, + { + "epoch": 1.87, + "grad_norm": 5.081964015960693, + "learning_rate": 7.517884784336694e-06, + "loss": 0.1102, + "step": 14919 + }, + { + "epoch": 1.87, + "grad_norm": 5.888787269592285, + "learning_rate": 7.517048069280007e-06, + "loss": 0.3224, + "step": 14920 + }, + { + "epoch": 1.87, + "grad_norm": 7.768947124481201, + "learning_rate": 7.51621135422332e-06, + "loss": 0.2688, + "step": 14921 + }, + { + "epoch": 1.87, + "grad_norm": 35.43925476074219, + "learning_rate": 7.515374639166633e-06, + "loss": 0.97, + "step": 14922 + }, + { + "epoch": 1.87, + "grad_norm": 9.788805961608887, + "learning_rate": 7.514537924109945e-06, + "loss": 0.705, + "step": 14923 + }, + { + "epoch": 1.87, + "grad_norm": 20.753562927246094, + "learning_rate": 7.513701209053258e-06, + "loss": 3.1368, + "step": 14924 + }, + { + "epoch": 1.87, + "grad_norm": 63.22145080566406, + "learning_rate": 7.51286449399657e-06, + "loss": 1.0398, + "step": 14925 + }, + { + "epoch": 1.87, + "grad_norm": 19.292924880981445, + "learning_rate": 7.512027778939882e-06, + "loss": 2.1925, + "step": 14926 + }, + { + "epoch": 1.87, + "grad_norm": 7.6630377769470215, + "learning_rate": 7.511191063883195e-06, + "loss": 0.9697, + "step": 14927 + }, + { + "epoch": 1.87, + "grad_norm": 19.1313533782959, + "learning_rate": 7.510354348826508e-06, + "loss": 0.4675, + "step": 14928 + }, + { + "epoch": 1.87, + "grad_norm": 18.08177947998047, + "learning_rate": 7.5095176337698206e-06, + "loss": 1.5367, + "step": 14929 + }, + { + "epoch": 1.87, + "grad_norm": 19.45388412475586, + "learning_rate": 7.5086809187131325e-06, + "loss": 1.5596, + "step": 14930 + }, + { + "epoch": 1.87, + "grad_norm": 18.87152099609375, + "learning_rate": 7.507844203656446e-06, + "loss": 2.3846, + "step": 14931 + }, + { + "epoch": 1.87, + "grad_norm": 30.16594696044922, + "learning_rate": 7.507007488599758e-06, + "loss": 2.878, + "step": 14932 + }, + { + "epoch": 1.87, + "grad_norm": 11.567892074584961, + "learning_rate": 7.50617077354307e-06, + "loss": 0.8096, + "step": 14933 + }, + { + "epoch": 1.87, + "grad_norm": 23.418529510498047, + "learning_rate": 7.505334058486383e-06, + "loss": 1.8027, + "step": 14934 + }, + { + "epoch": 1.87, + "grad_norm": 24.442947387695312, + "learning_rate": 7.504497343429696e-06, + "loss": 2.7443, + "step": 14935 + }, + { + "epoch": 1.87, + "grad_norm": 63.38848876953125, + "learning_rate": 7.5036606283730085e-06, + "loss": 2.4058, + "step": 14936 + }, + { + "epoch": 1.87, + "grad_norm": 19.26790428161621, + "learning_rate": 7.5028239133163204e-06, + "loss": 1.8722, + "step": 14937 + }, + { + "epoch": 1.87, + "grad_norm": 15.698874473571777, + "learning_rate": 7.501987198259634e-06, + "loss": 0.7961, + "step": 14938 + }, + { + "epoch": 1.87, + "grad_norm": 4.914449691772461, + "learning_rate": 7.501150483202946e-06, + "loss": 0.325, + "step": 14939 + }, + { + "epoch": 1.87, + "grad_norm": 6.667304992675781, + "learning_rate": 7.500313768146258e-06, + "loss": 0.247, + "step": 14940 + }, + { + "epoch": 1.88, + "grad_norm": 34.39765930175781, + "learning_rate": 7.499477053089571e-06, + "loss": 1.1508, + "step": 14941 + }, + { + "epoch": 1.88, + "grad_norm": 33.020774841308594, + "learning_rate": 7.498640338032883e-06, + "loss": 1.1243, + "step": 14942 + }, + { + "epoch": 1.88, + "grad_norm": 7.784989356994629, + "learning_rate": 7.497803622976196e-06, + "loss": 0.6662, + "step": 14943 + }, + { + "epoch": 1.88, + "grad_norm": 24.58091926574707, + "learning_rate": 7.496966907919508e-06, + "loss": 1.3466, + "step": 14944 + }, + { + "epoch": 1.88, + "grad_norm": 25.048736572265625, + "learning_rate": 7.496130192862822e-06, + "loss": 0.866, + "step": 14945 + }, + { + "epoch": 1.88, + "grad_norm": 4.25439977645874, + "learning_rate": 7.495293477806134e-06, + "loss": 0.5579, + "step": 14946 + }, + { + "epoch": 1.88, + "grad_norm": 14.87307071685791, + "learning_rate": 7.494456762749446e-06, + "loss": 2.0097, + "step": 14947 + }, + { + "epoch": 1.88, + "grad_norm": 12.519543647766113, + "learning_rate": 7.493620047692759e-06, + "loss": 0.6585, + "step": 14948 + }, + { + "epoch": 1.88, + "grad_norm": 29.968774795532227, + "learning_rate": 7.492783332636071e-06, + "loss": 1.0066, + "step": 14949 + }, + { + "epoch": 1.88, + "grad_norm": 26.526437759399414, + "learning_rate": 7.491946617579384e-06, + "loss": 0.8447, + "step": 14950 + }, + { + "epoch": 1.88, + "grad_norm": 7.1481852531433105, + "learning_rate": 7.491109902522696e-06, + "loss": 0.5562, + "step": 14951 + }, + { + "epoch": 1.88, + "grad_norm": 43.571720123291016, + "learning_rate": 7.49027318746601e-06, + "loss": 2.229, + "step": 14952 + }, + { + "epoch": 1.88, + "grad_norm": 10.739995956420898, + "learning_rate": 7.489436472409322e-06, + "loss": 1.0044, + "step": 14953 + }, + { + "epoch": 1.88, + "grad_norm": 34.558101654052734, + "learning_rate": 7.488599757352634e-06, + "loss": 0.8945, + "step": 14954 + }, + { + "epoch": 1.88, + "grad_norm": 6.550328254699707, + "learning_rate": 7.487763042295947e-06, + "loss": 0.2706, + "step": 14955 + }, + { + "epoch": 1.88, + "grad_norm": 15.401753425598145, + "learning_rate": 7.486926327239259e-06, + "loss": 0.3218, + "step": 14956 + }, + { + "epoch": 1.88, + "grad_norm": 10.214600563049316, + "learning_rate": 7.486089612182572e-06, + "loss": 0.6701, + "step": 14957 + }, + { + "epoch": 1.88, + "grad_norm": 7.7430524826049805, + "learning_rate": 7.485252897125884e-06, + "loss": 0.5822, + "step": 14958 + }, + { + "epoch": 1.88, + "grad_norm": 27.569425582885742, + "learning_rate": 7.484416182069197e-06, + "loss": 1.1034, + "step": 14959 + }, + { + "epoch": 1.88, + "grad_norm": 14.088428497314453, + "learning_rate": 7.48357946701251e-06, + "loss": 1.3161, + "step": 14960 + }, + { + "epoch": 1.88, + "grad_norm": 5.511892795562744, + "learning_rate": 7.482742751955822e-06, + "loss": 0.2117, + "step": 14961 + }, + { + "epoch": 1.88, + "grad_norm": 11.08498764038086, + "learning_rate": 7.4819060368991346e-06, + "loss": 0.9814, + "step": 14962 + }, + { + "epoch": 1.88, + "grad_norm": 10.185216903686523, + "learning_rate": 7.4810693218424465e-06, + "loss": 0.4156, + "step": 14963 + }, + { + "epoch": 1.88, + "grad_norm": 13.83070182800293, + "learning_rate": 7.48023260678576e-06, + "loss": 0.9413, + "step": 14964 + }, + { + "epoch": 1.88, + "grad_norm": 5.967977523803711, + "learning_rate": 7.479395891729072e-06, + "loss": 0.3474, + "step": 14965 + }, + { + "epoch": 1.88, + "grad_norm": 64.0623550415039, + "learning_rate": 7.478559176672385e-06, + "loss": 2.2951, + "step": 14966 + }, + { + "epoch": 1.88, + "grad_norm": 6.03565788269043, + "learning_rate": 7.477722461615698e-06, + "loss": 0.3642, + "step": 14967 + }, + { + "epoch": 1.88, + "grad_norm": 2.3975555896759033, + "learning_rate": 7.47688574655901e-06, + "loss": 0.0542, + "step": 14968 + }, + { + "epoch": 1.88, + "grad_norm": 15.115584373474121, + "learning_rate": 7.4760490315023225e-06, + "loss": 1.0409, + "step": 14969 + }, + { + "epoch": 1.88, + "grad_norm": 43.474605560302734, + "learning_rate": 7.4752123164456344e-06, + "loss": 1.5636, + "step": 14970 + }, + { + "epoch": 1.88, + "grad_norm": 10.10328197479248, + "learning_rate": 7.474375601388948e-06, + "loss": 0.2745, + "step": 14971 + }, + { + "epoch": 1.88, + "grad_norm": 16.16593360900879, + "learning_rate": 7.47353888633226e-06, + "loss": 0.4139, + "step": 14972 + }, + { + "epoch": 1.88, + "grad_norm": 8.691909790039062, + "learning_rate": 7.472702171275573e-06, + "loss": 1.9511, + "step": 14973 + }, + { + "epoch": 1.88, + "grad_norm": 38.42649841308594, + "learning_rate": 7.471865456218886e-06, + "loss": 2.803, + "step": 14974 + }, + { + "epoch": 1.88, + "grad_norm": 13.04145622253418, + "learning_rate": 7.471028741162198e-06, + "loss": 0.2814, + "step": 14975 + }, + { + "epoch": 1.88, + "grad_norm": 12.061103820800781, + "learning_rate": 7.47019202610551e-06, + "loss": 1.7101, + "step": 14976 + }, + { + "epoch": 1.88, + "grad_norm": 11.907960891723633, + "learning_rate": 7.469355311048822e-06, + "loss": 0.8512, + "step": 14977 + }, + { + "epoch": 1.88, + "grad_norm": 26.97811508178711, + "learning_rate": 7.468518595992136e-06, + "loss": 1.2657, + "step": 14978 + }, + { + "epoch": 1.88, + "grad_norm": 11.2202730178833, + "learning_rate": 7.467681880935448e-06, + "loss": 1.148, + "step": 14979 + }, + { + "epoch": 1.88, + "grad_norm": 7.339235305786133, + "learning_rate": 7.466845165878761e-06, + "loss": 0.4095, + "step": 14980 + }, + { + "epoch": 1.88, + "grad_norm": 23.894771575927734, + "learning_rate": 7.4660084508220736e-06, + "loss": 0.4987, + "step": 14981 + }, + { + "epoch": 1.88, + "grad_norm": 0.5060096383094788, + "learning_rate": 7.4651717357653855e-06, + "loss": 0.014, + "step": 14982 + }, + { + "epoch": 1.88, + "grad_norm": 0.960077166557312, + "learning_rate": 7.464335020708698e-06, + "loss": 0.0285, + "step": 14983 + }, + { + "epoch": 1.88, + "grad_norm": 114.88871765136719, + "learning_rate": 7.46349830565201e-06, + "loss": 4.6805, + "step": 14984 + }, + { + "epoch": 1.88, + "grad_norm": 11.071863174438477, + "learning_rate": 7.462661590595324e-06, + "loss": 0.3641, + "step": 14985 + }, + { + "epoch": 1.88, + "grad_norm": 12.264084815979004, + "learning_rate": 7.461824875538636e-06, + "loss": 0.6126, + "step": 14986 + }, + { + "epoch": 1.88, + "grad_norm": 11.309356689453125, + "learning_rate": 7.460988160481949e-06, + "loss": 1.2135, + "step": 14987 + }, + { + "epoch": 1.88, + "grad_norm": 22.1566219329834, + "learning_rate": 7.460151445425261e-06, + "loss": 0.7779, + "step": 14988 + }, + { + "epoch": 1.88, + "grad_norm": 156.8177490234375, + "learning_rate": 7.4593147303685734e-06, + "loss": 2.935, + "step": 14989 + }, + { + "epoch": 1.88, + "grad_norm": 22.175682067871094, + "learning_rate": 7.458478015311886e-06, + "loss": 0.5215, + "step": 14990 + }, + { + "epoch": 1.88, + "grad_norm": 28.10517120361328, + "learning_rate": 7.457641300255198e-06, + "loss": 3.2197, + "step": 14991 + }, + { + "epoch": 1.88, + "grad_norm": 14.341082572937012, + "learning_rate": 7.456804585198512e-06, + "loss": 0.9033, + "step": 14992 + }, + { + "epoch": 1.88, + "grad_norm": 40.654903411865234, + "learning_rate": 7.455967870141824e-06, + "loss": 1.9905, + "step": 14993 + }, + { + "epoch": 1.88, + "grad_norm": 6.580481052398682, + "learning_rate": 7.455131155085137e-06, + "loss": 0.1788, + "step": 14994 + }, + { + "epoch": 1.88, + "grad_norm": 26.085426330566406, + "learning_rate": 7.4542944400284485e-06, + "loss": 2.4308, + "step": 14995 + }, + { + "epoch": 1.88, + "grad_norm": 9.547032356262207, + "learning_rate": 7.453457724971761e-06, + "loss": 1.7124, + "step": 14996 + }, + { + "epoch": 1.88, + "grad_norm": 34.72230529785156, + "learning_rate": 7.452621009915074e-06, + "loss": 1.6396, + "step": 14997 + }, + { + "epoch": 1.88, + "grad_norm": 7.088932037353516, + "learning_rate": 7.451784294858386e-06, + "loss": 0.739, + "step": 14998 + }, + { + "epoch": 1.88, + "grad_norm": 18.540529251098633, + "learning_rate": 7.4509475798017e-06, + "loss": 3.2027, + "step": 14999 + }, + { + "epoch": 1.88, + "grad_norm": 12.922123908996582, + "learning_rate": 7.450110864745012e-06, + "loss": 0.4767, + "step": 15000 + }, + { + "epoch": 1.88, + "grad_norm": 30.11306381225586, + "learning_rate": 7.4492741496883245e-06, + "loss": 1.2894, + "step": 15001 + }, + { + "epoch": 1.88, + "grad_norm": 35.27677536010742, + "learning_rate": 7.4484374346316365e-06, + "loss": 1.9374, + "step": 15002 + }, + { + "epoch": 1.88, + "grad_norm": 14.58852767944336, + "learning_rate": 7.447600719574949e-06, + "loss": 2.1957, + "step": 15003 + }, + { + "epoch": 1.88, + "grad_norm": 6.9970383644104, + "learning_rate": 7.446764004518262e-06, + "loss": 0.9583, + "step": 15004 + }, + { + "epoch": 1.88, + "grad_norm": 39.45263671875, + "learning_rate": 7.445927289461574e-06, + "loss": 0.8669, + "step": 15005 + }, + { + "epoch": 1.88, + "grad_norm": 18.60435676574707, + "learning_rate": 7.445090574404888e-06, + "loss": 1.7102, + "step": 15006 + }, + { + "epoch": 1.88, + "grad_norm": 16.23671531677246, + "learning_rate": 7.4442538593482e-06, + "loss": 0.9465, + "step": 15007 + }, + { + "epoch": 1.88, + "grad_norm": 53.88591384887695, + "learning_rate": 7.4434171442915124e-06, + "loss": 1.9764, + "step": 15008 + }, + { + "epoch": 1.88, + "grad_norm": 26.555509567260742, + "learning_rate": 7.442580429234824e-06, + "loss": 1.1828, + "step": 15009 + }, + { + "epoch": 1.88, + "grad_norm": 0.8898343443870544, + "learning_rate": 7.441743714178137e-06, + "loss": 0.0484, + "step": 15010 + }, + { + "epoch": 1.88, + "grad_norm": 10.331498146057129, + "learning_rate": 7.44090699912145e-06, + "loss": 0.8278, + "step": 15011 + }, + { + "epoch": 1.88, + "grad_norm": 16.52419662475586, + "learning_rate": 7.440070284064762e-06, + "loss": 1.8094, + "step": 15012 + }, + { + "epoch": 1.88, + "grad_norm": 10.045432090759277, + "learning_rate": 7.439233569008076e-06, + "loss": 0.6667, + "step": 15013 + }, + { + "epoch": 1.88, + "grad_norm": 13.238429069519043, + "learning_rate": 7.4383968539513875e-06, + "loss": 1.7813, + "step": 15014 + }, + { + "epoch": 1.88, + "grad_norm": 9.393712997436523, + "learning_rate": 7.4375601388947e-06, + "loss": 0.6944, + "step": 15015 + }, + { + "epoch": 1.88, + "grad_norm": 18.744245529174805, + "learning_rate": 7.436723423838012e-06, + "loss": 1.1589, + "step": 15016 + }, + { + "epoch": 1.88, + "grad_norm": 11.48732852935791, + "learning_rate": 7.435886708781325e-06, + "loss": 1.0309, + "step": 15017 + }, + { + "epoch": 1.88, + "grad_norm": 58.54405975341797, + "learning_rate": 7.435049993724638e-06, + "loss": 0.9708, + "step": 15018 + }, + { + "epoch": 1.88, + "grad_norm": 8.87790298461914, + "learning_rate": 7.43421327866795e-06, + "loss": 2.8177, + "step": 15019 + }, + { + "epoch": 1.88, + "grad_norm": 10.245800971984863, + "learning_rate": 7.4333765636112635e-06, + "loss": 0.6433, + "step": 15020 + }, + { + "epoch": 1.89, + "grad_norm": 42.300506591796875, + "learning_rate": 7.4325398485545755e-06, + "loss": 3.357, + "step": 15021 + }, + { + "epoch": 1.89, + "grad_norm": 13.989384651184082, + "learning_rate": 7.431703133497888e-06, + "loss": 1.7183, + "step": 15022 + }, + { + "epoch": 1.89, + "grad_norm": 9.733213424682617, + "learning_rate": 7.4308664184412e-06, + "loss": 0.5374, + "step": 15023 + }, + { + "epoch": 1.89, + "grad_norm": 13.210402488708496, + "learning_rate": 7.430029703384512e-06, + "loss": 0.8232, + "step": 15024 + }, + { + "epoch": 1.89, + "grad_norm": 31.181640625, + "learning_rate": 7.429192988327826e-06, + "loss": 1.2312, + "step": 15025 + }, + { + "epoch": 1.89, + "grad_norm": 74.49955749511719, + "learning_rate": 7.428356273271138e-06, + "loss": 1.7963, + "step": 15026 + }, + { + "epoch": 1.89, + "grad_norm": 56.21243667602539, + "learning_rate": 7.427519558214451e-06, + "loss": 1.9133, + "step": 15027 + }, + { + "epoch": 1.89, + "grad_norm": 12.333606719970703, + "learning_rate": 7.426682843157763e-06, + "loss": 1.6646, + "step": 15028 + }, + { + "epoch": 1.89, + "grad_norm": 24.455326080322266, + "learning_rate": 7.425846128101076e-06, + "loss": 1.2796, + "step": 15029 + }, + { + "epoch": 1.89, + "grad_norm": 5.309059143066406, + "learning_rate": 7.425009413044388e-06, + "loss": 0.1407, + "step": 15030 + }, + { + "epoch": 1.89, + "grad_norm": 18.7886962890625, + "learning_rate": 7.4241726979877e-06, + "loss": 2.037, + "step": 15031 + }, + { + "epoch": 1.89, + "grad_norm": 4.938962459564209, + "learning_rate": 7.423335982931014e-06, + "loss": 0.4311, + "step": 15032 + }, + { + "epoch": 1.89, + "grad_norm": 7.151268005371094, + "learning_rate": 7.422499267874326e-06, + "loss": 0.6303, + "step": 15033 + }, + { + "epoch": 1.89, + "grad_norm": 93.7016830444336, + "learning_rate": 7.421662552817639e-06, + "loss": 1.1028, + "step": 15034 + }, + { + "epoch": 1.89, + "grad_norm": 18.35399627685547, + "learning_rate": 7.420825837760951e-06, + "loss": 1.5585, + "step": 15035 + }, + { + "epoch": 1.89, + "grad_norm": 8.213486671447754, + "learning_rate": 7.419989122704264e-06, + "loss": 0.8414, + "step": 15036 + }, + { + "epoch": 1.89, + "grad_norm": 23.497493743896484, + "learning_rate": 7.419152407647576e-06, + "loss": 0.9, + "step": 15037 + }, + { + "epoch": 1.89, + "grad_norm": 25.89967918395996, + "learning_rate": 7.418315692590888e-06, + "loss": 1.3119, + "step": 15038 + }, + { + "epoch": 1.89, + "grad_norm": 18.524890899658203, + "learning_rate": 7.417478977534202e-06, + "loss": 0.8211, + "step": 15039 + }, + { + "epoch": 1.89, + "grad_norm": 30.518224716186523, + "learning_rate": 7.416642262477514e-06, + "loss": 1.0983, + "step": 15040 + }, + { + "epoch": 1.89, + "grad_norm": 7.779489994049072, + "learning_rate": 7.415805547420826e-06, + "loss": 1.4563, + "step": 15041 + }, + { + "epoch": 1.89, + "grad_norm": 5.428210735321045, + "learning_rate": 7.414968832364139e-06, + "loss": 0.208, + "step": 15042 + }, + { + "epoch": 1.89, + "grad_norm": 8.045461654663086, + "learning_rate": 7.414132117307452e-06, + "loss": 0.2831, + "step": 15043 + }, + { + "epoch": 1.89, + "grad_norm": 18.02309226989746, + "learning_rate": 7.413295402250764e-06, + "loss": 0.7516, + "step": 15044 + }, + { + "epoch": 1.89, + "grad_norm": 94.65620422363281, + "learning_rate": 7.412458687194076e-06, + "loss": 1.8608, + "step": 15045 + }, + { + "epoch": 1.89, + "grad_norm": 25.21953582763672, + "learning_rate": 7.4116219721373896e-06, + "loss": 1.3827, + "step": 15046 + }, + { + "epoch": 1.89, + "grad_norm": 11.448881149291992, + "learning_rate": 7.4107852570807015e-06, + "loss": 2.3396, + "step": 15047 + }, + { + "epoch": 1.89, + "grad_norm": 8.742633819580078, + "learning_rate": 7.409948542024014e-06, + "loss": 2.5715, + "step": 15048 + }, + { + "epoch": 1.89, + "grad_norm": 31.669166564941406, + "learning_rate": 7.409111826967327e-06, + "loss": 1.3321, + "step": 15049 + }, + { + "epoch": 1.89, + "grad_norm": 3.381666660308838, + "learning_rate": 7.40827511191064e-06, + "loss": 0.0671, + "step": 15050 + }, + { + "epoch": 1.89, + "grad_norm": 17.08733558654785, + "learning_rate": 7.407438396853952e-06, + "loss": 0.93, + "step": 15051 + }, + { + "epoch": 1.89, + "grad_norm": 24.145172119140625, + "learning_rate": 7.406601681797264e-06, + "loss": 1.7006, + "step": 15052 + }, + { + "epoch": 1.89, + "grad_norm": 15.978361129760742, + "learning_rate": 7.4057649667405775e-06, + "loss": 1.4769, + "step": 15053 + }, + { + "epoch": 1.89, + "grad_norm": 11.425533294677734, + "learning_rate": 7.4049282516838894e-06, + "loss": 0.4073, + "step": 15054 + }, + { + "epoch": 1.89, + "grad_norm": 19.693145751953125, + "learning_rate": 7.404091536627202e-06, + "loss": 1.3016, + "step": 15055 + }, + { + "epoch": 1.89, + "grad_norm": 4.42262601852417, + "learning_rate": 7.403254821570515e-06, + "loss": 0.1591, + "step": 15056 + }, + { + "epoch": 1.89, + "grad_norm": 30.48455238342285, + "learning_rate": 7.402418106513828e-06, + "loss": 0.6781, + "step": 15057 + }, + { + "epoch": 1.89, + "grad_norm": 6.9818339347839355, + "learning_rate": 7.40158139145714e-06, + "loss": 0.719, + "step": 15058 + }, + { + "epoch": 1.89, + "grad_norm": 21.614707946777344, + "learning_rate": 7.400744676400452e-06, + "loss": 0.8311, + "step": 15059 + }, + { + "epoch": 1.89, + "grad_norm": 19.429691314697266, + "learning_rate": 7.399907961343765e-06, + "loss": 2.7223, + "step": 15060 + }, + { + "epoch": 1.89, + "grad_norm": 6.083532810211182, + "learning_rate": 7.399071246287077e-06, + "loss": 0.5901, + "step": 15061 + }, + { + "epoch": 1.89, + "grad_norm": 25.171422958374023, + "learning_rate": 7.39823453123039e-06, + "loss": 0.969, + "step": 15062 + }, + { + "epoch": 1.89, + "grad_norm": 18.150936126708984, + "learning_rate": 7.397397816173703e-06, + "loss": 1.1038, + "step": 15063 + }, + { + "epoch": 1.89, + "grad_norm": 10.097476959228516, + "learning_rate": 7.396561101117016e-06, + "loss": 0.5034, + "step": 15064 + }, + { + "epoch": 1.89, + "grad_norm": 7.264492511749268, + "learning_rate": 7.395724386060328e-06, + "loss": 0.1194, + "step": 15065 + }, + { + "epoch": 1.89, + "grad_norm": 7.850740909576416, + "learning_rate": 7.39488767100364e-06, + "loss": 1.5162, + "step": 15066 + }, + { + "epoch": 1.89, + "grad_norm": 12.903263092041016, + "learning_rate": 7.394050955946953e-06, + "loss": 0.3627, + "step": 15067 + }, + { + "epoch": 1.89, + "grad_norm": 176.21908569335938, + "learning_rate": 7.393214240890265e-06, + "loss": 0.3115, + "step": 15068 + }, + { + "epoch": 1.89, + "grad_norm": 32.34139633178711, + "learning_rate": 7.392377525833578e-06, + "loss": 0.6036, + "step": 15069 + }, + { + "epoch": 1.89, + "grad_norm": 6.2513604164123535, + "learning_rate": 7.39154081077689e-06, + "loss": 0.7679, + "step": 15070 + }, + { + "epoch": 1.89, + "grad_norm": 21.014720916748047, + "learning_rate": 7.390704095720204e-06, + "loss": 1.3911, + "step": 15071 + }, + { + "epoch": 1.89, + "grad_norm": 6.944918155670166, + "learning_rate": 7.389867380663516e-06, + "loss": 0.6498, + "step": 15072 + }, + { + "epoch": 1.89, + "grad_norm": 25.82847785949707, + "learning_rate": 7.389030665606828e-06, + "loss": 1.5971, + "step": 15073 + }, + { + "epoch": 1.89, + "grad_norm": 15.153432846069336, + "learning_rate": 7.388193950550141e-06, + "loss": 0.7888, + "step": 15074 + }, + { + "epoch": 1.89, + "grad_norm": 20.675094604492188, + "learning_rate": 7.387357235493453e-06, + "loss": 0.6522, + "step": 15075 + }, + { + "epoch": 1.89, + "grad_norm": 7.721600532531738, + "learning_rate": 7.386520520436766e-06, + "loss": 0.4128, + "step": 15076 + }, + { + "epoch": 1.89, + "grad_norm": 18.053680419921875, + "learning_rate": 7.385683805380078e-06, + "loss": 1.6978, + "step": 15077 + }, + { + "epoch": 1.89, + "grad_norm": 9.639769554138184, + "learning_rate": 7.384847090323391e-06, + "loss": 0.7635, + "step": 15078 + }, + { + "epoch": 1.89, + "grad_norm": 12.367140769958496, + "learning_rate": 7.3840103752667036e-06, + "loss": 0.7246, + "step": 15079 + }, + { + "epoch": 1.89, + "grad_norm": 49.35564422607422, + "learning_rate": 7.3831736602100155e-06, + "loss": 0.6651, + "step": 15080 + }, + { + "epoch": 1.89, + "grad_norm": 40.579952239990234, + "learning_rate": 7.382336945153329e-06, + "loss": 1.3458, + "step": 15081 + }, + { + "epoch": 1.89, + "grad_norm": 28.484344482421875, + "learning_rate": 7.381500230096641e-06, + "loss": 2.1591, + "step": 15082 + }, + { + "epoch": 1.89, + "grad_norm": 7.652926445007324, + "learning_rate": 7.380663515039954e-06, + "loss": 0.3406, + "step": 15083 + }, + { + "epoch": 1.89, + "grad_norm": 18.412296295166016, + "learning_rate": 7.379826799983266e-06, + "loss": 1.7563, + "step": 15084 + }, + { + "epoch": 1.89, + "grad_norm": 16.60628318786621, + "learning_rate": 7.378990084926579e-06, + "loss": 0.7008, + "step": 15085 + }, + { + "epoch": 1.89, + "grad_norm": 21.525320053100586, + "learning_rate": 7.3781533698698915e-06, + "loss": 0.7776, + "step": 15086 + }, + { + "epoch": 1.89, + "grad_norm": 10.889859199523926, + "learning_rate": 7.3773166548132034e-06, + "loss": 1.0566, + "step": 15087 + }, + { + "epoch": 1.89, + "grad_norm": 12.126493453979492, + "learning_rate": 7.376479939756517e-06, + "loss": 1.1783, + "step": 15088 + }, + { + "epoch": 1.89, + "grad_norm": 11.531156539916992, + "learning_rate": 7.375643224699829e-06, + "loss": 1.9034, + "step": 15089 + }, + { + "epoch": 1.89, + "grad_norm": 18.80508041381836, + "learning_rate": 7.374806509643142e-06, + "loss": 0.9211, + "step": 15090 + }, + { + "epoch": 1.89, + "grad_norm": 2.0429701805114746, + "learning_rate": 7.373969794586454e-06, + "loss": 0.1388, + "step": 15091 + }, + { + "epoch": 1.89, + "grad_norm": 8.433183670043945, + "learning_rate": 7.373133079529767e-06, + "loss": 0.3499, + "step": 15092 + }, + { + "epoch": 1.89, + "grad_norm": 11.792083740234375, + "learning_rate": 7.372296364473079e-06, + "loss": 0.7006, + "step": 15093 + }, + { + "epoch": 1.89, + "grad_norm": 5.4274983406066895, + "learning_rate": 7.371459649416391e-06, + "loss": 0.2147, + "step": 15094 + }, + { + "epoch": 1.89, + "grad_norm": 22.65265655517578, + "learning_rate": 7.370622934359705e-06, + "loss": 0.9307, + "step": 15095 + }, + { + "epoch": 1.89, + "grad_norm": 74.88692474365234, + "learning_rate": 7.369786219303017e-06, + "loss": 2.2369, + "step": 15096 + }, + { + "epoch": 1.89, + "grad_norm": 43.321231842041016, + "learning_rate": 7.36894950424633e-06, + "loss": 1.5066, + "step": 15097 + }, + { + "epoch": 1.89, + "grad_norm": 10.069525718688965, + "learning_rate": 7.368112789189642e-06, + "loss": 1.406, + "step": 15098 + }, + { + "epoch": 1.89, + "grad_norm": 24.34798240661621, + "learning_rate": 7.367276074132954e-06, + "loss": 0.8286, + "step": 15099 + }, + { + "epoch": 1.9, + "grad_norm": 9.9306001663208, + "learning_rate": 7.366439359076267e-06, + "loss": 0.3151, + "step": 15100 + }, + { + "epoch": 1.9, + "grad_norm": 10.898557662963867, + "learning_rate": 7.365602644019579e-06, + "loss": 0.6529, + "step": 15101 + }, + { + "epoch": 1.9, + "grad_norm": 14.822643280029297, + "learning_rate": 7.364765928962893e-06, + "loss": 0.6295, + "step": 15102 + }, + { + "epoch": 1.9, + "grad_norm": 8.52712631225586, + "learning_rate": 7.363929213906205e-06, + "loss": 0.4436, + "step": 15103 + }, + { + "epoch": 1.9, + "grad_norm": 12.008702278137207, + "learning_rate": 7.363092498849518e-06, + "loss": 1.3962, + "step": 15104 + }, + { + "epoch": 1.9, + "grad_norm": 49.62347412109375, + "learning_rate": 7.36225578379283e-06, + "loss": 1.4771, + "step": 15105 + }, + { + "epoch": 1.9, + "grad_norm": 21.786096572875977, + "learning_rate": 7.361419068736142e-06, + "loss": 1.0213, + "step": 15106 + }, + { + "epoch": 1.9, + "grad_norm": 21.74210548400879, + "learning_rate": 7.360582353679455e-06, + "loss": 1.9559, + "step": 15107 + }, + { + "epoch": 1.9, + "grad_norm": 11.14261245727539, + "learning_rate": 7.359745638622767e-06, + "loss": 1.6207, + "step": 15108 + }, + { + "epoch": 1.9, + "grad_norm": 2.294177293777466, + "learning_rate": 7.358908923566081e-06, + "loss": 0.0516, + "step": 15109 + }, + { + "epoch": 1.9, + "grad_norm": 11.590841293334961, + "learning_rate": 7.358072208509393e-06, + "loss": 0.5179, + "step": 15110 + }, + { + "epoch": 1.9, + "grad_norm": 18.835065841674805, + "learning_rate": 7.357235493452706e-06, + "loss": 1.0335, + "step": 15111 + }, + { + "epoch": 1.9, + "grad_norm": 10.532160758972168, + "learning_rate": 7.3563987783960176e-06, + "loss": 1.2372, + "step": 15112 + }, + { + "epoch": 1.9, + "grad_norm": 14.514026641845703, + "learning_rate": 7.3555620633393295e-06, + "loss": 1.355, + "step": 15113 + }, + { + "epoch": 1.9, + "grad_norm": 16.543455123901367, + "learning_rate": 7.354725348282643e-06, + "loss": 0.6673, + "step": 15114 + }, + { + "epoch": 1.9, + "grad_norm": 16.31077766418457, + "learning_rate": 7.353888633225955e-06, + "loss": 0.9401, + "step": 15115 + }, + { + "epoch": 1.9, + "grad_norm": 12.899883270263672, + "learning_rate": 7.353051918169268e-06, + "loss": 0.6144, + "step": 15116 + }, + { + "epoch": 1.9, + "grad_norm": 56.374244689941406, + "learning_rate": 7.352215203112581e-06, + "loss": 1.9511, + "step": 15117 + }, + { + "epoch": 1.9, + "grad_norm": 17.845420837402344, + "learning_rate": 7.3513784880558935e-06, + "loss": 0.3596, + "step": 15118 + }, + { + "epoch": 1.9, + "grad_norm": 11.164807319641113, + "learning_rate": 7.3505417729992055e-06, + "loss": 1.0834, + "step": 15119 + }, + { + "epoch": 1.9, + "grad_norm": 23.736774444580078, + "learning_rate": 7.3497050579425174e-06, + "loss": 0.6354, + "step": 15120 + }, + { + "epoch": 1.9, + "grad_norm": 69.24571990966797, + "learning_rate": 7.348868342885831e-06, + "loss": 1.576, + "step": 15121 + }, + { + "epoch": 1.9, + "grad_norm": 17.8673152923584, + "learning_rate": 7.348031627829143e-06, + "loss": 0.4724, + "step": 15122 + }, + { + "epoch": 1.9, + "grad_norm": 24.305837631225586, + "learning_rate": 7.347194912772456e-06, + "loss": 0.5262, + "step": 15123 + }, + { + "epoch": 1.9, + "grad_norm": 9.139628410339355, + "learning_rate": 7.346358197715769e-06, + "loss": 0.8911, + "step": 15124 + }, + { + "epoch": 1.9, + "grad_norm": 8.655332565307617, + "learning_rate": 7.3455214826590814e-06, + "loss": 0.4515, + "step": 15125 + }, + { + "epoch": 1.9, + "grad_norm": 11.019580841064453, + "learning_rate": 7.344684767602393e-06, + "loss": 0.5824, + "step": 15126 + }, + { + "epoch": 1.9, + "grad_norm": 14.877920150756836, + "learning_rate": 7.343848052545705e-06, + "loss": 1.5006, + "step": 15127 + }, + { + "epoch": 1.9, + "grad_norm": 40.81086349487305, + "learning_rate": 7.343011337489019e-06, + "loss": 1.0782, + "step": 15128 + }, + { + "epoch": 1.9, + "grad_norm": 8.504908561706543, + "learning_rate": 7.342174622432331e-06, + "loss": 0.4803, + "step": 15129 + }, + { + "epoch": 1.9, + "grad_norm": 13.374335289001465, + "learning_rate": 7.341337907375644e-06, + "loss": 1.976, + "step": 15130 + }, + { + "epoch": 1.9, + "grad_norm": 20.366533279418945, + "learning_rate": 7.3405011923189566e-06, + "loss": 0.9099, + "step": 15131 + }, + { + "epoch": 1.9, + "grad_norm": 10.132173538208008, + "learning_rate": 7.339664477262269e-06, + "loss": 1.0805, + "step": 15132 + }, + { + "epoch": 1.9, + "grad_norm": 12.006936073303223, + "learning_rate": 7.338827762205581e-06, + "loss": 0.4156, + "step": 15133 + }, + { + "epoch": 1.9, + "grad_norm": 7.405896186828613, + "learning_rate": 7.337991047148893e-06, + "loss": 1.38, + "step": 15134 + }, + { + "epoch": 1.9, + "grad_norm": 4.1786627769470215, + "learning_rate": 7.337154332092207e-06, + "loss": 0.2808, + "step": 15135 + }, + { + "epoch": 1.9, + "grad_norm": 5.891201496124268, + "learning_rate": 7.336317617035519e-06, + "loss": 0.2731, + "step": 15136 + }, + { + "epoch": 1.9, + "grad_norm": 11.198047637939453, + "learning_rate": 7.335480901978832e-06, + "loss": 1.3121, + "step": 15137 + }, + { + "epoch": 1.9, + "grad_norm": 3.8166863918304443, + "learning_rate": 7.3346441869221445e-06, + "loss": 0.2421, + "step": 15138 + }, + { + "epoch": 1.9, + "grad_norm": 5.097291946411133, + "learning_rate": 7.333807471865457e-06, + "loss": 0.2636, + "step": 15139 + }, + { + "epoch": 1.9, + "grad_norm": 31.219444274902344, + "learning_rate": 7.332970756808769e-06, + "loss": 1.0549, + "step": 15140 + }, + { + "epoch": 1.9, + "grad_norm": 6.707075595855713, + "learning_rate": 7.332134041752081e-06, + "loss": 0.6725, + "step": 15141 + }, + { + "epoch": 1.9, + "grad_norm": 30.624589920043945, + "learning_rate": 7.331297326695395e-06, + "loss": 2.4987, + "step": 15142 + }, + { + "epoch": 1.9, + "grad_norm": 36.42421340942383, + "learning_rate": 7.330460611638707e-06, + "loss": 1.2836, + "step": 15143 + }, + { + "epoch": 1.9, + "grad_norm": 21.4605655670166, + "learning_rate": 7.32962389658202e-06, + "loss": 0.5328, + "step": 15144 + }, + { + "epoch": 1.9, + "grad_norm": 11.258630752563477, + "learning_rate": 7.328787181525332e-06, + "loss": 0.3049, + "step": 15145 + }, + { + "epoch": 1.9, + "grad_norm": 9.895224571228027, + "learning_rate": 7.327950466468645e-06, + "loss": 1.7441, + "step": 15146 + }, + { + "epoch": 1.9, + "grad_norm": 47.341033935546875, + "learning_rate": 7.327113751411957e-06, + "loss": 0.7324, + "step": 15147 + }, + { + "epoch": 1.9, + "grad_norm": 13.817826271057129, + "learning_rate": 7.326277036355269e-06, + "loss": 2.5864, + "step": 15148 + }, + { + "epoch": 1.9, + "grad_norm": 72.39340209960938, + "learning_rate": 7.325440321298583e-06, + "loss": 2.7951, + "step": 15149 + }, + { + "epoch": 1.9, + "grad_norm": 10.49294376373291, + "learning_rate": 7.324603606241895e-06, + "loss": 0.3464, + "step": 15150 + }, + { + "epoch": 1.9, + "grad_norm": 53.07066345214844, + "learning_rate": 7.3237668911852075e-06, + "loss": 1.9196, + "step": 15151 + }, + { + "epoch": 1.9, + "grad_norm": 12.413329124450684, + "learning_rate": 7.3229301761285195e-06, + "loss": 2.0227, + "step": 15152 + }, + { + "epoch": 1.9, + "grad_norm": 9.606522560119629, + "learning_rate": 7.322093461071833e-06, + "loss": 0.4902, + "step": 15153 + }, + { + "epoch": 1.9, + "grad_norm": 17.33139419555664, + "learning_rate": 7.321256746015145e-06, + "loss": 1.5737, + "step": 15154 + }, + { + "epoch": 1.9, + "grad_norm": 24.492815017700195, + "learning_rate": 7.320420030958457e-06, + "loss": 1.4284, + "step": 15155 + }, + { + "epoch": 1.9, + "grad_norm": 7.170698642730713, + "learning_rate": 7.319583315901771e-06, + "loss": 0.328, + "step": 15156 + }, + { + "epoch": 1.9, + "grad_norm": 20.43840789794922, + "learning_rate": 7.318746600845083e-06, + "loss": 1.3311, + "step": 15157 + }, + { + "epoch": 1.9, + "grad_norm": 22.837284088134766, + "learning_rate": 7.3179098857883954e-06, + "loss": 1.1056, + "step": 15158 + }, + { + "epoch": 1.9, + "grad_norm": 17.995685577392578, + "learning_rate": 7.317073170731707e-06, + "loss": 1.1954, + "step": 15159 + }, + { + "epoch": 1.9, + "grad_norm": 18.641700744628906, + "learning_rate": 7.316236455675021e-06, + "loss": 0.83, + "step": 15160 + }, + { + "epoch": 1.9, + "grad_norm": 10.630788803100586, + "learning_rate": 7.315399740618333e-06, + "loss": 1.4478, + "step": 15161 + }, + { + "epoch": 1.9, + "grad_norm": 16.64458465576172, + "learning_rate": 7.314563025561645e-06, + "loss": 1.4767, + "step": 15162 + }, + { + "epoch": 1.9, + "grad_norm": 26.294815063476562, + "learning_rate": 7.313726310504959e-06, + "loss": 1.6531, + "step": 15163 + }, + { + "epoch": 1.9, + "grad_norm": 13.200927734375, + "learning_rate": 7.3128895954482705e-06, + "loss": 1.0693, + "step": 15164 + }, + { + "epoch": 1.9, + "grad_norm": 13.845024108886719, + "learning_rate": 7.312052880391583e-06, + "loss": 4.2466, + "step": 15165 + }, + { + "epoch": 1.9, + "grad_norm": 9.46701717376709, + "learning_rate": 7.311216165334895e-06, + "loss": 0.6261, + "step": 15166 + }, + { + "epoch": 1.9, + "grad_norm": 31.682180404663086, + "learning_rate": 7.310379450278209e-06, + "loss": 2.2054, + "step": 15167 + }, + { + "epoch": 1.9, + "grad_norm": 6.807620048522949, + "learning_rate": 7.309542735221521e-06, + "loss": 1.3925, + "step": 15168 + }, + { + "epoch": 1.9, + "grad_norm": 25.933950424194336, + "learning_rate": 7.308706020164833e-06, + "loss": 0.6675, + "step": 15169 + }, + { + "epoch": 1.9, + "grad_norm": 28.577220916748047, + "learning_rate": 7.3078693051081465e-06, + "loss": 1.0649, + "step": 15170 + }, + { + "epoch": 1.9, + "grad_norm": 8.963960647583008, + "learning_rate": 7.3070325900514585e-06, + "loss": 0.9175, + "step": 15171 + }, + { + "epoch": 1.9, + "grad_norm": 7.93988561630249, + "learning_rate": 7.306195874994771e-06, + "loss": 0.485, + "step": 15172 + }, + { + "epoch": 1.9, + "grad_norm": 8.721303939819336, + "learning_rate": 7.305359159938083e-06, + "loss": 0.4843, + "step": 15173 + }, + { + "epoch": 1.9, + "grad_norm": 5.980069160461426, + "learning_rate": 7.304522444881397e-06, + "loss": 0.7523, + "step": 15174 + }, + { + "epoch": 1.9, + "grad_norm": 1.0261625051498413, + "learning_rate": 7.303685729824709e-06, + "loss": 0.0355, + "step": 15175 + }, + { + "epoch": 1.9, + "grad_norm": 28.56754493713379, + "learning_rate": 7.302849014768021e-06, + "loss": 1.4144, + "step": 15176 + }, + { + "epoch": 1.9, + "grad_norm": 6.409519672393799, + "learning_rate": 7.302012299711334e-06, + "loss": 0.4377, + "step": 15177 + }, + { + "epoch": 1.9, + "grad_norm": 11.686038970947266, + "learning_rate": 7.301175584654646e-06, + "loss": 2.361, + "step": 15178 + }, + { + "epoch": 1.9, + "grad_norm": 16.717464447021484, + "learning_rate": 7.300338869597959e-06, + "loss": 4.0278, + "step": 15179 + }, + { + "epoch": 1.91, + "grad_norm": 15.546704292297363, + "learning_rate": 7.299502154541271e-06, + "loss": 0.6647, + "step": 15180 + }, + { + "epoch": 1.91, + "grad_norm": 7.070075511932373, + "learning_rate": 7.298665439484585e-06, + "loss": 0.2537, + "step": 15181 + }, + { + "epoch": 1.91, + "grad_norm": 29.783416748046875, + "learning_rate": 7.297828724427897e-06, + "loss": 1.9954, + "step": 15182 + }, + { + "epoch": 1.91, + "grad_norm": 18.873929977416992, + "learning_rate": 7.296992009371209e-06, + "loss": 0.3738, + "step": 15183 + }, + { + "epoch": 1.91, + "grad_norm": 7.223883152008057, + "learning_rate": 7.296155294314522e-06, + "loss": 0.7467, + "step": 15184 + }, + { + "epoch": 1.91, + "grad_norm": 15.374015808105469, + "learning_rate": 7.295318579257834e-06, + "loss": 1.0972, + "step": 15185 + }, + { + "epoch": 1.91, + "grad_norm": 5.549311637878418, + "learning_rate": 7.294481864201147e-06, + "loss": 0.3452, + "step": 15186 + }, + { + "epoch": 1.91, + "grad_norm": 16.02145004272461, + "learning_rate": 7.293645149144459e-06, + "loss": 0.3838, + "step": 15187 + }, + { + "epoch": 1.91, + "grad_norm": 20.3539981842041, + "learning_rate": 7.292808434087773e-06, + "loss": 2.1246, + "step": 15188 + }, + { + "epoch": 1.91, + "grad_norm": 5.821852684020996, + "learning_rate": 7.291971719031085e-06, + "loss": 0.5058, + "step": 15189 + }, + { + "epoch": 1.91, + "grad_norm": 12.831040382385254, + "learning_rate": 7.291135003974397e-06, + "loss": 0.2566, + "step": 15190 + }, + { + "epoch": 1.91, + "grad_norm": 15.333756446838379, + "learning_rate": 7.29029828891771e-06, + "loss": 2.0446, + "step": 15191 + }, + { + "epoch": 1.91, + "grad_norm": 44.381813049316406, + "learning_rate": 7.289461573861022e-06, + "loss": 2.7031, + "step": 15192 + }, + { + "epoch": 1.91, + "grad_norm": 52.53483963012695, + "learning_rate": 7.288624858804335e-06, + "loss": 1.6683, + "step": 15193 + }, + { + "epoch": 1.91, + "grad_norm": 23.234952926635742, + "learning_rate": 7.287788143747647e-06, + "loss": 0.887, + "step": 15194 + }, + { + "epoch": 1.91, + "grad_norm": 119.96180725097656, + "learning_rate": 7.286951428690961e-06, + "loss": 1.6006, + "step": 15195 + }, + { + "epoch": 1.91, + "grad_norm": 18.631879806518555, + "learning_rate": 7.2861147136342726e-06, + "loss": 1.9097, + "step": 15196 + }, + { + "epoch": 1.91, + "grad_norm": 50.981788635253906, + "learning_rate": 7.2852779985775845e-06, + "loss": 1.9735, + "step": 15197 + }, + { + "epoch": 1.91, + "grad_norm": 5.808567523956299, + "learning_rate": 7.284441283520897e-06, + "loss": 0.3778, + "step": 15198 + }, + { + "epoch": 1.91, + "grad_norm": 17.7376651763916, + "learning_rate": 7.28360456846421e-06, + "loss": 0.9597, + "step": 15199 + }, + { + "epoch": 1.91, + "grad_norm": 142.1316375732422, + "learning_rate": 7.282767853407523e-06, + "loss": 0.846, + "step": 15200 + }, + { + "epoch": 1.91, + "eval_loss": 0.07876706123352051, + "eval_runtime": 96.4561, + "eval_samples_per_second": 36.721, + "eval_steps_per_second": 36.721, + "step": 15200 + }, + { + "epoch": 1.91, + "grad_norm": 6.942018032073975, + "learning_rate": 7.281931138350835e-06, + "loss": 0.2998, + "step": 15201 + }, + { + "epoch": 1.91, + "grad_norm": 39.050254821777344, + "learning_rate": 7.2810944232941485e-06, + "loss": 1.4191, + "step": 15202 + }, + { + "epoch": 1.91, + "grad_norm": 12.845418930053711, + "learning_rate": 7.2802577082374605e-06, + "loss": 0.9449, + "step": 15203 + }, + { + "epoch": 1.91, + "grad_norm": 237.6184539794922, + "learning_rate": 7.2794209931807724e-06, + "loss": 3.3873, + "step": 15204 + }, + { + "epoch": 1.91, + "grad_norm": 991.9547119140625, + "learning_rate": 7.278584278124085e-06, + "loss": 1.8497, + "step": 15205 + }, + { + "epoch": 1.91, + "grad_norm": 22.550867080688477, + "learning_rate": 7.277747563067398e-06, + "loss": 0.6088, + "step": 15206 + }, + { + "epoch": 1.91, + "grad_norm": 15.08200740814209, + "learning_rate": 7.276910848010711e-06, + "loss": 2.0656, + "step": 15207 + }, + { + "epoch": 1.91, + "grad_norm": 4.465595722198486, + "learning_rate": 7.276074132954023e-06, + "loss": 0.1612, + "step": 15208 + }, + { + "epoch": 1.91, + "grad_norm": 21.401498794555664, + "learning_rate": 7.2752374178973365e-06, + "loss": 1.6728, + "step": 15209 + }, + { + "epoch": 1.91, + "grad_norm": 25.57209587097168, + "learning_rate": 7.274400702840648e-06, + "loss": 1.3603, + "step": 15210 + }, + { + "epoch": 1.91, + "grad_norm": 7.294596195220947, + "learning_rate": 7.27356398778396e-06, + "loss": 0.1678, + "step": 15211 + }, + { + "epoch": 1.91, + "grad_norm": 56.66082763671875, + "learning_rate": 7.272727272727273e-06, + "loss": 2.7304, + "step": 15212 + }, + { + "epoch": 1.91, + "grad_norm": 11.751508712768555, + "learning_rate": 7.271890557670586e-06, + "loss": 0.881, + "step": 15213 + }, + { + "epoch": 1.91, + "grad_norm": 20.95166778564453, + "learning_rate": 7.271053842613899e-06, + "loss": 1.5158, + "step": 15214 + }, + { + "epoch": 1.91, + "grad_norm": 16.030107498168945, + "learning_rate": 7.270217127557211e-06, + "loss": 0.8177, + "step": 15215 + }, + { + "epoch": 1.91, + "grad_norm": 10.919445991516113, + "learning_rate": 7.269380412500524e-06, + "loss": 1.8145, + "step": 15216 + }, + { + "epoch": 1.91, + "grad_norm": 36.17577362060547, + "learning_rate": 7.268543697443836e-06, + "loss": 0.9956, + "step": 15217 + }, + { + "epoch": 1.91, + "grad_norm": 11.49276351928711, + "learning_rate": 7.267706982387148e-06, + "loss": 2.2279, + "step": 15218 + }, + { + "epoch": 1.91, + "grad_norm": 11.070647239685059, + "learning_rate": 7.266870267330461e-06, + "loss": 0.3389, + "step": 15219 + }, + { + "epoch": 1.91, + "grad_norm": 19.18267059326172, + "learning_rate": 7.266033552273774e-06, + "loss": 1.5976, + "step": 15220 + }, + { + "epoch": 1.91, + "grad_norm": 12.499030113220215, + "learning_rate": 7.265196837217087e-06, + "loss": 0.986, + "step": 15221 + }, + { + "epoch": 1.91, + "grad_norm": 14.461274147033691, + "learning_rate": 7.264360122160399e-06, + "loss": 0.7536, + "step": 15222 + }, + { + "epoch": 1.91, + "grad_norm": 22.120290756225586, + "learning_rate": 7.263523407103712e-06, + "loss": 1.0406, + "step": 15223 + }, + { + "epoch": 1.91, + "grad_norm": 43.58599090576172, + "learning_rate": 7.262686692047024e-06, + "loss": 3.0559, + "step": 15224 + }, + { + "epoch": 1.91, + "grad_norm": 55.521148681640625, + "learning_rate": 7.261849976990336e-06, + "loss": 1.18, + "step": 15225 + }, + { + "epoch": 1.91, + "grad_norm": 35.34981918334961, + "learning_rate": 7.261013261933649e-06, + "loss": 1.4666, + "step": 15226 + }, + { + "epoch": 1.91, + "grad_norm": 6.730340003967285, + "learning_rate": 7.260176546876962e-06, + "loss": 0.325, + "step": 15227 + }, + { + "epoch": 1.91, + "grad_norm": 5.402364730834961, + "learning_rate": 7.259339831820275e-06, + "loss": 0.263, + "step": 15228 + }, + { + "epoch": 1.91, + "grad_norm": 10.066123962402344, + "learning_rate": 7.2585031167635866e-06, + "loss": 0.4633, + "step": 15229 + }, + { + "epoch": 1.91, + "grad_norm": 40.3651008605957, + "learning_rate": 7.2576664017069e-06, + "loss": 1.5189, + "step": 15230 + }, + { + "epoch": 1.91, + "grad_norm": 18.799203872680664, + "learning_rate": 7.256829686650212e-06, + "loss": 1.1197, + "step": 15231 + }, + { + "epoch": 1.91, + "grad_norm": 15.619763374328613, + "learning_rate": 7.255992971593524e-06, + "loss": 2.1876, + "step": 15232 + }, + { + "epoch": 1.91, + "grad_norm": 12.761515617370605, + "learning_rate": 7.255156256536837e-06, + "loss": 0.7248, + "step": 15233 + }, + { + "epoch": 1.91, + "grad_norm": 11.928339004516602, + "learning_rate": 7.254319541480149e-06, + "loss": 0.4087, + "step": 15234 + }, + { + "epoch": 1.91, + "grad_norm": 23.60286521911621, + "learning_rate": 7.2534828264234625e-06, + "loss": 1.801, + "step": 15235 + }, + { + "epoch": 1.91, + "grad_norm": 4.796131134033203, + "learning_rate": 7.2526461113667745e-06, + "loss": 0.2426, + "step": 15236 + }, + { + "epoch": 1.91, + "grad_norm": 54.70196533203125, + "learning_rate": 7.251809396310088e-06, + "loss": 3.1836, + "step": 15237 + }, + { + "epoch": 1.91, + "grad_norm": 79.63700103759766, + "learning_rate": 7.2509726812534e-06, + "loss": 1.9657, + "step": 15238 + }, + { + "epoch": 1.91, + "grad_norm": 9.026930809020996, + "learning_rate": 7.250135966196712e-06, + "loss": 0.7326, + "step": 15239 + }, + { + "epoch": 1.91, + "grad_norm": 10.02031135559082, + "learning_rate": 7.249299251140025e-06, + "loss": 1.1231, + "step": 15240 + }, + { + "epoch": 1.91, + "grad_norm": 21.590858459472656, + "learning_rate": 7.248462536083337e-06, + "loss": 3.2721, + "step": 15241 + }, + { + "epoch": 1.91, + "grad_norm": 12.597800254821777, + "learning_rate": 7.2476258210266504e-06, + "loss": 1.2757, + "step": 15242 + }, + { + "epoch": 1.91, + "grad_norm": 42.85508728027344, + "learning_rate": 7.246789105969962e-06, + "loss": 0.5524, + "step": 15243 + }, + { + "epoch": 1.91, + "grad_norm": 12.83395004272461, + "learning_rate": 7.245952390913275e-06, + "loss": 2.0516, + "step": 15244 + }, + { + "epoch": 1.91, + "grad_norm": 24.339702606201172, + "learning_rate": 7.245115675856588e-06, + "loss": 2.0179, + "step": 15245 + }, + { + "epoch": 1.91, + "grad_norm": 160.06024169921875, + "learning_rate": 7.2442789607999e-06, + "loss": 3.1359, + "step": 15246 + }, + { + "epoch": 1.91, + "grad_norm": 16.193593978881836, + "learning_rate": 7.243442245743213e-06, + "loss": 0.4034, + "step": 15247 + }, + { + "epoch": 1.91, + "grad_norm": 49.94191360473633, + "learning_rate": 7.242605530686525e-06, + "loss": 2.3597, + "step": 15248 + }, + { + "epoch": 1.91, + "grad_norm": 12.985690116882324, + "learning_rate": 7.241768815629838e-06, + "loss": 0.4545, + "step": 15249 + }, + { + "epoch": 1.91, + "grad_norm": 19.354537963867188, + "learning_rate": 7.24093210057315e-06, + "loss": 1.873, + "step": 15250 + }, + { + "epoch": 1.91, + "grad_norm": 13.181794166564941, + "learning_rate": 7.240095385516463e-06, + "loss": 1.2042, + "step": 15251 + }, + { + "epoch": 1.91, + "grad_norm": 61.11602020263672, + "learning_rate": 7.239258670459776e-06, + "loss": 0.8935, + "step": 15252 + }, + { + "epoch": 1.91, + "grad_norm": 18.95747184753418, + "learning_rate": 7.238421955403088e-06, + "loss": 1.0299, + "step": 15253 + }, + { + "epoch": 1.91, + "grad_norm": 9.337152481079102, + "learning_rate": 7.237585240346401e-06, + "loss": 1.2961, + "step": 15254 + }, + { + "epoch": 1.91, + "grad_norm": 7.195521354675293, + "learning_rate": 7.236748525289713e-06, + "loss": 2.0989, + "step": 15255 + }, + { + "epoch": 1.91, + "grad_norm": 12.675674438476562, + "learning_rate": 7.235911810233026e-06, + "loss": 1.7304, + "step": 15256 + }, + { + "epoch": 1.91, + "grad_norm": 8.78879451751709, + "learning_rate": 7.235075095176338e-06, + "loss": 0.2985, + "step": 15257 + }, + { + "epoch": 1.91, + "grad_norm": 18.082355499267578, + "learning_rate": 7.23423838011965e-06, + "loss": 1.065, + "step": 15258 + }, + { + "epoch": 1.91, + "grad_norm": 24.055559158325195, + "learning_rate": 7.233401665062964e-06, + "loss": 0.9967, + "step": 15259 + }, + { + "epoch": 1.92, + "grad_norm": 11.412266731262207, + "learning_rate": 7.232564950006276e-06, + "loss": 1.2657, + "step": 15260 + }, + { + "epoch": 1.92, + "grad_norm": 5.17311954498291, + "learning_rate": 7.231728234949589e-06, + "loss": 0.6104, + "step": 15261 + }, + { + "epoch": 1.92, + "grad_norm": 6.313230991363525, + "learning_rate": 7.2308915198929006e-06, + "loss": 0.3799, + "step": 15262 + }, + { + "epoch": 1.92, + "grad_norm": 10.846939086914062, + "learning_rate": 7.230054804836214e-06, + "loss": 0.9638, + "step": 15263 + }, + { + "epoch": 1.92, + "grad_norm": 12.357636451721191, + "learning_rate": 7.229218089779526e-06, + "loss": 1.0083, + "step": 15264 + }, + { + "epoch": 1.92, + "grad_norm": 9.515074729919434, + "learning_rate": 7.228381374722838e-06, + "loss": 0.3895, + "step": 15265 + }, + { + "epoch": 1.92, + "grad_norm": 26.114919662475586, + "learning_rate": 7.227544659666152e-06, + "loss": 1.9338, + "step": 15266 + }, + { + "epoch": 1.92, + "grad_norm": 15.958468437194824, + "learning_rate": 7.226707944609464e-06, + "loss": 0.7954, + "step": 15267 + }, + { + "epoch": 1.92, + "grad_norm": 7.361175537109375, + "learning_rate": 7.2258712295527765e-06, + "loss": 0.2709, + "step": 15268 + }, + { + "epoch": 1.92, + "grad_norm": 10.929686546325684, + "learning_rate": 7.2250345144960885e-06, + "loss": 0.7763, + "step": 15269 + }, + { + "epoch": 1.92, + "grad_norm": 30.771678924560547, + "learning_rate": 7.224197799439402e-06, + "loss": 0.9929, + "step": 15270 + }, + { + "epoch": 1.92, + "grad_norm": 16.85222816467285, + "learning_rate": 7.223361084382714e-06, + "loss": 1.3375, + "step": 15271 + }, + { + "epoch": 1.92, + "grad_norm": 18.63667869567871, + "learning_rate": 7.222524369326026e-06, + "loss": 1.1272, + "step": 15272 + }, + { + "epoch": 1.92, + "grad_norm": 8.972474098205566, + "learning_rate": 7.22168765426934e-06, + "loss": 0.5789, + "step": 15273 + }, + { + "epoch": 1.92, + "grad_norm": 17.44340705871582, + "learning_rate": 7.220850939212652e-06, + "loss": 1.8272, + "step": 15274 + }, + { + "epoch": 1.92, + "grad_norm": 16.283367156982422, + "learning_rate": 7.2200142241559644e-06, + "loss": 0.3297, + "step": 15275 + }, + { + "epoch": 1.92, + "grad_norm": 0.77719646692276, + "learning_rate": 7.219177509099276e-06, + "loss": 0.0125, + "step": 15276 + }, + { + "epoch": 1.92, + "grad_norm": 18.31374740600586, + "learning_rate": 7.21834079404259e-06, + "loss": 3.0636, + "step": 15277 + }, + { + "epoch": 1.92, + "grad_norm": 50.88170623779297, + "learning_rate": 7.217504078985902e-06, + "loss": 2.3115, + "step": 15278 + }, + { + "epoch": 1.92, + "grad_norm": 78.62443542480469, + "learning_rate": 7.216667363929214e-06, + "loss": 0.7992, + "step": 15279 + }, + { + "epoch": 1.92, + "grad_norm": 16.376123428344727, + "learning_rate": 7.215830648872527e-06, + "loss": 0.6161, + "step": 15280 + }, + { + "epoch": 1.92, + "grad_norm": 27.0351619720459, + "learning_rate": 7.2149939338158396e-06, + "loss": 1.6892, + "step": 15281 + }, + { + "epoch": 1.92, + "grad_norm": 13.475540161132812, + "learning_rate": 7.214157218759152e-06, + "loss": 0.7147, + "step": 15282 + }, + { + "epoch": 1.92, + "grad_norm": 18.50657081604004, + "learning_rate": 7.213320503702464e-06, + "loss": 1.8595, + "step": 15283 + }, + { + "epoch": 1.92, + "grad_norm": 11.895668983459473, + "learning_rate": 7.212483788645778e-06, + "loss": 0.3789, + "step": 15284 + }, + { + "epoch": 1.92, + "grad_norm": 10.596177101135254, + "learning_rate": 7.21164707358909e-06, + "loss": 1.4801, + "step": 15285 + }, + { + "epoch": 1.92, + "grad_norm": 7.016933441162109, + "learning_rate": 7.210810358532402e-06, + "loss": 1.1085, + "step": 15286 + }, + { + "epoch": 1.92, + "grad_norm": 8.173614501953125, + "learning_rate": 7.209973643475715e-06, + "loss": 0.6395, + "step": 15287 + }, + { + "epoch": 1.92, + "grad_norm": 16.760009765625, + "learning_rate": 7.2091369284190275e-06, + "loss": 0.949, + "step": 15288 + }, + { + "epoch": 1.92, + "grad_norm": 47.62588119506836, + "learning_rate": 7.20830021336234e-06, + "loss": 2.1585, + "step": 15289 + }, + { + "epoch": 1.92, + "grad_norm": 10.460638999938965, + "learning_rate": 7.207463498305652e-06, + "loss": 1.4834, + "step": 15290 + }, + { + "epoch": 1.92, + "grad_norm": 16.595172882080078, + "learning_rate": 7.206626783248966e-06, + "loss": 0.5579, + "step": 15291 + }, + { + "epoch": 1.92, + "grad_norm": 16.40092658996582, + "learning_rate": 7.205790068192278e-06, + "loss": 1.733, + "step": 15292 + }, + { + "epoch": 1.92, + "grad_norm": 45.9400520324707, + "learning_rate": 7.20495335313559e-06, + "loss": 1.4993, + "step": 15293 + }, + { + "epoch": 1.92, + "grad_norm": 4.556918144226074, + "learning_rate": 7.204116638078903e-06, + "loss": 0.093, + "step": 15294 + }, + { + "epoch": 1.92, + "grad_norm": 3.307995557785034, + "learning_rate": 7.203279923022215e-06, + "loss": 0.2106, + "step": 15295 + }, + { + "epoch": 1.92, + "grad_norm": 24.323028564453125, + "learning_rate": 7.202443207965528e-06, + "loss": 0.9309, + "step": 15296 + }, + { + "epoch": 1.92, + "grad_norm": 13.639983177185059, + "learning_rate": 7.20160649290884e-06, + "loss": 1.3806, + "step": 15297 + }, + { + "epoch": 1.92, + "grad_norm": 19.285390853881836, + "learning_rate": 7.200769777852154e-06, + "loss": 0.9875, + "step": 15298 + }, + { + "epoch": 1.92, + "grad_norm": 9.064143180847168, + "learning_rate": 7.199933062795466e-06, + "loss": 1.4574, + "step": 15299 + }, + { + "epoch": 1.92, + "grad_norm": 16.76412582397461, + "learning_rate": 7.199096347738778e-06, + "loss": 1.0419, + "step": 15300 + }, + { + "epoch": 1.92, + "grad_norm": 33.953739166259766, + "learning_rate": 7.1982596326820905e-06, + "loss": 2.0381, + "step": 15301 + }, + { + "epoch": 1.92, + "grad_norm": 12.692020416259766, + "learning_rate": 7.197422917625403e-06, + "loss": 0.7316, + "step": 15302 + }, + { + "epoch": 1.92, + "grad_norm": 28.04511260986328, + "learning_rate": 7.196586202568716e-06, + "loss": 2.1434, + "step": 15303 + }, + { + "epoch": 1.92, + "grad_norm": 6.163270473480225, + "learning_rate": 7.195749487512028e-06, + "loss": 0.8395, + "step": 15304 + }, + { + "epoch": 1.92, + "grad_norm": 6.6385579109191895, + "learning_rate": 7.194912772455342e-06, + "loss": 0.3598, + "step": 15305 + }, + { + "epoch": 1.92, + "grad_norm": 19.119504928588867, + "learning_rate": 7.194076057398654e-06, + "loss": 2.1298, + "step": 15306 + }, + { + "epoch": 1.92, + "grad_norm": 7.362306594848633, + "learning_rate": 7.193239342341966e-06, + "loss": 0.4124, + "step": 15307 + }, + { + "epoch": 1.92, + "grad_norm": 9.695954322814941, + "learning_rate": 7.1924026272852784e-06, + "loss": 0.2428, + "step": 15308 + }, + { + "epoch": 1.92, + "grad_norm": 17.593093872070312, + "learning_rate": 7.19156591222859e-06, + "loss": 1.2185, + "step": 15309 + }, + { + "epoch": 1.92, + "grad_norm": 11.084668159484863, + "learning_rate": 7.190729197171904e-06, + "loss": 1.2277, + "step": 15310 + }, + { + "epoch": 1.92, + "grad_norm": 15.913836479187012, + "learning_rate": 7.189892482115216e-06, + "loss": 0.9599, + "step": 15311 + }, + { + "epoch": 1.92, + "grad_norm": 8.229795455932617, + "learning_rate": 7.18905576705853e-06, + "loss": 0.5386, + "step": 15312 + }, + { + "epoch": 1.92, + "grad_norm": 19.191814422607422, + "learning_rate": 7.188219052001842e-06, + "loss": 1.045, + "step": 15313 + }, + { + "epoch": 1.92, + "grad_norm": 22.096803665161133, + "learning_rate": 7.1873823369451535e-06, + "loss": 1.4529, + "step": 15314 + }, + { + "epoch": 1.92, + "grad_norm": 18.706634521484375, + "learning_rate": 7.186545621888466e-06, + "loss": 0.8223, + "step": 15315 + }, + { + "epoch": 1.92, + "grad_norm": 3.793954849243164, + "learning_rate": 7.185708906831778e-06, + "loss": 0.1147, + "step": 15316 + }, + { + "epoch": 1.92, + "grad_norm": 13.102605819702148, + "learning_rate": 7.184872191775092e-06, + "loss": 0.7827, + "step": 15317 + }, + { + "epoch": 1.92, + "grad_norm": 10.377617835998535, + "learning_rate": 7.184035476718404e-06, + "loss": 0.4478, + "step": 15318 + }, + { + "epoch": 1.92, + "grad_norm": 22.061677932739258, + "learning_rate": 7.1831987616617175e-06, + "loss": 0.964, + "step": 15319 + }, + { + "epoch": 1.92, + "grad_norm": 7.3718719482421875, + "learning_rate": 7.1823620466050295e-06, + "loss": 0.9412, + "step": 15320 + }, + { + "epoch": 1.92, + "grad_norm": 92.04766082763672, + "learning_rate": 7.1815253315483415e-06, + "loss": 1.7727, + "step": 15321 + }, + { + "epoch": 1.92, + "grad_norm": 13.109322547912598, + "learning_rate": 7.180688616491654e-06, + "loss": 1.5429, + "step": 15322 + }, + { + "epoch": 1.92, + "grad_norm": 22.95648765563965, + "learning_rate": 7.179851901434966e-06, + "loss": 1.0019, + "step": 15323 + }, + { + "epoch": 1.92, + "grad_norm": 6.840642929077148, + "learning_rate": 7.17901518637828e-06, + "loss": 1.2736, + "step": 15324 + }, + { + "epoch": 1.92, + "grad_norm": 10.248290061950684, + "learning_rate": 7.178178471321592e-06, + "loss": 1.3105, + "step": 15325 + }, + { + "epoch": 1.92, + "grad_norm": 15.547542572021484, + "learning_rate": 7.177341756264905e-06, + "loss": 0.5836, + "step": 15326 + }, + { + "epoch": 1.92, + "grad_norm": 3.253751277923584, + "learning_rate": 7.176505041208217e-06, + "loss": 0.0978, + "step": 15327 + }, + { + "epoch": 1.92, + "grad_norm": 6.300126552581787, + "learning_rate": 7.175668326151529e-06, + "loss": 1.7964, + "step": 15328 + }, + { + "epoch": 1.92, + "grad_norm": 14.57581615447998, + "learning_rate": 7.174831611094842e-06, + "loss": 0.7965, + "step": 15329 + }, + { + "epoch": 1.92, + "grad_norm": 35.33696746826172, + "learning_rate": 7.173994896038154e-06, + "loss": 3.6189, + "step": 15330 + }, + { + "epoch": 1.92, + "grad_norm": 4.974849224090576, + "learning_rate": 7.173158180981468e-06, + "loss": 0.6593, + "step": 15331 + }, + { + "epoch": 1.92, + "grad_norm": 15.684173583984375, + "learning_rate": 7.17232146592478e-06, + "loss": 2.0062, + "step": 15332 + }, + { + "epoch": 1.92, + "grad_norm": 10.59436321258545, + "learning_rate": 7.1714847508680925e-06, + "loss": 0.7349, + "step": 15333 + }, + { + "epoch": 1.92, + "grad_norm": 24.182329177856445, + "learning_rate": 7.170648035811405e-06, + "loss": 1.2789, + "step": 15334 + }, + { + "epoch": 1.92, + "grad_norm": 20.643909454345703, + "learning_rate": 7.169811320754717e-06, + "loss": 0.5253, + "step": 15335 + }, + { + "epoch": 1.92, + "grad_norm": 15.558892250061035, + "learning_rate": 7.16897460569803e-06, + "loss": 1.6174, + "step": 15336 + }, + { + "epoch": 1.92, + "grad_norm": 36.112823486328125, + "learning_rate": 7.168137890641342e-06, + "loss": 0.7389, + "step": 15337 + }, + { + "epoch": 1.92, + "grad_norm": 25.463340759277344, + "learning_rate": 7.167301175584656e-06, + "loss": 1.8058, + "step": 15338 + }, + { + "epoch": 1.93, + "grad_norm": 7.937673091888428, + "learning_rate": 7.166464460527968e-06, + "loss": 0.1034, + "step": 15339 + }, + { + "epoch": 1.93, + "grad_norm": 9.738503456115723, + "learning_rate": 7.1656277454712805e-06, + "loss": 0.548, + "step": 15340 + }, + { + "epoch": 1.93, + "grad_norm": 3.6526103019714355, + "learning_rate": 7.164791030414593e-06, + "loss": 0.3518, + "step": 15341 + }, + { + "epoch": 1.93, + "grad_norm": 25.582611083984375, + "learning_rate": 7.163954315357905e-06, + "loss": 0.8825, + "step": 15342 + }, + { + "epoch": 1.93, + "grad_norm": 15.33735466003418, + "learning_rate": 7.163117600301218e-06, + "loss": 1.0733, + "step": 15343 + }, + { + "epoch": 1.93, + "grad_norm": 15.540820121765137, + "learning_rate": 7.16228088524453e-06, + "loss": 1.082, + "step": 15344 + }, + { + "epoch": 1.93, + "grad_norm": 14.650261878967285, + "learning_rate": 7.161444170187844e-06, + "loss": 1.4717, + "step": 15345 + }, + { + "epoch": 1.93, + "grad_norm": 15.091980934143066, + "learning_rate": 7.1606074551311556e-06, + "loss": 0.8659, + "step": 15346 + }, + { + "epoch": 1.93, + "grad_norm": 11.12157154083252, + "learning_rate": 7.159770740074468e-06, + "loss": 0.437, + "step": 15347 + }, + { + "epoch": 1.93, + "grad_norm": 8.351181983947754, + "learning_rate": 7.158934025017781e-06, + "loss": 1.6069, + "step": 15348 + }, + { + "epoch": 1.93, + "grad_norm": 90.13201904296875, + "learning_rate": 7.158097309961093e-06, + "loss": 3.6957, + "step": 15349 + }, + { + "epoch": 1.93, + "grad_norm": 10.546927452087402, + "learning_rate": 7.157260594904406e-06, + "loss": 0.5513, + "step": 15350 + }, + { + "epoch": 1.93, + "grad_norm": 19.79357147216797, + "learning_rate": 7.156423879847718e-06, + "loss": 1.0419, + "step": 15351 + }, + { + "epoch": 1.93, + "grad_norm": 9.656234741210938, + "learning_rate": 7.1555871647910315e-06, + "loss": 2.2094, + "step": 15352 + }, + { + "epoch": 1.93, + "grad_norm": 31.641164779663086, + "learning_rate": 7.1547504497343435e-06, + "loss": 1.1596, + "step": 15353 + }, + { + "epoch": 1.93, + "grad_norm": 17.413711547851562, + "learning_rate": 7.153913734677656e-06, + "loss": 1.6971, + "step": 15354 + }, + { + "epoch": 1.93, + "grad_norm": 4.659546375274658, + "learning_rate": 7.153077019620969e-06, + "loss": 1.0103, + "step": 15355 + }, + { + "epoch": 1.93, + "grad_norm": 4.0066752433776855, + "learning_rate": 7.152240304564281e-06, + "loss": 0.1789, + "step": 15356 + }, + { + "epoch": 1.93, + "grad_norm": 5.486143589019775, + "learning_rate": 7.151403589507594e-06, + "loss": 0.3966, + "step": 15357 + }, + { + "epoch": 1.93, + "grad_norm": 5.760356426239014, + "learning_rate": 7.150566874450906e-06, + "loss": 1.0819, + "step": 15358 + }, + { + "epoch": 1.93, + "grad_norm": 12.972009658813477, + "learning_rate": 7.1497301593942195e-06, + "loss": 0.9274, + "step": 15359 + }, + { + "epoch": 1.93, + "grad_norm": 21.619810104370117, + "learning_rate": 7.148893444337531e-06, + "loss": 1.5404, + "step": 15360 + }, + { + "epoch": 1.93, + "grad_norm": 4.129205226898193, + "learning_rate": 7.148056729280844e-06, + "loss": 0.3018, + "step": 15361 + }, + { + "epoch": 1.93, + "grad_norm": 11.901113510131836, + "learning_rate": 7.147220014224156e-06, + "loss": 0.748, + "step": 15362 + }, + { + "epoch": 1.93, + "grad_norm": 12.46020793914795, + "learning_rate": 7.146383299167469e-06, + "loss": 0.5956, + "step": 15363 + }, + { + "epoch": 1.93, + "grad_norm": 8.097681045532227, + "learning_rate": 7.145546584110782e-06, + "loss": 1.1116, + "step": 15364 + }, + { + "epoch": 1.93, + "grad_norm": 9.668359756469727, + "learning_rate": 7.144709869054094e-06, + "loss": 1.9949, + "step": 15365 + }, + { + "epoch": 1.93, + "grad_norm": 7.611221790313721, + "learning_rate": 7.143873153997407e-06, + "loss": 0.3213, + "step": 15366 + }, + { + "epoch": 1.93, + "grad_norm": 64.34578704833984, + "learning_rate": 7.143036438940719e-06, + "loss": 2.5571, + "step": 15367 + }, + { + "epoch": 1.93, + "grad_norm": 7.546182632446289, + "learning_rate": 7.142199723884032e-06, + "loss": 2.1841, + "step": 15368 + }, + { + "epoch": 1.93, + "grad_norm": 3.2903261184692383, + "learning_rate": 7.141363008827344e-06, + "loss": 0.2241, + "step": 15369 + }, + { + "epoch": 1.93, + "grad_norm": 15.703153610229492, + "learning_rate": 7.140526293770657e-06, + "loss": 0.8884, + "step": 15370 + }, + { + "epoch": 1.93, + "grad_norm": 9.410683631896973, + "learning_rate": 7.13968957871397e-06, + "loss": 0.4108, + "step": 15371 + }, + { + "epoch": 1.93, + "grad_norm": 7.17274808883667, + "learning_rate": 7.138852863657282e-06, + "loss": 0.4658, + "step": 15372 + }, + { + "epoch": 1.93, + "grad_norm": 7.382733345031738, + "learning_rate": 7.138016148600595e-06, + "loss": 0.8086, + "step": 15373 + }, + { + "epoch": 1.93, + "grad_norm": 11.851573944091797, + "learning_rate": 7.137179433543907e-06, + "loss": 1.2607, + "step": 15374 + }, + { + "epoch": 1.93, + "grad_norm": 1.1019930839538574, + "learning_rate": 7.13634271848722e-06, + "loss": 0.0266, + "step": 15375 + }, + { + "epoch": 1.93, + "grad_norm": 18.079978942871094, + "learning_rate": 7.135506003430532e-06, + "loss": 2.1212, + "step": 15376 + }, + { + "epoch": 1.93, + "grad_norm": 59.86653518676758, + "learning_rate": 7.134669288373845e-06, + "loss": 1.968, + "step": 15377 + }, + { + "epoch": 1.93, + "grad_norm": 17.122314453125, + "learning_rate": 7.133832573317158e-06, + "loss": 0.509, + "step": 15378 + }, + { + "epoch": 1.93, + "grad_norm": 29.334320068359375, + "learning_rate": 7.1329958582604696e-06, + "loss": 2.0254, + "step": 15379 + }, + { + "epoch": 1.93, + "grad_norm": 11.485424041748047, + "learning_rate": 7.132159143203783e-06, + "loss": 0.9514, + "step": 15380 + }, + { + "epoch": 1.93, + "grad_norm": 6.937499046325684, + "learning_rate": 7.131322428147095e-06, + "loss": 0.4008, + "step": 15381 + }, + { + "epoch": 1.93, + "grad_norm": 2.3256618976593018, + "learning_rate": 7.130485713090408e-06, + "loss": 0.1147, + "step": 15382 + }, + { + "epoch": 1.93, + "grad_norm": 8.174434661865234, + "learning_rate": 7.12964899803372e-06, + "loss": 0.3715, + "step": 15383 + }, + { + "epoch": 1.93, + "grad_norm": 11.014540672302246, + "learning_rate": 7.128812282977033e-06, + "loss": 1.8294, + "step": 15384 + }, + { + "epoch": 1.93, + "grad_norm": 60.12565231323242, + "learning_rate": 7.1279755679203455e-06, + "loss": 1.9883, + "step": 15385 + }, + { + "epoch": 1.93, + "grad_norm": 15.926053047180176, + "learning_rate": 7.1271388528636575e-06, + "loss": 0.7368, + "step": 15386 + }, + { + "epoch": 1.93, + "grad_norm": 20.925365447998047, + "learning_rate": 7.126302137806971e-06, + "loss": 1.8686, + "step": 15387 + }, + { + "epoch": 1.93, + "grad_norm": 12.295696258544922, + "learning_rate": 7.125465422750283e-06, + "loss": 1.3463, + "step": 15388 + }, + { + "epoch": 1.93, + "grad_norm": 20.267383575439453, + "learning_rate": 7.124628707693596e-06, + "loss": 0.6291, + "step": 15389 + }, + { + "epoch": 1.93, + "grad_norm": 15.206001281738281, + "learning_rate": 7.123791992636908e-06, + "loss": 1.6648, + "step": 15390 + }, + { + "epoch": 1.93, + "grad_norm": 50.678199768066406, + "learning_rate": 7.12295527758022e-06, + "loss": 2.8122, + "step": 15391 + }, + { + "epoch": 1.93, + "grad_norm": 10.255516052246094, + "learning_rate": 7.1221185625235334e-06, + "loss": 0.7144, + "step": 15392 + }, + { + "epoch": 1.93, + "grad_norm": 18.288196563720703, + "learning_rate": 7.121281847466845e-06, + "loss": 1.5048, + "step": 15393 + }, + { + "epoch": 1.93, + "grad_norm": 35.55363082885742, + "learning_rate": 7.120445132410159e-06, + "loss": 1.2026, + "step": 15394 + }, + { + "epoch": 1.93, + "grad_norm": 6.697925090789795, + "learning_rate": 7.119608417353471e-06, + "loss": 0.4907, + "step": 15395 + }, + { + "epoch": 1.93, + "grad_norm": 18.958486557006836, + "learning_rate": 7.118771702296784e-06, + "loss": 0.8496, + "step": 15396 + }, + { + "epoch": 1.93, + "grad_norm": 50.062774658203125, + "learning_rate": 7.117934987240096e-06, + "loss": 1.6842, + "step": 15397 + }, + { + "epoch": 1.93, + "grad_norm": 3.4984216690063477, + "learning_rate": 7.117098272183408e-06, + "loss": 0.39, + "step": 15398 + }, + { + "epoch": 1.93, + "grad_norm": 56.26967239379883, + "learning_rate": 7.116261557126721e-06, + "loss": 0.7481, + "step": 15399 + }, + { + "epoch": 1.93, + "grad_norm": 13.155136108398438, + "learning_rate": 7.115424842070033e-06, + "loss": 0.7274, + "step": 15400 + }, + { + "epoch": 1.93, + "grad_norm": 8.556282043457031, + "learning_rate": 7.114588127013347e-06, + "loss": 1.1603, + "step": 15401 + }, + { + "epoch": 1.93, + "grad_norm": 6.849576473236084, + "learning_rate": 7.113751411956659e-06, + "loss": 0.2008, + "step": 15402 + }, + { + "epoch": 1.93, + "grad_norm": 40.26840591430664, + "learning_rate": 7.112914696899972e-06, + "loss": 2.0927, + "step": 15403 + }, + { + "epoch": 1.93, + "grad_norm": 9.60886287689209, + "learning_rate": 7.112077981843284e-06, + "loss": 0.7664, + "step": 15404 + }, + { + "epoch": 1.93, + "grad_norm": 12.531184196472168, + "learning_rate": 7.111241266786596e-06, + "loss": 1.0722, + "step": 15405 + }, + { + "epoch": 1.93, + "grad_norm": 17.331567764282227, + "learning_rate": 7.110404551729909e-06, + "loss": 0.843, + "step": 15406 + }, + { + "epoch": 1.93, + "grad_norm": 3.1289303302764893, + "learning_rate": 7.109567836673221e-06, + "loss": 0.2729, + "step": 15407 + }, + { + "epoch": 1.93, + "grad_norm": 11.882319450378418, + "learning_rate": 7.108731121616534e-06, + "loss": 0.8114, + "step": 15408 + }, + { + "epoch": 1.93, + "grad_norm": 9.488268852233887, + "learning_rate": 7.107894406559847e-06, + "loss": 0.4272, + "step": 15409 + }, + { + "epoch": 1.93, + "grad_norm": 30.07292938232422, + "learning_rate": 7.10705769150316e-06, + "loss": 1.7169, + "step": 15410 + }, + { + "epoch": 1.93, + "grad_norm": 17.756690979003906, + "learning_rate": 7.106220976446472e-06, + "loss": 0.7914, + "step": 15411 + }, + { + "epoch": 1.93, + "grad_norm": 68.1766357421875, + "learning_rate": 7.1053842613897836e-06, + "loss": 0.6702, + "step": 15412 + }, + { + "epoch": 1.93, + "grad_norm": 4.996368408203125, + "learning_rate": 7.104547546333097e-06, + "loss": 0.3774, + "step": 15413 + }, + { + "epoch": 1.93, + "grad_norm": 12.997795104980469, + "learning_rate": 7.103710831276409e-06, + "loss": 0.9306, + "step": 15414 + }, + { + "epoch": 1.93, + "grad_norm": 15.791701316833496, + "learning_rate": 7.102874116219722e-06, + "loss": 0.5919, + "step": 15415 + }, + { + "epoch": 1.93, + "grad_norm": 15.011215209960938, + "learning_rate": 7.102037401163035e-06, + "loss": 2.4547, + "step": 15416 + }, + { + "epoch": 1.93, + "grad_norm": 12.556661605834961, + "learning_rate": 7.1012006861063476e-06, + "loss": 1.3745, + "step": 15417 + }, + { + "epoch": 1.93, + "grad_norm": 11.91063404083252, + "learning_rate": 7.1003639710496595e-06, + "loss": 0.7513, + "step": 15418 + }, + { + "epoch": 1.94, + "grad_norm": 7.577408790588379, + "learning_rate": 7.0995272559929715e-06, + "loss": 0.9176, + "step": 15419 + }, + { + "epoch": 1.94, + "grad_norm": 13.84679889678955, + "learning_rate": 7.098690540936285e-06, + "loss": 1.835, + "step": 15420 + }, + { + "epoch": 1.94, + "grad_norm": 35.05377197265625, + "learning_rate": 7.097853825879597e-06, + "loss": 1.8702, + "step": 15421 + }, + { + "epoch": 1.94, + "grad_norm": 16.646114349365234, + "learning_rate": 7.09701711082291e-06, + "loss": 1.0236, + "step": 15422 + }, + { + "epoch": 1.94, + "grad_norm": 10.494402885437012, + "learning_rate": 7.096180395766223e-06, + "loss": 1.6881, + "step": 15423 + }, + { + "epoch": 1.94, + "grad_norm": 7.679769039154053, + "learning_rate": 7.0953436807095355e-06, + "loss": 0.4881, + "step": 15424 + }, + { + "epoch": 1.94, + "grad_norm": 8.879209518432617, + "learning_rate": 7.0945069656528474e-06, + "loss": 1.7926, + "step": 15425 + }, + { + "epoch": 1.94, + "grad_norm": 16.050445556640625, + "learning_rate": 7.093670250596159e-06, + "loss": 1.0691, + "step": 15426 + }, + { + "epoch": 1.94, + "grad_norm": 12.42928695678711, + "learning_rate": 7.092833535539473e-06, + "loss": 0.7106, + "step": 15427 + }, + { + "epoch": 1.94, + "grad_norm": 13.413458824157715, + "learning_rate": 7.091996820482785e-06, + "loss": 0.751, + "step": 15428 + }, + { + "epoch": 1.94, + "grad_norm": 6.810462474822998, + "learning_rate": 7.091160105426098e-06, + "loss": 0.3828, + "step": 15429 + }, + { + "epoch": 1.94, + "grad_norm": 30.17489242553711, + "learning_rate": 7.090323390369411e-06, + "loss": 1.5593, + "step": 15430 + }, + { + "epoch": 1.94, + "grad_norm": 86.79495239257812, + "learning_rate": 7.0894866753127226e-06, + "loss": 0.6664, + "step": 15431 + }, + { + "epoch": 1.94, + "grad_norm": 20.300350189208984, + "learning_rate": 7.088649960256035e-06, + "loss": 0.7502, + "step": 15432 + }, + { + "epoch": 1.94, + "grad_norm": 17.606658935546875, + "learning_rate": 7.087813245199347e-06, + "loss": 1.5995, + "step": 15433 + }, + { + "epoch": 1.94, + "grad_norm": 21.617136001586914, + "learning_rate": 7.086976530142661e-06, + "loss": 1.7476, + "step": 15434 + }, + { + "epoch": 1.94, + "grad_norm": 22.607784271240234, + "learning_rate": 7.086139815085973e-06, + "loss": 0.8055, + "step": 15435 + }, + { + "epoch": 1.94, + "grad_norm": 33.39226531982422, + "learning_rate": 7.085303100029286e-06, + "loss": 1.6133, + "step": 15436 + }, + { + "epoch": 1.94, + "grad_norm": 78.84778594970703, + "learning_rate": 7.084466384972598e-06, + "loss": 3.1536, + "step": 15437 + }, + { + "epoch": 1.94, + "grad_norm": 11.63171100616455, + "learning_rate": 7.0836296699159105e-06, + "loss": 0.9092, + "step": 15438 + }, + { + "epoch": 1.94, + "grad_norm": 26.426563262939453, + "learning_rate": 7.082792954859223e-06, + "loss": 0.9172, + "step": 15439 + }, + { + "epoch": 1.94, + "grad_norm": 5.196547031402588, + "learning_rate": 7.081956239802535e-06, + "loss": 0.6071, + "step": 15440 + }, + { + "epoch": 1.94, + "grad_norm": 5.708215713500977, + "learning_rate": 7.081119524745849e-06, + "loss": 1.3831, + "step": 15441 + }, + { + "epoch": 1.94, + "grad_norm": 8.377507209777832, + "learning_rate": 7.080282809689161e-06, + "loss": 0.7, + "step": 15442 + }, + { + "epoch": 1.94, + "grad_norm": 27.26991844177246, + "learning_rate": 7.079446094632474e-06, + "loss": 0.5514, + "step": 15443 + }, + { + "epoch": 1.94, + "grad_norm": 10.7058744430542, + "learning_rate": 7.078609379575786e-06, + "loss": 0.8448, + "step": 15444 + }, + { + "epoch": 1.94, + "grad_norm": 27.420000076293945, + "learning_rate": 7.077772664519098e-06, + "loss": 1.2417, + "step": 15445 + }, + { + "epoch": 1.94, + "grad_norm": 7.791304588317871, + "learning_rate": 7.076935949462411e-06, + "loss": 0.7116, + "step": 15446 + }, + { + "epoch": 1.94, + "grad_norm": 8.022967338562012, + "learning_rate": 7.076099234405723e-06, + "loss": 0.8989, + "step": 15447 + }, + { + "epoch": 1.94, + "grad_norm": 17.566761016845703, + "learning_rate": 7.075262519349037e-06, + "loss": 0.6213, + "step": 15448 + }, + { + "epoch": 1.94, + "grad_norm": 13.799229621887207, + "learning_rate": 7.074425804292349e-06, + "loss": 1.849, + "step": 15449 + }, + { + "epoch": 1.94, + "grad_norm": 184.754150390625, + "learning_rate": 7.0735890892356616e-06, + "loss": 1.5852, + "step": 15450 + }, + { + "epoch": 1.94, + "grad_norm": 2.525916337966919, + "learning_rate": 7.0727523741789735e-06, + "loss": 0.0662, + "step": 15451 + }, + { + "epoch": 1.94, + "grad_norm": 11.680980682373047, + "learning_rate": 7.071915659122286e-06, + "loss": 0.7585, + "step": 15452 + }, + { + "epoch": 1.94, + "grad_norm": 89.53353881835938, + "learning_rate": 7.071078944065599e-06, + "loss": 2.3426, + "step": 15453 + }, + { + "epoch": 1.94, + "grad_norm": 18.292312622070312, + "learning_rate": 7.070242229008911e-06, + "loss": 1.1013, + "step": 15454 + }, + { + "epoch": 1.94, + "grad_norm": 11.780877113342285, + "learning_rate": 7.069405513952225e-06, + "loss": 0.9168, + "step": 15455 + }, + { + "epoch": 1.94, + "grad_norm": 5.118886470794678, + "learning_rate": 7.068568798895537e-06, + "loss": 0.4283, + "step": 15456 + }, + { + "epoch": 1.94, + "grad_norm": 11.497488975524902, + "learning_rate": 7.0677320838388495e-06, + "loss": 0.3166, + "step": 15457 + }, + { + "epoch": 1.94, + "grad_norm": 11.876434326171875, + "learning_rate": 7.0668953687821614e-06, + "loss": 0.6062, + "step": 15458 + }, + { + "epoch": 1.94, + "grad_norm": 7.251729488372803, + "learning_rate": 7.066058653725474e-06, + "loss": 0.2288, + "step": 15459 + }, + { + "epoch": 1.94, + "grad_norm": 16.087635040283203, + "learning_rate": 7.065221938668787e-06, + "loss": 1.1124, + "step": 15460 + }, + { + "epoch": 1.94, + "grad_norm": 13.209246635437012, + "learning_rate": 7.064385223612099e-06, + "loss": 1.7922, + "step": 15461 + }, + { + "epoch": 1.94, + "grad_norm": 12.305509567260742, + "learning_rate": 7.063548508555413e-06, + "loss": 0.9409, + "step": 15462 + }, + { + "epoch": 1.94, + "grad_norm": 7.604883193969727, + "learning_rate": 7.062711793498725e-06, + "loss": 0.244, + "step": 15463 + }, + { + "epoch": 1.94, + "grad_norm": 60.780635833740234, + "learning_rate": 7.061875078442037e-06, + "loss": 0.7982, + "step": 15464 + }, + { + "epoch": 1.94, + "grad_norm": 23.57979965209961, + "learning_rate": 7.061038363385349e-06, + "loss": 1.2766, + "step": 15465 + }, + { + "epoch": 1.94, + "grad_norm": 11.719807624816895, + "learning_rate": 7.060201648328662e-06, + "loss": 1.5545, + "step": 15466 + }, + { + "epoch": 1.94, + "grad_norm": 9.688767433166504, + "learning_rate": 7.059364933271975e-06, + "loss": 0.461, + "step": 15467 + }, + { + "epoch": 1.94, + "grad_norm": 5.736171245574951, + "learning_rate": 7.058528218215287e-06, + "loss": 0.9425, + "step": 15468 + }, + { + "epoch": 1.94, + "grad_norm": 11.748924255371094, + "learning_rate": 7.0576915031586005e-06, + "loss": 2.2647, + "step": 15469 + }, + { + "epoch": 1.94, + "grad_norm": 16.284366607666016, + "learning_rate": 7.0568547881019125e-06, + "loss": 1.0216, + "step": 15470 + }, + { + "epoch": 1.94, + "grad_norm": 10.771014213562012, + "learning_rate": 7.056018073045225e-06, + "loss": 0.3961, + "step": 15471 + }, + { + "epoch": 1.94, + "grad_norm": 23.656824111938477, + "learning_rate": 7.055181357988537e-06, + "loss": 1.489, + "step": 15472 + }, + { + "epoch": 1.94, + "grad_norm": 9.67492389678955, + "learning_rate": 7.054344642931849e-06, + "loss": 1.5355, + "step": 15473 + }, + { + "epoch": 1.94, + "grad_norm": 22.204484939575195, + "learning_rate": 7.053507927875163e-06, + "loss": 1.4844, + "step": 15474 + }, + { + "epoch": 1.94, + "grad_norm": 21.31995391845703, + "learning_rate": 7.052671212818475e-06, + "loss": 1.2159, + "step": 15475 + }, + { + "epoch": 1.94, + "grad_norm": 6.922398090362549, + "learning_rate": 7.0518344977617885e-06, + "loss": 0.5267, + "step": 15476 + }, + { + "epoch": 1.94, + "grad_norm": 22.273216247558594, + "learning_rate": 7.0509977827051e-06, + "loss": 1.3283, + "step": 15477 + }, + { + "epoch": 1.94, + "grad_norm": 17.06501579284668, + "learning_rate": 7.050161067648413e-06, + "loss": 0.7504, + "step": 15478 + }, + { + "epoch": 1.94, + "grad_norm": 8.22417163848877, + "learning_rate": 7.049324352591725e-06, + "loss": 0.4198, + "step": 15479 + }, + { + "epoch": 1.94, + "grad_norm": 10.979037284851074, + "learning_rate": 7.048487637535037e-06, + "loss": 0.4843, + "step": 15480 + }, + { + "epoch": 1.94, + "grad_norm": 165.80557250976562, + "learning_rate": 7.047650922478351e-06, + "loss": 0.6725, + "step": 15481 + }, + { + "epoch": 1.94, + "grad_norm": 10.835333824157715, + "learning_rate": 7.046814207421663e-06, + "loss": 0.8138, + "step": 15482 + }, + { + "epoch": 1.94, + "grad_norm": 13.948822021484375, + "learning_rate": 7.045977492364976e-06, + "loss": 0.1694, + "step": 15483 + }, + { + "epoch": 1.94, + "grad_norm": 13.02929401397705, + "learning_rate": 7.045140777308288e-06, + "loss": 0.6899, + "step": 15484 + }, + { + "epoch": 1.94, + "grad_norm": 5.320096492767334, + "learning_rate": 7.044304062251601e-06, + "loss": 0.1951, + "step": 15485 + }, + { + "epoch": 1.94, + "grad_norm": 15.157958984375, + "learning_rate": 7.043467347194913e-06, + "loss": 1.6232, + "step": 15486 + }, + { + "epoch": 1.94, + "grad_norm": 23.764209747314453, + "learning_rate": 7.042630632138225e-06, + "loss": 2.2997, + "step": 15487 + }, + { + "epoch": 1.94, + "grad_norm": 12.446185111999512, + "learning_rate": 7.041793917081539e-06, + "loss": 0.6686, + "step": 15488 + }, + { + "epoch": 1.94, + "grad_norm": 9.41409969329834, + "learning_rate": 7.040957202024851e-06, + "loss": 0.9177, + "step": 15489 + }, + { + "epoch": 1.94, + "grad_norm": 6.236893653869629, + "learning_rate": 7.0401204869681635e-06, + "loss": 0.255, + "step": 15490 + }, + { + "epoch": 1.94, + "grad_norm": 2.5406250953674316, + "learning_rate": 7.039283771911476e-06, + "loss": 0.1584, + "step": 15491 + }, + { + "epoch": 1.94, + "grad_norm": 9.143668174743652, + "learning_rate": 7.038447056854789e-06, + "loss": 0.6818, + "step": 15492 + }, + { + "epoch": 1.94, + "grad_norm": 13.777385711669922, + "learning_rate": 7.037610341798101e-06, + "loss": 1.0515, + "step": 15493 + }, + { + "epoch": 1.94, + "grad_norm": 6.954865455627441, + "learning_rate": 7.036773626741413e-06, + "loss": 0.4036, + "step": 15494 + }, + { + "epoch": 1.94, + "grad_norm": 14.621844291687012, + "learning_rate": 7.035936911684727e-06, + "loss": 1.7383, + "step": 15495 + }, + { + "epoch": 1.94, + "grad_norm": 19.72504234313965, + "learning_rate": 7.0351001966280386e-06, + "loss": 0.2982, + "step": 15496 + }, + { + "epoch": 1.94, + "grad_norm": 10.424211502075195, + "learning_rate": 7.034263481571351e-06, + "loss": 1.3376, + "step": 15497 + }, + { + "epoch": 1.94, + "grad_norm": 51.879905700683594, + "learning_rate": 7.033426766514664e-06, + "loss": 0.5933, + "step": 15498 + }, + { + "epoch": 1.95, + "grad_norm": 15.023542404174805, + "learning_rate": 7.032590051457977e-06, + "loss": 2.1166, + "step": 15499 + }, + { + "epoch": 1.95, + "grad_norm": 25.0118350982666, + "learning_rate": 7.031753336401289e-06, + "loss": 0.8506, + "step": 15500 + }, + { + "epoch": 1.95, + "grad_norm": 20.47811508178711, + "learning_rate": 7.030916621344601e-06, + "loss": 1.2114, + "step": 15501 + }, + { + "epoch": 1.95, + "grad_norm": 9.80174446105957, + "learning_rate": 7.0300799062879145e-06, + "loss": 0.302, + "step": 15502 + }, + { + "epoch": 1.95, + "grad_norm": 67.72651672363281, + "learning_rate": 7.0292431912312265e-06, + "loss": 0.9023, + "step": 15503 + }, + { + "epoch": 1.95, + "grad_norm": 17.172203063964844, + "learning_rate": 7.028406476174539e-06, + "loss": 0.9231, + "step": 15504 + }, + { + "epoch": 1.95, + "grad_norm": 3.053865432739258, + "learning_rate": 7.027569761117852e-06, + "loss": 0.8876, + "step": 15505 + }, + { + "epoch": 1.95, + "grad_norm": 19.792144775390625, + "learning_rate": 7.026733046061165e-06, + "loss": 2.6251, + "step": 15506 + }, + { + "epoch": 1.95, + "grad_norm": 13.75473403930664, + "learning_rate": 7.025896331004477e-06, + "loss": 1.8659, + "step": 15507 + }, + { + "epoch": 1.95, + "grad_norm": 10.364108085632324, + "learning_rate": 7.025059615947789e-06, + "loss": 1.7935, + "step": 15508 + }, + { + "epoch": 1.95, + "grad_norm": 5.748884201049805, + "learning_rate": 7.0242229008911025e-06, + "loss": 1.4596, + "step": 15509 + }, + { + "epoch": 1.95, + "grad_norm": 4.112232208251953, + "learning_rate": 7.023386185834414e-06, + "loss": 0.2395, + "step": 15510 + }, + { + "epoch": 1.95, + "grad_norm": 6.716560363769531, + "learning_rate": 7.022549470777727e-06, + "loss": 0.4791, + "step": 15511 + }, + { + "epoch": 1.95, + "grad_norm": 22.984079360961914, + "learning_rate": 7.02171275572104e-06, + "loss": 1.3597, + "step": 15512 + }, + { + "epoch": 1.95, + "grad_norm": 17.0230655670166, + "learning_rate": 7.020876040664353e-06, + "loss": 1.5377, + "step": 15513 + }, + { + "epoch": 1.95, + "grad_norm": 3.351734161376953, + "learning_rate": 7.020039325607665e-06, + "loss": 0.2617, + "step": 15514 + }, + { + "epoch": 1.95, + "grad_norm": 242.02784729003906, + "learning_rate": 7.019202610550977e-06, + "loss": 2.3834, + "step": 15515 + }, + { + "epoch": 1.95, + "grad_norm": 4.806171894073486, + "learning_rate": 7.01836589549429e-06, + "loss": 0.2289, + "step": 15516 + }, + { + "epoch": 1.95, + "grad_norm": 42.979461669921875, + "learning_rate": 7.017529180437602e-06, + "loss": 1.302, + "step": 15517 + }, + { + "epoch": 1.95, + "grad_norm": 10.605782508850098, + "learning_rate": 7.016692465380915e-06, + "loss": 0.9935, + "step": 15518 + }, + { + "epoch": 1.95, + "grad_norm": 5.702014446258545, + "learning_rate": 7.015855750324227e-06, + "loss": 0.3763, + "step": 15519 + }, + { + "epoch": 1.95, + "grad_norm": 9.059732437133789, + "learning_rate": 7.015019035267541e-06, + "loss": 0.7708, + "step": 15520 + }, + { + "epoch": 1.95, + "grad_norm": 75.51557922363281, + "learning_rate": 7.014182320210853e-06, + "loss": 1.0951, + "step": 15521 + }, + { + "epoch": 1.95, + "grad_norm": 14.990911483764648, + "learning_rate": 7.013345605154165e-06, + "loss": 0.8757, + "step": 15522 + }, + { + "epoch": 1.95, + "grad_norm": 26.17129898071289, + "learning_rate": 7.012508890097478e-06, + "loss": 1.2145, + "step": 15523 + }, + { + "epoch": 1.95, + "grad_norm": 9.07250690460205, + "learning_rate": 7.01167217504079e-06, + "loss": 0.9486, + "step": 15524 + }, + { + "epoch": 1.95, + "grad_norm": 95.37599182128906, + "learning_rate": 7.010835459984103e-06, + "loss": 3.1057, + "step": 15525 + }, + { + "epoch": 1.95, + "grad_norm": 67.44821166992188, + "learning_rate": 7.009998744927415e-06, + "loss": 1.2477, + "step": 15526 + }, + { + "epoch": 1.95, + "grad_norm": 16.430702209472656, + "learning_rate": 7.009162029870729e-06, + "loss": 1.0081, + "step": 15527 + }, + { + "epoch": 1.95, + "grad_norm": 11.817269325256348, + "learning_rate": 7.008325314814041e-06, + "loss": 1.8068, + "step": 15528 + }, + { + "epoch": 1.95, + "grad_norm": 2.9634690284729004, + "learning_rate": 7.0074885997573526e-06, + "loss": 0.1884, + "step": 15529 + }, + { + "epoch": 1.95, + "grad_norm": 5.671106338500977, + "learning_rate": 7.006651884700666e-06, + "loss": 0.5094, + "step": 15530 + }, + { + "epoch": 1.95, + "grad_norm": 6.831871509552002, + "learning_rate": 7.005815169643978e-06, + "loss": 0.4738, + "step": 15531 + }, + { + "epoch": 1.95, + "grad_norm": 12.323295593261719, + "learning_rate": 7.004978454587291e-06, + "loss": 0.2962, + "step": 15532 + }, + { + "epoch": 1.95, + "grad_norm": 10.955921173095703, + "learning_rate": 7.004141739530603e-06, + "loss": 0.6173, + "step": 15533 + }, + { + "epoch": 1.95, + "grad_norm": 25.807981491088867, + "learning_rate": 7.0033050244739166e-06, + "loss": 1.2629, + "step": 15534 + }, + { + "epoch": 1.95, + "grad_norm": 33.78031921386719, + "learning_rate": 7.0024683094172285e-06, + "loss": 2.3431, + "step": 15535 + }, + { + "epoch": 1.95, + "grad_norm": 24.45186424255371, + "learning_rate": 7.0016315943605405e-06, + "loss": 0.9759, + "step": 15536 + }, + { + "epoch": 1.95, + "grad_norm": 42.53618240356445, + "learning_rate": 7.000794879303854e-06, + "loss": 1.7634, + "step": 15537 + }, + { + "epoch": 1.95, + "grad_norm": 7.286814212799072, + "learning_rate": 6.999958164247166e-06, + "loss": 0.8893, + "step": 15538 + }, + { + "epoch": 1.95, + "grad_norm": 14.286301612854004, + "learning_rate": 6.999121449190479e-06, + "loss": 0.481, + "step": 15539 + }, + { + "epoch": 1.95, + "grad_norm": 21.10111427307129, + "learning_rate": 6.998284734133791e-06, + "loss": 0.7561, + "step": 15540 + }, + { + "epoch": 1.95, + "grad_norm": 38.04764175415039, + "learning_rate": 6.9974480190771045e-06, + "loss": 1.5845, + "step": 15541 + }, + { + "epoch": 1.95, + "grad_norm": 25.33977508544922, + "learning_rate": 6.9966113040204164e-06, + "loss": 1.8679, + "step": 15542 + }, + { + "epoch": 1.95, + "grad_norm": 26.45148468017578, + "learning_rate": 6.995774588963728e-06, + "loss": 1.0318, + "step": 15543 + }, + { + "epoch": 1.95, + "grad_norm": 16.335588455200195, + "learning_rate": 6.994937873907042e-06, + "loss": 0.973, + "step": 15544 + }, + { + "epoch": 1.95, + "grad_norm": 12.946664810180664, + "learning_rate": 6.994101158850354e-06, + "loss": 1.432, + "step": 15545 + }, + { + "epoch": 1.95, + "grad_norm": 15.203397750854492, + "learning_rate": 6.993264443793667e-06, + "loss": 0.7775, + "step": 15546 + }, + { + "epoch": 1.95, + "grad_norm": 17.209043502807617, + "learning_rate": 6.992427728736979e-06, + "loss": 1.6214, + "step": 15547 + }, + { + "epoch": 1.95, + "grad_norm": 4.763405799865723, + "learning_rate": 6.991591013680292e-06, + "loss": 0.0663, + "step": 15548 + }, + { + "epoch": 1.95, + "grad_norm": 4.411322593688965, + "learning_rate": 6.990754298623604e-06, + "loss": 0.2267, + "step": 15549 + }, + { + "epoch": 1.95, + "grad_norm": 31.33431053161621, + "learning_rate": 6.989917583566916e-06, + "loss": 2.5391, + "step": 15550 + }, + { + "epoch": 1.95, + "grad_norm": 9.03928279876709, + "learning_rate": 6.98908086851023e-06, + "loss": 0.4988, + "step": 15551 + }, + { + "epoch": 1.95, + "grad_norm": 13.892117500305176, + "learning_rate": 6.988244153453542e-06, + "loss": 1.6088, + "step": 15552 + }, + { + "epoch": 1.95, + "grad_norm": 333.7330627441406, + "learning_rate": 6.987407438396855e-06, + "loss": 1.2483, + "step": 15553 + }, + { + "epoch": 1.95, + "grad_norm": 9.675161361694336, + "learning_rate": 6.986570723340167e-06, + "loss": 1.1111, + "step": 15554 + }, + { + "epoch": 1.95, + "grad_norm": 17.345584869384766, + "learning_rate": 6.98573400828348e-06, + "loss": 1.7745, + "step": 15555 + }, + { + "epoch": 1.95, + "grad_norm": 19.145296096801758, + "learning_rate": 6.984897293226792e-06, + "loss": 1.1672, + "step": 15556 + }, + { + "epoch": 1.95, + "grad_norm": 27.76471519470215, + "learning_rate": 6.984060578170104e-06, + "loss": 0.8067, + "step": 15557 + }, + { + "epoch": 1.95, + "grad_norm": 14.955503463745117, + "learning_rate": 6.983223863113418e-06, + "loss": 1.2938, + "step": 15558 + }, + { + "epoch": 1.95, + "grad_norm": 10.937143325805664, + "learning_rate": 6.98238714805673e-06, + "loss": 0.2304, + "step": 15559 + }, + { + "epoch": 1.95, + "grad_norm": 16.57361602783203, + "learning_rate": 6.981550433000043e-06, + "loss": 1.6325, + "step": 15560 + }, + { + "epoch": 1.95, + "grad_norm": 139.50778198242188, + "learning_rate": 6.980713717943355e-06, + "loss": 0.9943, + "step": 15561 + }, + { + "epoch": 1.95, + "grad_norm": 8.616129875183105, + "learning_rate": 6.979877002886668e-06, + "loss": 0.3728, + "step": 15562 + }, + { + "epoch": 1.95, + "grad_norm": 7.047756195068359, + "learning_rate": 6.97904028782998e-06, + "loss": 0.9883, + "step": 15563 + }, + { + "epoch": 1.95, + "grad_norm": 15.066789627075195, + "learning_rate": 6.978203572773292e-06, + "loss": 0.3011, + "step": 15564 + }, + { + "epoch": 1.95, + "grad_norm": 9.801277160644531, + "learning_rate": 6.977366857716605e-06, + "loss": 1.6511, + "step": 15565 + }, + { + "epoch": 1.95, + "grad_norm": 32.564884185791016, + "learning_rate": 6.976530142659918e-06, + "loss": 1.3165, + "step": 15566 + }, + { + "epoch": 1.95, + "grad_norm": 47.5469970703125, + "learning_rate": 6.9756934276032306e-06, + "loss": 2.0164, + "step": 15567 + }, + { + "epoch": 1.95, + "grad_norm": 21.765548706054688, + "learning_rate": 6.9748567125465425e-06, + "loss": 1.201, + "step": 15568 + }, + { + "epoch": 1.95, + "grad_norm": 27.66904067993164, + "learning_rate": 6.974019997489856e-06, + "loss": 1.2243, + "step": 15569 + }, + { + "epoch": 1.95, + "grad_norm": 16.696565628051758, + "learning_rate": 6.973183282433168e-06, + "loss": 2.2312, + "step": 15570 + }, + { + "epoch": 1.95, + "grad_norm": 26.649904251098633, + "learning_rate": 6.97234656737648e-06, + "loss": 1.092, + "step": 15571 + }, + { + "epoch": 1.95, + "grad_norm": 24.76673126220703, + "learning_rate": 6.971509852319793e-06, + "loss": 2.7237, + "step": 15572 + }, + { + "epoch": 1.95, + "grad_norm": 14.912168502807617, + "learning_rate": 6.970673137263106e-06, + "loss": 0.9972, + "step": 15573 + }, + { + "epoch": 1.95, + "grad_norm": 14.813760757446289, + "learning_rate": 6.9698364222064185e-06, + "loss": 0.5662, + "step": 15574 + }, + { + "epoch": 1.95, + "grad_norm": 19.716306686401367, + "learning_rate": 6.9689997071497304e-06, + "loss": 1.6188, + "step": 15575 + }, + { + "epoch": 1.95, + "grad_norm": 11.275886535644531, + "learning_rate": 6.968162992093044e-06, + "loss": 1.7687, + "step": 15576 + }, + { + "epoch": 1.95, + "grad_norm": 122.64093017578125, + "learning_rate": 6.967326277036356e-06, + "loss": 3.0464, + "step": 15577 + }, + { + "epoch": 1.96, + "grad_norm": 17.656211853027344, + "learning_rate": 6.966489561979668e-06, + "loss": 0.9186, + "step": 15578 + }, + { + "epoch": 1.96, + "grad_norm": 21.526639938354492, + "learning_rate": 6.965652846922981e-06, + "loss": 0.6122, + "step": 15579 + }, + { + "epoch": 1.96, + "grad_norm": 15.666875839233398, + "learning_rate": 6.964816131866294e-06, + "loss": 0.8211, + "step": 15580 + }, + { + "epoch": 1.96, + "grad_norm": 6.567947864532471, + "learning_rate": 6.963979416809606e-06, + "loss": 0.6006, + "step": 15581 + }, + { + "epoch": 1.96, + "grad_norm": 11.897137641906738, + "learning_rate": 6.963142701752918e-06, + "loss": 0.2065, + "step": 15582 + }, + { + "epoch": 1.96, + "grad_norm": 35.2044677734375, + "learning_rate": 6.962305986696232e-06, + "loss": 0.8952, + "step": 15583 + }, + { + "epoch": 1.96, + "grad_norm": 36.7293815612793, + "learning_rate": 6.961469271639544e-06, + "loss": 0.5825, + "step": 15584 + }, + { + "epoch": 1.96, + "grad_norm": 4.57098913192749, + "learning_rate": 6.960632556582856e-06, + "loss": 0.2379, + "step": 15585 + }, + { + "epoch": 1.96, + "grad_norm": 12.477896690368652, + "learning_rate": 6.959795841526169e-06, + "loss": 1.7507, + "step": 15586 + }, + { + "epoch": 1.96, + "grad_norm": 25.72513771057129, + "learning_rate": 6.9589591264694815e-06, + "loss": 0.7155, + "step": 15587 + }, + { + "epoch": 1.96, + "grad_norm": 12.6240234375, + "learning_rate": 6.958122411412794e-06, + "loss": 0.4072, + "step": 15588 + }, + { + "epoch": 1.96, + "grad_norm": 10.934189796447754, + "learning_rate": 6.957285696356106e-06, + "loss": 0.5685, + "step": 15589 + }, + { + "epoch": 1.96, + "grad_norm": 169.14804077148438, + "learning_rate": 6.95644898129942e-06, + "loss": 2.121, + "step": 15590 + }, + { + "epoch": 1.96, + "grad_norm": 20.99358367919922, + "learning_rate": 6.955612266242732e-06, + "loss": 1.0553, + "step": 15591 + }, + { + "epoch": 1.96, + "grad_norm": 7.8657612800598145, + "learning_rate": 6.954775551186044e-06, + "loss": 0.6709, + "step": 15592 + }, + { + "epoch": 1.96, + "grad_norm": 17.866134643554688, + "learning_rate": 6.953938836129357e-06, + "loss": 1.0087, + "step": 15593 + }, + { + "epoch": 1.96, + "grad_norm": 45.2334098815918, + "learning_rate": 6.9531021210726694e-06, + "loss": 2.2569, + "step": 15594 + }, + { + "epoch": 1.96, + "grad_norm": 20.872684478759766, + "learning_rate": 6.952265406015982e-06, + "loss": 1.8624, + "step": 15595 + }, + { + "epoch": 1.96, + "grad_norm": 8.320897102355957, + "learning_rate": 6.951428690959294e-06, + "loss": 0.7625, + "step": 15596 + }, + { + "epoch": 1.96, + "grad_norm": 21.983694076538086, + "learning_rate": 6.950591975902608e-06, + "loss": 1.1616, + "step": 15597 + }, + { + "epoch": 1.96, + "grad_norm": 31.263404846191406, + "learning_rate": 6.94975526084592e-06, + "loss": 0.454, + "step": 15598 + }, + { + "epoch": 1.96, + "grad_norm": 22.783296585083008, + "learning_rate": 6.948918545789232e-06, + "loss": 0.932, + "step": 15599 + }, + { + "epoch": 1.96, + "grad_norm": 45.86775207519531, + "learning_rate": 6.9480818307325446e-06, + "loss": 1.7207, + "step": 15600 + }, + { + "epoch": 1.96, + "eval_loss": 0.08063507825136185, + "eval_runtime": 96.7731, + "eval_samples_per_second": 36.601, + "eval_steps_per_second": 36.601, + "step": 15600 + }, + { + "epoch": 1.96, + "grad_norm": 55.06128692626953, + "learning_rate": 6.9472451156758565e-06, + "loss": 2.4919, + "step": 15601 + }, + { + "epoch": 1.96, + "grad_norm": 12.562663078308105, + "learning_rate": 6.94640840061917e-06, + "loss": 0.6102, + "step": 15602 + }, + { + "epoch": 1.96, + "grad_norm": 18.10681915283203, + "learning_rate": 6.945571685562482e-06, + "loss": 0.9363, + "step": 15603 + }, + { + "epoch": 1.96, + "grad_norm": 10.809121131896973, + "learning_rate": 6.944734970505796e-06, + "loss": 0.9123, + "step": 15604 + }, + { + "epoch": 1.96, + "grad_norm": 6.625796318054199, + "learning_rate": 6.943898255449108e-06, + "loss": 1.3045, + "step": 15605 + }, + { + "epoch": 1.96, + "grad_norm": 7.727932453155518, + "learning_rate": 6.94306154039242e-06, + "loss": 1.7243, + "step": 15606 + }, + { + "epoch": 1.96, + "grad_norm": 57.7634391784668, + "learning_rate": 6.9422248253357325e-06, + "loss": 4.2435, + "step": 15607 + }, + { + "epoch": 1.96, + "grad_norm": 8.309063911437988, + "learning_rate": 6.9413881102790444e-06, + "loss": 0.6944, + "step": 15608 + }, + { + "epoch": 1.96, + "grad_norm": 25.379261016845703, + "learning_rate": 6.940551395222358e-06, + "loss": 1.1329, + "step": 15609 + }, + { + "epoch": 1.96, + "grad_norm": 13.84262752532959, + "learning_rate": 6.93971468016567e-06, + "loss": 1.0546, + "step": 15610 + }, + { + "epoch": 1.96, + "grad_norm": 3.004401922225952, + "learning_rate": 6.938877965108982e-06, + "loss": 0.1816, + "step": 15611 + }, + { + "epoch": 1.96, + "grad_norm": 7.947330951690674, + "learning_rate": 6.938041250052296e-06, + "loss": 0.3381, + "step": 15612 + }, + { + "epoch": 1.96, + "grad_norm": 15.322650909423828, + "learning_rate": 6.937204534995608e-06, + "loss": 0.3763, + "step": 15613 + }, + { + "epoch": 1.96, + "grad_norm": 26.744220733642578, + "learning_rate": 6.93636781993892e-06, + "loss": 1.0411, + "step": 15614 + }, + { + "epoch": 1.96, + "grad_norm": 35.896907806396484, + "learning_rate": 6.935531104882232e-06, + "loss": 3.2921, + "step": 15615 + }, + { + "epoch": 1.96, + "grad_norm": 17.5107479095459, + "learning_rate": 6.934694389825546e-06, + "loss": 2.53, + "step": 15616 + }, + { + "epoch": 1.96, + "grad_norm": 13.85987377166748, + "learning_rate": 6.933857674768858e-06, + "loss": 1.5989, + "step": 15617 + }, + { + "epoch": 1.96, + "grad_norm": 26.768171310424805, + "learning_rate": 6.93302095971217e-06, + "loss": 3.0122, + "step": 15618 + }, + { + "epoch": 1.96, + "grad_norm": 4.744687080383301, + "learning_rate": 6.9321842446554835e-06, + "loss": 0.2583, + "step": 15619 + }, + { + "epoch": 1.96, + "grad_norm": 11.324332237243652, + "learning_rate": 6.9313475295987955e-06, + "loss": 1.3009, + "step": 15620 + }, + { + "epoch": 1.96, + "grad_norm": 4.5160698890686035, + "learning_rate": 6.930510814542108e-06, + "loss": 0.26, + "step": 15621 + }, + { + "epoch": 1.96, + "grad_norm": 9.876502990722656, + "learning_rate": 6.92967409948542e-06, + "loss": 2.2473, + "step": 15622 + }, + { + "epoch": 1.96, + "grad_norm": 22.673030853271484, + "learning_rate": 6.928837384428734e-06, + "loss": 1.5846, + "step": 15623 + }, + { + "epoch": 1.96, + "grad_norm": 5.811694622039795, + "learning_rate": 6.928000669372046e-06, + "loss": 0.3244, + "step": 15624 + }, + { + "epoch": 1.96, + "grad_norm": 18.304227828979492, + "learning_rate": 6.927163954315358e-06, + "loss": 0.5696, + "step": 15625 + }, + { + "epoch": 1.96, + "grad_norm": 20.676616668701172, + "learning_rate": 6.9263272392586715e-06, + "loss": 1.7732, + "step": 15626 + }, + { + "epoch": 1.96, + "grad_norm": 6.337315559387207, + "learning_rate": 6.925490524201983e-06, + "loss": 1.3696, + "step": 15627 + }, + { + "epoch": 1.96, + "grad_norm": 16.525117874145508, + "learning_rate": 6.924653809145296e-06, + "loss": 1.1781, + "step": 15628 + }, + { + "epoch": 1.96, + "grad_norm": 39.423526763916016, + "learning_rate": 6.923817094088608e-06, + "loss": 1.7619, + "step": 15629 + }, + { + "epoch": 1.96, + "grad_norm": 15.524224281311035, + "learning_rate": 6.922980379031922e-06, + "loss": 1.4073, + "step": 15630 + }, + { + "epoch": 1.96, + "grad_norm": 10.538905143737793, + "learning_rate": 6.922143663975234e-06, + "loss": 1.2483, + "step": 15631 + }, + { + "epoch": 1.96, + "grad_norm": 18.400815963745117, + "learning_rate": 6.921306948918546e-06, + "loss": 0.7544, + "step": 15632 + }, + { + "epoch": 1.96, + "grad_norm": 88.06678771972656, + "learning_rate": 6.920470233861859e-06, + "loss": 2.307, + "step": 15633 + }, + { + "epoch": 1.96, + "grad_norm": 6.407633304595947, + "learning_rate": 6.919633518805171e-06, + "loss": 1.6535, + "step": 15634 + }, + { + "epoch": 1.96, + "grad_norm": 11.588862419128418, + "learning_rate": 6.918796803748484e-06, + "loss": 0.3925, + "step": 15635 + }, + { + "epoch": 1.96, + "grad_norm": 8.575936317443848, + "learning_rate": 6.917960088691796e-06, + "loss": 1.0875, + "step": 15636 + }, + { + "epoch": 1.96, + "grad_norm": 7.416591167449951, + "learning_rate": 6.91712337363511e-06, + "loss": 0.4413, + "step": 15637 + }, + { + "epoch": 1.96, + "grad_norm": 19.250289916992188, + "learning_rate": 6.916286658578422e-06, + "loss": 0.6748, + "step": 15638 + }, + { + "epoch": 1.96, + "grad_norm": 155.1161651611328, + "learning_rate": 6.915449943521734e-06, + "loss": 1.7139, + "step": 15639 + }, + { + "epoch": 1.96, + "grad_norm": 8.13044261932373, + "learning_rate": 6.914613228465047e-06, + "loss": 0.7617, + "step": 15640 + }, + { + "epoch": 1.96, + "grad_norm": 10.329883575439453, + "learning_rate": 6.913776513408359e-06, + "loss": 1.1411, + "step": 15641 + }, + { + "epoch": 1.96, + "grad_norm": 11.309494972229004, + "learning_rate": 6.912939798351672e-06, + "loss": 0.5356, + "step": 15642 + }, + { + "epoch": 1.96, + "grad_norm": 71.26300811767578, + "learning_rate": 6.912103083294984e-06, + "loss": 0.5755, + "step": 15643 + }, + { + "epoch": 1.96, + "grad_norm": 18.788625717163086, + "learning_rate": 6.911266368238298e-06, + "loss": 1.2436, + "step": 15644 + }, + { + "epoch": 1.96, + "grad_norm": 12.986571311950684, + "learning_rate": 6.91042965318161e-06, + "loss": 0.4995, + "step": 15645 + }, + { + "epoch": 1.96, + "grad_norm": 10.278299331665039, + "learning_rate": 6.9095929381249216e-06, + "loss": 1.0472, + "step": 15646 + }, + { + "epoch": 1.96, + "grad_norm": 9.618620872497559, + "learning_rate": 6.908756223068234e-06, + "loss": 0.1579, + "step": 15647 + }, + { + "epoch": 1.96, + "grad_norm": 19.59689712524414, + "learning_rate": 6.907919508011547e-06, + "loss": 1.0557, + "step": 15648 + }, + { + "epoch": 1.96, + "grad_norm": 7.157246112823486, + "learning_rate": 6.90708279295486e-06, + "loss": 0.4243, + "step": 15649 + }, + { + "epoch": 1.96, + "grad_norm": 12.754986763000488, + "learning_rate": 6.906246077898172e-06, + "loss": 0.4012, + "step": 15650 + }, + { + "epoch": 1.96, + "grad_norm": 33.26277542114258, + "learning_rate": 6.905409362841486e-06, + "loss": 0.9033, + "step": 15651 + }, + { + "epoch": 1.96, + "grad_norm": 22.752958297729492, + "learning_rate": 6.9045726477847975e-06, + "loss": 0.71, + "step": 15652 + }, + { + "epoch": 1.96, + "grad_norm": 10.783706665039062, + "learning_rate": 6.9037359327281095e-06, + "loss": 0.4343, + "step": 15653 + }, + { + "epoch": 1.96, + "grad_norm": 20.73919677734375, + "learning_rate": 6.902899217671422e-06, + "loss": 0.7173, + "step": 15654 + }, + { + "epoch": 1.96, + "grad_norm": 50.49082946777344, + "learning_rate": 6.902062502614735e-06, + "loss": 2.1645, + "step": 15655 + }, + { + "epoch": 1.96, + "grad_norm": 14.673543930053711, + "learning_rate": 6.901225787558048e-06, + "loss": 0.9353, + "step": 15656 + }, + { + "epoch": 1.96, + "grad_norm": 13.010211944580078, + "learning_rate": 6.90038907250136e-06, + "loss": 1.5859, + "step": 15657 + }, + { + "epoch": 1.97, + "grad_norm": 17.193904876708984, + "learning_rate": 6.8995523574446735e-06, + "loss": 0.8288, + "step": 15658 + }, + { + "epoch": 1.97, + "grad_norm": 22.396930694580078, + "learning_rate": 6.8987156423879855e-06, + "loss": 1.4121, + "step": 15659 + }, + { + "epoch": 1.97, + "grad_norm": 22.452795028686523, + "learning_rate": 6.897878927331297e-06, + "loss": 2.8046, + "step": 15660 + }, + { + "epoch": 1.97, + "grad_norm": 12.525009155273438, + "learning_rate": 6.89704221227461e-06, + "loss": 0.4185, + "step": 15661 + }, + { + "epoch": 1.97, + "grad_norm": 17.753753662109375, + "learning_rate": 6.896205497217923e-06, + "loss": 0.9865, + "step": 15662 + }, + { + "epoch": 1.97, + "grad_norm": 75.84577178955078, + "learning_rate": 6.895368782161236e-06, + "loss": 0.7734, + "step": 15663 + }, + { + "epoch": 1.97, + "grad_norm": 16.62454605102539, + "learning_rate": 6.894532067104548e-06, + "loss": 1.01, + "step": 15664 + }, + { + "epoch": 1.97, + "grad_norm": 6.9103569984436035, + "learning_rate": 6.893695352047861e-06, + "loss": 0.5124, + "step": 15665 + }, + { + "epoch": 1.97, + "grad_norm": 34.67379379272461, + "learning_rate": 6.892858636991173e-06, + "loss": 1.1893, + "step": 15666 + }, + { + "epoch": 1.97, + "grad_norm": 104.24536895751953, + "learning_rate": 6.892021921934485e-06, + "loss": 1.6502, + "step": 15667 + }, + { + "epoch": 1.97, + "grad_norm": 13.472212791442871, + "learning_rate": 6.891185206877798e-06, + "loss": 0.8939, + "step": 15668 + }, + { + "epoch": 1.97, + "grad_norm": 11.587250709533691, + "learning_rate": 6.890348491821111e-06, + "loss": 0.6514, + "step": 15669 + }, + { + "epoch": 1.97, + "grad_norm": 6.328396797180176, + "learning_rate": 6.889511776764424e-06, + "loss": 1.2435, + "step": 15670 + }, + { + "epoch": 1.97, + "grad_norm": 5.189692974090576, + "learning_rate": 6.888675061707736e-06, + "loss": 0.4131, + "step": 15671 + }, + { + "epoch": 1.97, + "grad_norm": 2.220883369445801, + "learning_rate": 6.887838346651049e-06, + "loss": 0.1098, + "step": 15672 + }, + { + "epoch": 1.97, + "grad_norm": 8.100465774536133, + "learning_rate": 6.887001631594361e-06, + "loss": 0.8029, + "step": 15673 + }, + { + "epoch": 1.97, + "grad_norm": 33.33985900878906, + "learning_rate": 6.886164916537673e-06, + "loss": 2.2551, + "step": 15674 + }, + { + "epoch": 1.97, + "grad_norm": 18.337589263916016, + "learning_rate": 6.885328201480986e-06, + "loss": 0.5092, + "step": 15675 + }, + { + "epoch": 1.97, + "grad_norm": 7.437816143035889, + "learning_rate": 6.884491486424299e-06, + "loss": 0.6743, + "step": 15676 + }, + { + "epoch": 1.97, + "grad_norm": 39.140586853027344, + "learning_rate": 6.883654771367612e-06, + "loss": 1.7836, + "step": 15677 + }, + { + "epoch": 1.97, + "grad_norm": 21.392349243164062, + "learning_rate": 6.882818056310924e-06, + "loss": 0.5786, + "step": 15678 + }, + { + "epoch": 1.97, + "grad_norm": 13.871636390686035, + "learning_rate": 6.881981341254237e-06, + "loss": 1.7352, + "step": 15679 + }, + { + "epoch": 1.97, + "grad_norm": 8.367588996887207, + "learning_rate": 6.881144626197549e-06, + "loss": 0.5013, + "step": 15680 + }, + { + "epoch": 1.97, + "grad_norm": 21.83454704284668, + "learning_rate": 6.880307911140861e-06, + "loss": 0.7217, + "step": 15681 + }, + { + "epoch": 1.97, + "grad_norm": 9.30942440032959, + "learning_rate": 6.879471196084174e-06, + "loss": 1.1288, + "step": 15682 + }, + { + "epoch": 1.97, + "grad_norm": 29.0285701751709, + "learning_rate": 6.878634481027486e-06, + "loss": 0.9151, + "step": 15683 + }, + { + "epoch": 1.97, + "grad_norm": 102.32872772216797, + "learning_rate": 6.8777977659707996e-06, + "loss": 2.2491, + "step": 15684 + }, + { + "epoch": 1.97, + "grad_norm": 12.969430923461914, + "learning_rate": 6.8769610509141115e-06, + "loss": 0.6064, + "step": 15685 + }, + { + "epoch": 1.97, + "grad_norm": 16.655744552612305, + "learning_rate": 6.876124335857425e-06, + "loss": 0.6201, + "step": 15686 + }, + { + "epoch": 1.97, + "grad_norm": 10.896740913391113, + "learning_rate": 6.875287620800737e-06, + "loss": 1.1829, + "step": 15687 + }, + { + "epoch": 1.97, + "grad_norm": 12.075094223022461, + "learning_rate": 6.874450905744049e-06, + "loss": 1.008, + "step": 15688 + }, + { + "epoch": 1.97, + "grad_norm": 10.580779075622559, + "learning_rate": 6.873614190687362e-06, + "loss": 1.104, + "step": 15689 + }, + { + "epoch": 1.97, + "grad_norm": 6.3209547996521, + "learning_rate": 6.872777475630674e-06, + "loss": 0.51, + "step": 15690 + }, + { + "epoch": 1.97, + "grad_norm": 1.2571667432785034, + "learning_rate": 6.8719407605739875e-06, + "loss": 0.0328, + "step": 15691 + }, + { + "epoch": 1.97, + "grad_norm": 14.199670791625977, + "learning_rate": 6.8711040455172994e-06, + "loss": 0.7279, + "step": 15692 + }, + { + "epoch": 1.97, + "grad_norm": 17.16489028930664, + "learning_rate": 6.870267330460613e-06, + "loss": 0.9808, + "step": 15693 + }, + { + "epoch": 1.97, + "grad_norm": 13.66822338104248, + "learning_rate": 6.869430615403925e-06, + "loss": 0.88, + "step": 15694 + }, + { + "epoch": 1.97, + "grad_norm": 17.813467025756836, + "learning_rate": 6.868593900347237e-06, + "loss": 1.3307, + "step": 15695 + }, + { + "epoch": 1.97, + "grad_norm": 8.129831314086914, + "learning_rate": 6.86775718529055e-06, + "loss": 2.2649, + "step": 15696 + }, + { + "epoch": 1.97, + "grad_norm": 7.627436637878418, + "learning_rate": 6.866920470233862e-06, + "loss": 0.3787, + "step": 15697 + }, + { + "epoch": 1.97, + "grad_norm": 23.57646369934082, + "learning_rate": 6.866083755177175e-06, + "loss": 1.2059, + "step": 15698 + }, + { + "epoch": 1.97, + "grad_norm": 23.84362030029297, + "learning_rate": 6.865247040120487e-06, + "loss": 0.3332, + "step": 15699 + }, + { + "epoch": 1.97, + "grad_norm": 3.6035149097442627, + "learning_rate": 6.8644103250638e-06, + "loss": 0.2212, + "step": 15700 + }, + { + "epoch": 1.97, + "grad_norm": 5.199368000030518, + "learning_rate": 6.863573610007113e-06, + "loss": 0.5067, + "step": 15701 + }, + { + "epoch": 1.97, + "grad_norm": 16.549041748046875, + "learning_rate": 6.862736894950425e-06, + "loss": 1.5231, + "step": 15702 + }, + { + "epoch": 1.97, + "grad_norm": 8.173349380493164, + "learning_rate": 6.861900179893738e-06, + "loss": 0.7989, + "step": 15703 + }, + { + "epoch": 1.97, + "grad_norm": 43.922786712646484, + "learning_rate": 6.86106346483705e-06, + "loss": 0.707, + "step": 15704 + }, + { + "epoch": 1.97, + "grad_norm": 105.08285522460938, + "learning_rate": 6.860226749780363e-06, + "loss": 1.2973, + "step": 15705 + }, + { + "epoch": 1.97, + "grad_norm": 23.7647647857666, + "learning_rate": 6.859390034723675e-06, + "loss": 0.8263, + "step": 15706 + }, + { + "epoch": 1.97, + "grad_norm": 17.59117889404297, + "learning_rate": 6.858553319666988e-06, + "loss": 0.6021, + "step": 15707 + }, + { + "epoch": 1.97, + "grad_norm": 38.81248092651367, + "learning_rate": 6.857716604610301e-06, + "loss": 1.6105, + "step": 15708 + }, + { + "epoch": 1.97, + "grad_norm": 20.170320510864258, + "learning_rate": 6.856879889553613e-06, + "loss": 1.2641, + "step": 15709 + }, + { + "epoch": 1.97, + "grad_norm": 36.23456954956055, + "learning_rate": 6.856043174496926e-06, + "loss": 1.802, + "step": 15710 + }, + { + "epoch": 1.97, + "grad_norm": 31.516807556152344, + "learning_rate": 6.855206459440238e-06, + "loss": 1.4221, + "step": 15711 + }, + { + "epoch": 1.97, + "grad_norm": 60.37530517578125, + "learning_rate": 6.854369744383551e-06, + "loss": 1.0894, + "step": 15712 + }, + { + "epoch": 1.97, + "grad_norm": 1.5870620012283325, + "learning_rate": 6.853533029326863e-06, + "loss": 0.0508, + "step": 15713 + }, + { + "epoch": 1.97, + "grad_norm": 8.285735130310059, + "learning_rate": 6.852696314270176e-06, + "loss": 1.4598, + "step": 15714 + }, + { + "epoch": 1.97, + "grad_norm": 5.626489162445068, + "learning_rate": 6.851859599213489e-06, + "loss": 0.3816, + "step": 15715 + }, + { + "epoch": 1.97, + "grad_norm": 242.55050659179688, + "learning_rate": 6.851022884156801e-06, + "loss": 1.5678, + "step": 15716 + }, + { + "epoch": 1.97, + "grad_norm": 12.18159294128418, + "learning_rate": 6.8501861691001136e-06, + "loss": 0.7714, + "step": 15717 + }, + { + "epoch": 1.97, + "grad_norm": 22.286861419677734, + "learning_rate": 6.8493494540434255e-06, + "loss": 0.6816, + "step": 15718 + }, + { + "epoch": 1.97, + "grad_norm": 5.448339462280273, + "learning_rate": 6.848512738986739e-06, + "loss": 0.4246, + "step": 15719 + }, + { + "epoch": 1.97, + "grad_norm": 18.41815757751465, + "learning_rate": 6.847676023930051e-06, + "loss": 1.3264, + "step": 15720 + }, + { + "epoch": 1.97, + "grad_norm": 48.002986907958984, + "learning_rate": 6.846839308873364e-06, + "loss": 1.2621, + "step": 15721 + }, + { + "epoch": 1.97, + "grad_norm": 7.736732482910156, + "learning_rate": 6.846002593816677e-06, + "loss": 0.6592, + "step": 15722 + }, + { + "epoch": 1.97, + "grad_norm": 8.901252746582031, + "learning_rate": 6.845165878759989e-06, + "loss": 0.6618, + "step": 15723 + }, + { + "epoch": 1.97, + "grad_norm": 15.029723167419434, + "learning_rate": 6.8443291637033015e-06, + "loss": 1.0123, + "step": 15724 + }, + { + "epoch": 1.97, + "grad_norm": 11.73349380493164, + "learning_rate": 6.8434924486466134e-06, + "loss": 0.7331, + "step": 15725 + }, + { + "epoch": 1.97, + "grad_norm": 40.74237823486328, + "learning_rate": 6.842655733589927e-06, + "loss": 2.2732, + "step": 15726 + }, + { + "epoch": 1.97, + "grad_norm": 21.932228088378906, + "learning_rate": 6.841819018533239e-06, + "loss": 1.0555, + "step": 15727 + }, + { + "epoch": 1.97, + "grad_norm": 15.126953125, + "learning_rate": 6.840982303476552e-06, + "loss": 1.2464, + "step": 15728 + }, + { + "epoch": 1.97, + "grad_norm": 17.180967330932617, + "learning_rate": 6.840145588419864e-06, + "loss": 2.1003, + "step": 15729 + }, + { + "epoch": 1.97, + "grad_norm": 6.244019985198975, + "learning_rate": 6.839308873363177e-06, + "loss": 0.3997, + "step": 15730 + }, + { + "epoch": 1.97, + "grad_norm": 13.408403396606445, + "learning_rate": 6.838472158306489e-06, + "loss": 0.7182, + "step": 15731 + }, + { + "epoch": 1.97, + "grad_norm": 17.05916976928711, + "learning_rate": 6.837635443249801e-06, + "loss": 0.8322, + "step": 15732 + }, + { + "epoch": 1.97, + "grad_norm": 8.671708106994629, + "learning_rate": 6.836798728193115e-06, + "loss": 0.5849, + "step": 15733 + }, + { + "epoch": 1.97, + "grad_norm": 17.564306259155273, + "learning_rate": 6.835962013136427e-06, + "loss": 1.3005, + "step": 15734 + }, + { + "epoch": 1.97, + "grad_norm": 30.314828872680664, + "learning_rate": 6.83512529807974e-06, + "loss": 1.777, + "step": 15735 + }, + { + "epoch": 1.97, + "grad_norm": 15.86076545715332, + "learning_rate": 6.834288583023052e-06, + "loss": 1.0505, + "step": 15736 + }, + { + "epoch": 1.97, + "grad_norm": 8.155749320983887, + "learning_rate": 6.8334518679663645e-06, + "loss": 0.2771, + "step": 15737 + }, + { + "epoch": 1.98, + "grad_norm": 6.9560956954956055, + "learning_rate": 6.832615152909677e-06, + "loss": 0.8509, + "step": 15738 + }, + { + "epoch": 1.98, + "grad_norm": 26.87178611755371, + "learning_rate": 6.831778437852989e-06, + "loss": 1.9133, + "step": 15739 + }, + { + "epoch": 1.98, + "grad_norm": 12.6340913772583, + "learning_rate": 6.830941722796303e-06, + "loss": 2.5513, + "step": 15740 + }, + { + "epoch": 1.98, + "grad_norm": 13.080458641052246, + "learning_rate": 6.830105007739615e-06, + "loss": 0.8845, + "step": 15741 + }, + { + "epoch": 1.98, + "grad_norm": 6.604981899261475, + "learning_rate": 6.829268292682928e-06, + "loss": 0.5194, + "step": 15742 + }, + { + "epoch": 1.98, + "grad_norm": 18.114355087280273, + "learning_rate": 6.82843157762624e-06, + "loss": 0.8396, + "step": 15743 + }, + { + "epoch": 1.98, + "grad_norm": 8.608214378356934, + "learning_rate": 6.8275948625695524e-06, + "loss": 0.5554, + "step": 15744 + }, + { + "epoch": 1.98, + "grad_norm": 24.107295989990234, + "learning_rate": 6.826758147512865e-06, + "loss": 2.9412, + "step": 15745 + }, + { + "epoch": 1.98, + "grad_norm": 14.742103576660156, + "learning_rate": 6.825921432456177e-06, + "loss": 2.0741, + "step": 15746 + }, + { + "epoch": 1.98, + "grad_norm": 15.706936836242676, + "learning_rate": 6.825084717399491e-06, + "loss": 1.4164, + "step": 15747 + }, + { + "epoch": 1.98, + "grad_norm": 27.102779388427734, + "learning_rate": 6.824248002342803e-06, + "loss": 1.9597, + "step": 15748 + }, + { + "epoch": 1.98, + "grad_norm": 10.980995178222656, + "learning_rate": 6.823411287286116e-06, + "loss": 1.5035, + "step": 15749 + }, + { + "epoch": 1.98, + "grad_norm": 52.42707443237305, + "learning_rate": 6.8225745722294276e-06, + "loss": 2.2065, + "step": 15750 + }, + { + "epoch": 1.98, + "grad_norm": 3.3362693786621094, + "learning_rate": 6.82173785717274e-06, + "loss": 0.1472, + "step": 15751 + }, + { + "epoch": 1.98, + "grad_norm": 25.937246322631836, + "learning_rate": 6.820901142116053e-06, + "loss": 2.0787, + "step": 15752 + }, + { + "epoch": 1.98, + "grad_norm": 17.075586318969727, + "learning_rate": 6.820064427059365e-06, + "loss": 2.0738, + "step": 15753 + }, + { + "epoch": 1.98, + "grad_norm": 24.563770294189453, + "learning_rate": 6.819227712002679e-06, + "loss": 1.5253, + "step": 15754 + }, + { + "epoch": 1.98, + "grad_norm": 26.96422004699707, + "learning_rate": 6.818390996945991e-06, + "loss": 0.6918, + "step": 15755 + }, + { + "epoch": 1.98, + "grad_norm": 8.356304168701172, + "learning_rate": 6.8175542818893035e-06, + "loss": 1.0219, + "step": 15756 + }, + { + "epoch": 1.98, + "grad_norm": 5.026080131530762, + "learning_rate": 6.8167175668326155e-06, + "loss": 0.6225, + "step": 15757 + }, + { + "epoch": 1.98, + "grad_norm": 14.464239120483398, + "learning_rate": 6.8158808517759274e-06, + "loss": 1.389, + "step": 15758 + }, + { + "epoch": 1.98, + "grad_norm": 6.517858982086182, + "learning_rate": 6.815044136719241e-06, + "loss": 1.3605, + "step": 15759 + }, + { + "epoch": 1.98, + "grad_norm": 16.6304874420166, + "learning_rate": 6.814207421662553e-06, + "loss": 0.8383, + "step": 15760 + }, + { + "epoch": 1.98, + "grad_norm": 13.514803886413574, + "learning_rate": 6.813370706605867e-06, + "loss": 1.129, + "step": 15761 + }, + { + "epoch": 1.98, + "grad_norm": 8.15904426574707, + "learning_rate": 6.812533991549179e-06, + "loss": 0.0998, + "step": 15762 + }, + { + "epoch": 1.98, + "grad_norm": 6.209280014038086, + "learning_rate": 6.8116972764924914e-06, + "loss": 0.2516, + "step": 15763 + }, + { + "epoch": 1.98, + "grad_norm": 3.692831516265869, + "learning_rate": 6.810860561435803e-06, + "loss": 0.3186, + "step": 15764 + }, + { + "epoch": 1.98, + "grad_norm": 45.93081283569336, + "learning_rate": 6.810023846379115e-06, + "loss": 5.1503, + "step": 15765 + }, + { + "epoch": 1.98, + "grad_norm": 7.090846538543701, + "learning_rate": 6.809187131322429e-06, + "loss": 0.5046, + "step": 15766 + }, + { + "epoch": 1.98, + "grad_norm": 18.20060920715332, + "learning_rate": 6.808350416265741e-06, + "loss": 1.0815, + "step": 15767 + }, + { + "epoch": 1.98, + "grad_norm": 17.6390323638916, + "learning_rate": 6.807513701209055e-06, + "loss": 0.9999, + "step": 15768 + }, + { + "epoch": 1.98, + "grad_norm": 52.04750061035156, + "learning_rate": 6.8066769861523665e-06, + "loss": 3.1283, + "step": 15769 + }, + { + "epoch": 1.98, + "grad_norm": 15.587772369384766, + "learning_rate": 6.805840271095679e-06, + "loss": 0.5959, + "step": 15770 + }, + { + "epoch": 1.98, + "grad_norm": 6.01774263381958, + "learning_rate": 6.805003556038991e-06, + "loss": 0.337, + "step": 15771 + }, + { + "epoch": 1.98, + "grad_norm": 12.749217987060547, + "learning_rate": 6.804166840982303e-06, + "loss": 0.6262, + "step": 15772 + }, + { + "epoch": 1.98, + "grad_norm": 13.583781242370605, + "learning_rate": 6.803330125925617e-06, + "loss": 0.8778, + "step": 15773 + }, + { + "epoch": 1.98, + "grad_norm": 152.0609130859375, + "learning_rate": 6.802493410868929e-06, + "loss": 1.3635, + "step": 15774 + }, + { + "epoch": 1.98, + "grad_norm": 8.898676872253418, + "learning_rate": 6.801656695812242e-06, + "loss": 0.7382, + "step": 15775 + }, + { + "epoch": 1.98, + "grad_norm": 14.59203052520752, + "learning_rate": 6.8008199807555545e-06, + "loss": 0.7811, + "step": 15776 + }, + { + "epoch": 1.98, + "grad_norm": 31.98999786376953, + "learning_rate": 6.799983265698867e-06, + "loss": 1.3975, + "step": 15777 + }, + { + "epoch": 1.98, + "grad_norm": 21.140865325927734, + "learning_rate": 6.799146550642179e-06, + "loss": 1.5799, + "step": 15778 + }, + { + "epoch": 1.98, + "grad_norm": 14.374770164489746, + "learning_rate": 6.798309835585491e-06, + "loss": 0.7239, + "step": 15779 + }, + { + "epoch": 1.98, + "grad_norm": 12.85692024230957, + "learning_rate": 6.797473120528805e-06, + "loss": 1.3584, + "step": 15780 + }, + { + "epoch": 1.98, + "grad_norm": 6.9161810874938965, + "learning_rate": 6.796636405472117e-06, + "loss": 0.9429, + "step": 15781 + }, + { + "epoch": 1.98, + "grad_norm": 18.36745262145996, + "learning_rate": 6.79579969041543e-06, + "loss": 2.0932, + "step": 15782 + }, + { + "epoch": 1.98, + "grad_norm": 13.395533561706543, + "learning_rate": 6.794962975358742e-06, + "loss": 1.2362, + "step": 15783 + }, + { + "epoch": 1.98, + "grad_norm": 31.147062301635742, + "learning_rate": 6.794126260302055e-06, + "loss": 2.2719, + "step": 15784 + }, + { + "epoch": 1.98, + "grad_norm": 15.50479793548584, + "learning_rate": 6.793289545245367e-06, + "loss": 1.2375, + "step": 15785 + }, + { + "epoch": 1.98, + "grad_norm": 12.041942596435547, + "learning_rate": 6.792452830188679e-06, + "loss": 0.2188, + "step": 15786 + }, + { + "epoch": 1.98, + "grad_norm": 16.76938819885254, + "learning_rate": 6.791616115131993e-06, + "loss": 1.1379, + "step": 15787 + }, + { + "epoch": 1.98, + "grad_norm": 38.72340393066406, + "learning_rate": 6.790779400075305e-06, + "loss": 0.9639, + "step": 15788 + }, + { + "epoch": 1.98, + "grad_norm": 10.512739181518555, + "learning_rate": 6.7899426850186175e-06, + "loss": 0.7662, + "step": 15789 + }, + { + "epoch": 1.98, + "grad_norm": 30.250303268432617, + "learning_rate": 6.78910596996193e-06, + "loss": 1.3182, + "step": 15790 + }, + { + "epoch": 1.98, + "grad_norm": 15.59507942199707, + "learning_rate": 6.788269254905242e-06, + "loss": 1.2405, + "step": 15791 + }, + { + "epoch": 1.98, + "grad_norm": 19.40499496459961, + "learning_rate": 6.787432539848555e-06, + "loss": 0.6822, + "step": 15792 + }, + { + "epoch": 1.98, + "grad_norm": 14.11445140838623, + "learning_rate": 6.786595824791867e-06, + "loss": 1.0599, + "step": 15793 + }, + { + "epoch": 1.98, + "grad_norm": 34.05891799926758, + "learning_rate": 6.785759109735181e-06, + "loss": 2.9596, + "step": 15794 + }, + { + "epoch": 1.98, + "grad_norm": 5.87421178817749, + "learning_rate": 6.784922394678493e-06, + "loss": 0.3535, + "step": 15795 + }, + { + "epoch": 1.98, + "grad_norm": 1.5683294534683228, + "learning_rate": 6.784085679621805e-06, + "loss": 0.0357, + "step": 15796 + }, + { + "epoch": 1.98, + "grad_norm": 5.782618522644043, + "learning_rate": 6.783248964565118e-06, + "loss": 0.2633, + "step": 15797 + }, + { + "epoch": 1.98, + "grad_norm": 46.601898193359375, + "learning_rate": 6.78241224950843e-06, + "loss": 1.0305, + "step": 15798 + }, + { + "epoch": 1.98, + "grad_norm": 19.762128829956055, + "learning_rate": 6.781575534451743e-06, + "loss": 1.0901, + "step": 15799 + }, + { + "epoch": 1.98, + "grad_norm": 4.58148717880249, + "learning_rate": 6.780738819395055e-06, + "loss": 0.2555, + "step": 15800 + }, + { + "epoch": 1.98, + "grad_norm": 114.76978302001953, + "learning_rate": 6.779902104338369e-06, + "loss": 1.4787, + "step": 15801 + }, + { + "epoch": 1.98, + "grad_norm": 10.883560180664062, + "learning_rate": 6.7790653892816805e-06, + "loss": 0.4775, + "step": 15802 + }, + { + "epoch": 1.98, + "grad_norm": 35.43465042114258, + "learning_rate": 6.778228674224993e-06, + "loss": 3.3315, + "step": 15803 + }, + { + "epoch": 1.98, + "grad_norm": 8.82397174835205, + "learning_rate": 6.777391959168306e-06, + "loss": 0.6702, + "step": 15804 + }, + { + "epoch": 1.98, + "grad_norm": 8.601953506469727, + "learning_rate": 6.776555244111618e-06, + "loss": 0.4739, + "step": 15805 + }, + { + "epoch": 1.98, + "grad_norm": 9.222111701965332, + "learning_rate": 6.775718529054931e-06, + "loss": 0.3269, + "step": 15806 + }, + { + "epoch": 1.98, + "grad_norm": 10.697507858276367, + "learning_rate": 6.774881813998243e-06, + "loss": 1.3201, + "step": 15807 + }, + { + "epoch": 1.98, + "grad_norm": 16.113651275634766, + "learning_rate": 6.7740450989415565e-06, + "loss": 1.4599, + "step": 15808 + }, + { + "epoch": 1.98, + "grad_norm": 21.23736572265625, + "learning_rate": 6.7732083838848685e-06, + "loss": 0.8539, + "step": 15809 + }, + { + "epoch": 1.98, + "grad_norm": 22.37204933166504, + "learning_rate": 6.772371668828181e-06, + "loss": 0.8251, + "step": 15810 + }, + { + "epoch": 1.98, + "grad_norm": 52.310489654541016, + "learning_rate": 6.771534953771493e-06, + "loss": 2.5391, + "step": 15811 + }, + { + "epoch": 1.98, + "grad_norm": 49.94077682495117, + "learning_rate": 6.770698238714806e-06, + "loss": 1.2796, + "step": 15812 + }, + { + "epoch": 1.98, + "grad_norm": 6.511701583862305, + "learning_rate": 6.769861523658119e-06, + "loss": 0.714, + "step": 15813 + }, + { + "epoch": 1.98, + "grad_norm": 11.521330833435059, + "learning_rate": 6.769024808601431e-06, + "loss": 1.385, + "step": 15814 + }, + { + "epoch": 1.98, + "grad_norm": 15.310072898864746, + "learning_rate": 6.768188093544744e-06, + "loss": 0.913, + "step": 15815 + }, + { + "epoch": 1.98, + "grad_norm": 31.162324905395508, + "learning_rate": 6.767351378488056e-06, + "loss": 1.6257, + "step": 15816 + }, + { + "epoch": 1.99, + "grad_norm": 56.60767364501953, + "learning_rate": 6.766514663431369e-06, + "loss": 2.2295, + "step": 15817 + }, + { + "epoch": 1.99, + "grad_norm": 6.426440238952637, + "learning_rate": 6.765677948374681e-06, + "loss": 0.5198, + "step": 15818 + }, + { + "epoch": 1.99, + "grad_norm": 7.248169422149658, + "learning_rate": 6.764841233317994e-06, + "loss": 0.739, + "step": 15819 + }, + { + "epoch": 1.99, + "grad_norm": 22.613384246826172, + "learning_rate": 6.764004518261307e-06, + "loss": 0.5279, + "step": 15820 + }, + { + "epoch": 1.99, + "grad_norm": 14.047466278076172, + "learning_rate": 6.763167803204619e-06, + "loss": 1.7756, + "step": 15821 + }, + { + "epoch": 1.99, + "grad_norm": 103.04438018798828, + "learning_rate": 6.762331088147932e-06, + "loss": 1.9127, + "step": 15822 + }, + { + "epoch": 1.99, + "grad_norm": 12.794999122619629, + "learning_rate": 6.761494373091244e-06, + "loss": 0.5107, + "step": 15823 + }, + { + "epoch": 1.99, + "grad_norm": 31.93400764465332, + "learning_rate": 6.760657658034557e-06, + "loss": 0.9843, + "step": 15824 + }, + { + "epoch": 1.99, + "grad_norm": 18.88462257385254, + "learning_rate": 6.759820942977869e-06, + "loss": 0.3994, + "step": 15825 + }, + { + "epoch": 1.99, + "grad_norm": 19.0087833404541, + "learning_rate": 6.758984227921182e-06, + "loss": 0.7096, + "step": 15826 + }, + { + "epoch": 1.99, + "grad_norm": 15.642922401428223, + "learning_rate": 6.758147512864495e-06, + "loss": 0.4583, + "step": 15827 + }, + { + "epoch": 1.99, + "grad_norm": 83.87149810791016, + "learning_rate": 6.757310797807807e-06, + "loss": 1.4162, + "step": 15828 + }, + { + "epoch": 1.99, + "grad_norm": 4.853623390197754, + "learning_rate": 6.75647408275112e-06, + "loss": 0.8932, + "step": 15829 + }, + { + "epoch": 1.99, + "grad_norm": 53.79916000366211, + "learning_rate": 6.755637367694432e-06, + "loss": 0.9648, + "step": 15830 + }, + { + "epoch": 1.99, + "grad_norm": 77.17891693115234, + "learning_rate": 6.754800652637745e-06, + "loss": 1.7642, + "step": 15831 + }, + { + "epoch": 1.99, + "grad_norm": 11.801971435546875, + "learning_rate": 6.753963937581057e-06, + "loss": 0.5852, + "step": 15832 + }, + { + "epoch": 1.99, + "grad_norm": 7.212321758270264, + "learning_rate": 6.75312722252437e-06, + "loss": 0.5346, + "step": 15833 + }, + { + "epoch": 1.99, + "grad_norm": 6.900275707244873, + "learning_rate": 6.7522905074676826e-06, + "loss": 1.6755, + "step": 15834 + }, + { + "epoch": 1.99, + "grad_norm": 18.999773025512695, + "learning_rate": 6.7514537924109945e-06, + "loss": 2.0601, + "step": 15835 + }, + { + "epoch": 1.99, + "grad_norm": 22.585071563720703, + "learning_rate": 6.750617077354308e-06, + "loss": 0.6447, + "step": 15836 + }, + { + "epoch": 1.99, + "grad_norm": 10.195775985717773, + "learning_rate": 6.74978036229762e-06, + "loss": 0.3813, + "step": 15837 + }, + { + "epoch": 1.99, + "grad_norm": 25.38553237915039, + "learning_rate": 6.748943647240933e-06, + "loss": 0.5727, + "step": 15838 + }, + { + "epoch": 1.99, + "grad_norm": 17.144113540649414, + "learning_rate": 6.748106932184245e-06, + "loss": 0.3331, + "step": 15839 + }, + { + "epoch": 1.99, + "grad_norm": 13.481681823730469, + "learning_rate": 6.747270217127557e-06, + "loss": 1.0861, + "step": 15840 + }, + { + "epoch": 1.99, + "grad_norm": 31.40520477294922, + "learning_rate": 6.7464335020708705e-06, + "loss": 1.6721, + "step": 15841 + }, + { + "epoch": 1.99, + "grad_norm": 11.16893482208252, + "learning_rate": 6.7455967870141824e-06, + "loss": 0.2693, + "step": 15842 + }, + { + "epoch": 1.99, + "grad_norm": 36.15472412109375, + "learning_rate": 6.744760071957496e-06, + "loss": 1.2241, + "step": 15843 + }, + { + "epoch": 1.99, + "grad_norm": 8.867559432983398, + "learning_rate": 6.743923356900808e-06, + "loss": 0.4183, + "step": 15844 + }, + { + "epoch": 1.99, + "grad_norm": 12.108124732971191, + "learning_rate": 6.743086641844121e-06, + "loss": 0.4826, + "step": 15845 + }, + { + "epoch": 1.99, + "grad_norm": 18.097091674804688, + "learning_rate": 6.742249926787433e-06, + "loss": 1.4556, + "step": 15846 + }, + { + "epoch": 1.99, + "grad_norm": 8.807405471801758, + "learning_rate": 6.741413211730745e-06, + "loss": 0.637, + "step": 15847 + }, + { + "epoch": 1.99, + "grad_norm": 14.497922897338867, + "learning_rate": 6.740576496674058e-06, + "loss": 0.9905, + "step": 15848 + }, + { + "epoch": 1.99, + "grad_norm": 11.1741943359375, + "learning_rate": 6.73973978161737e-06, + "loss": 0.9797, + "step": 15849 + }, + { + "epoch": 1.99, + "grad_norm": 92.6616439819336, + "learning_rate": 6.738903066560684e-06, + "loss": 1.8077, + "step": 15850 + }, + { + "epoch": 1.99, + "grad_norm": 37.04857635498047, + "learning_rate": 6.738066351503996e-06, + "loss": 1.5059, + "step": 15851 + }, + { + "epoch": 1.99, + "grad_norm": 3.5521743297576904, + "learning_rate": 6.737229636447309e-06, + "loss": 0.1421, + "step": 15852 + }, + { + "epoch": 1.99, + "grad_norm": 5.347033500671387, + "learning_rate": 6.736392921390621e-06, + "loss": 0.1225, + "step": 15853 + }, + { + "epoch": 1.99, + "grad_norm": 15.23282527923584, + "learning_rate": 6.735556206333933e-06, + "loss": 1.318, + "step": 15854 + }, + { + "epoch": 1.99, + "grad_norm": 36.05027389526367, + "learning_rate": 6.734719491277246e-06, + "loss": 2.4698, + "step": 15855 + }, + { + "epoch": 1.99, + "grad_norm": 20.172067642211914, + "learning_rate": 6.733882776220558e-06, + "loss": 1.0434, + "step": 15856 + }, + { + "epoch": 1.99, + "grad_norm": 12.372947692871094, + "learning_rate": 6.733046061163871e-06, + "loss": 1.7002, + "step": 15857 + }, + { + "epoch": 1.99, + "grad_norm": 32.12907409667969, + "learning_rate": 6.732209346107184e-06, + "loss": 1.9306, + "step": 15858 + }, + { + "epoch": 1.99, + "grad_norm": 3.9602692127227783, + "learning_rate": 6.731372631050497e-06, + "loss": 0.0323, + "step": 15859 + }, + { + "epoch": 1.99, + "grad_norm": 11.315047264099121, + "learning_rate": 6.730535915993809e-06, + "loss": 0.4447, + "step": 15860 + }, + { + "epoch": 1.99, + "grad_norm": 11.079931259155273, + "learning_rate": 6.729699200937121e-06, + "loss": 0.6453, + "step": 15861 + }, + { + "epoch": 1.99, + "grad_norm": 28.420324325561523, + "learning_rate": 6.728862485880434e-06, + "loss": 0.652, + "step": 15862 + }, + { + "epoch": 1.99, + "grad_norm": 11.298340797424316, + "learning_rate": 6.728025770823746e-06, + "loss": 1.6646, + "step": 15863 + }, + { + "epoch": 1.99, + "grad_norm": 5.225053787231445, + "learning_rate": 6.727189055767059e-06, + "loss": 0.1328, + "step": 15864 + }, + { + "epoch": 1.99, + "grad_norm": 21.30705451965332, + "learning_rate": 6.726352340710372e-06, + "loss": 1.5404, + "step": 15865 + }, + { + "epoch": 1.99, + "grad_norm": 10.347716331481934, + "learning_rate": 6.725515625653685e-06, + "loss": 0.2698, + "step": 15866 + }, + { + "epoch": 1.99, + "grad_norm": 0.7319647073745728, + "learning_rate": 6.7246789105969966e-06, + "loss": 0.0144, + "step": 15867 + }, + { + "epoch": 1.99, + "grad_norm": 9.512052536010742, + "learning_rate": 6.7238421955403085e-06, + "loss": 1.2367, + "step": 15868 + }, + { + "epoch": 1.99, + "grad_norm": 8.839876174926758, + "learning_rate": 6.723005480483622e-06, + "loss": 0.6775, + "step": 15869 + }, + { + "epoch": 1.99, + "grad_norm": 13.457022666931152, + "learning_rate": 6.722168765426934e-06, + "loss": 0.4371, + "step": 15870 + }, + { + "epoch": 1.99, + "grad_norm": 16.29427146911621, + "learning_rate": 6.721332050370247e-06, + "loss": 0.644, + "step": 15871 + }, + { + "epoch": 1.99, + "grad_norm": 10.478692054748535, + "learning_rate": 6.72049533531356e-06, + "loss": 0.5189, + "step": 15872 + }, + { + "epoch": 1.99, + "grad_norm": 20.0339412689209, + "learning_rate": 6.7196586202568725e-06, + "loss": 1.0351, + "step": 15873 + }, + { + "epoch": 1.99, + "grad_norm": 13.582653999328613, + "learning_rate": 6.7188219052001845e-06, + "loss": 2.4049, + "step": 15874 + }, + { + "epoch": 1.99, + "grad_norm": 19.97344398498535, + "learning_rate": 6.7179851901434964e-06, + "loss": 1.2458, + "step": 15875 + }, + { + "epoch": 1.99, + "grad_norm": 4.125348091125488, + "learning_rate": 6.71714847508681e-06, + "loss": 0.2057, + "step": 15876 + }, + { + "epoch": 1.99, + "grad_norm": 7.991709232330322, + "learning_rate": 6.716311760030122e-06, + "loss": 0.7323, + "step": 15877 + }, + { + "epoch": 1.99, + "grad_norm": 9.625768661499023, + "learning_rate": 6.715475044973435e-06, + "loss": 0.8304, + "step": 15878 + }, + { + "epoch": 1.99, + "grad_norm": 15.126842498779297, + "learning_rate": 6.714638329916748e-06, + "loss": 1.3284, + "step": 15879 + }, + { + "epoch": 1.99, + "grad_norm": 14.254463195800781, + "learning_rate": 6.7138016148600604e-06, + "loss": 1.4844, + "step": 15880 + }, + { + "epoch": 1.99, + "grad_norm": 11.407301902770996, + "learning_rate": 6.712964899803372e-06, + "loss": 0.7541, + "step": 15881 + }, + { + "epoch": 1.99, + "grad_norm": 17.281606674194336, + "learning_rate": 6.712128184746684e-06, + "loss": 2.9646, + "step": 15882 + }, + { + "epoch": 1.99, + "grad_norm": 8.113269805908203, + "learning_rate": 6.711291469689998e-06, + "loss": 1.4294, + "step": 15883 + }, + { + "epoch": 1.99, + "grad_norm": 17.159299850463867, + "learning_rate": 6.71045475463331e-06, + "loss": 2.8152, + "step": 15884 + }, + { + "epoch": 1.99, + "grad_norm": 40.33321762084961, + "learning_rate": 6.709618039576623e-06, + "loss": 2.357, + "step": 15885 + }, + { + "epoch": 1.99, + "grad_norm": 50.981689453125, + "learning_rate": 6.708781324519935e-06, + "loss": 1.0512, + "step": 15886 + }, + { + "epoch": 1.99, + "grad_norm": 25.388002395629883, + "learning_rate": 6.707944609463248e-06, + "loss": 1.5439, + "step": 15887 + }, + { + "epoch": 1.99, + "grad_norm": 12.173666954040527, + "learning_rate": 6.70710789440656e-06, + "loss": 1.3675, + "step": 15888 + }, + { + "epoch": 1.99, + "grad_norm": 43.078392028808594, + "learning_rate": 6.706271179349872e-06, + "loss": 1.0155, + "step": 15889 + }, + { + "epoch": 1.99, + "grad_norm": 10.333561897277832, + "learning_rate": 6.705434464293186e-06, + "loss": 1.494, + "step": 15890 + }, + { + "epoch": 1.99, + "grad_norm": 9.58069133758545, + "learning_rate": 6.704597749236498e-06, + "loss": 1.607, + "step": 15891 + }, + { + "epoch": 1.99, + "grad_norm": 11.508028030395508, + "learning_rate": 6.703761034179811e-06, + "loss": 0.87, + "step": 15892 + }, + { + "epoch": 1.99, + "grad_norm": 30.451860427856445, + "learning_rate": 6.702924319123123e-06, + "loss": 1.4955, + "step": 15893 + }, + { + "epoch": 1.99, + "grad_norm": 12.09868335723877, + "learning_rate": 6.702087604066436e-06, + "loss": 0.5746, + "step": 15894 + }, + { + "epoch": 1.99, + "grad_norm": 42.80390930175781, + "learning_rate": 6.701250889009748e-06, + "loss": 1.0079, + "step": 15895 + }, + { + "epoch": 1.99, + "grad_norm": 24.87275505065918, + "learning_rate": 6.70041417395306e-06, + "loss": 2.8248, + "step": 15896 + }, + { + "epoch": 2.0, + "grad_norm": 11.7715482711792, + "learning_rate": 6.699577458896374e-06, + "loss": 0.6132, + "step": 15897 + }, + { + "epoch": 2.0, + "grad_norm": 9.595332145690918, + "learning_rate": 6.698740743839686e-06, + "loss": 1.7415, + "step": 15898 + }, + { + "epoch": 2.0, + "grad_norm": 16.042381286621094, + "learning_rate": 6.697904028782999e-06, + "loss": 1.001, + "step": 15899 + }, + { + "epoch": 2.0, + "grad_norm": 14.5777587890625, + "learning_rate": 6.6970673137263106e-06, + "loss": 1.7448, + "step": 15900 + }, + { + "epoch": 2.0, + "grad_norm": 8.866508483886719, + "learning_rate": 6.696230598669624e-06, + "loss": 0.6962, + "step": 15901 + }, + { + "epoch": 2.0, + "grad_norm": 14.958714485168457, + "learning_rate": 6.695393883612936e-06, + "loss": 1.2525, + "step": 15902 + }, + { + "epoch": 2.0, + "grad_norm": 6.288279056549072, + "learning_rate": 6.694557168556248e-06, + "loss": 0.7352, + "step": 15903 + }, + { + "epoch": 2.0, + "grad_norm": 29.31307601928711, + "learning_rate": 6.693720453499562e-06, + "loss": 1.1016, + "step": 15904 + }, + { + "epoch": 2.0, + "grad_norm": 43.20933532714844, + "learning_rate": 6.692883738442874e-06, + "loss": 0.5855, + "step": 15905 + }, + { + "epoch": 2.0, + "grad_norm": 8.682262420654297, + "learning_rate": 6.6920470233861865e-06, + "loss": 0.4505, + "step": 15906 + }, + { + "epoch": 2.0, + "grad_norm": 4.800933361053467, + "learning_rate": 6.6912103083294985e-06, + "loss": 0.1849, + "step": 15907 + }, + { + "epoch": 2.0, + "grad_norm": 11.816986083984375, + "learning_rate": 6.690373593272812e-06, + "loss": 0.8428, + "step": 15908 + }, + { + "epoch": 2.0, + "grad_norm": 33.56275939941406, + "learning_rate": 6.689536878216124e-06, + "loss": 0.7461, + "step": 15909 + }, + { + "epoch": 2.0, + "grad_norm": 15.040940284729004, + "learning_rate": 6.688700163159436e-06, + "loss": 0.7987, + "step": 15910 + }, + { + "epoch": 2.0, + "grad_norm": 11.767467498779297, + "learning_rate": 6.68786344810275e-06, + "loss": 1.5834, + "step": 15911 + }, + { + "epoch": 2.0, + "grad_norm": 7.761677265167236, + "learning_rate": 6.687026733046062e-06, + "loss": 1.5374, + "step": 15912 + }, + { + "epoch": 2.0, + "grad_norm": 16.573644638061523, + "learning_rate": 6.6861900179893744e-06, + "loss": 1.1204, + "step": 15913 + }, + { + "epoch": 2.0, + "grad_norm": 12.538320541381836, + "learning_rate": 6.685353302932686e-06, + "loss": 0.524, + "step": 15914 + }, + { + "epoch": 2.0, + "grad_norm": 17.518587112426758, + "learning_rate": 6.684516587876e-06, + "loss": 0.9166, + "step": 15915 + }, + { + "epoch": 2.0, + "grad_norm": 11.541840553283691, + "learning_rate": 6.683679872819312e-06, + "loss": 0.6606, + "step": 15916 + }, + { + "epoch": 2.0, + "grad_norm": 80.66234588623047, + "learning_rate": 6.682843157762624e-06, + "loss": 3.8966, + "step": 15917 + }, + { + "epoch": 2.0, + "grad_norm": 21.442481994628906, + "learning_rate": 6.682006442705938e-06, + "loss": 1.0088, + "step": 15918 + }, + { + "epoch": 2.0, + "grad_norm": 12.824055671691895, + "learning_rate": 6.6811697276492495e-06, + "loss": 0.9655, + "step": 15919 + }, + { + "epoch": 2.0, + "grad_norm": 21.428932189941406, + "learning_rate": 6.680333012592562e-06, + "loss": 1.3878, + "step": 15920 + }, + { + "epoch": 2.0, + "grad_norm": 122.69747924804688, + "learning_rate": 6.679496297535874e-06, + "loss": 2.1404, + "step": 15921 + }, + { + "epoch": 2.0, + "grad_norm": 4.174544334411621, + "learning_rate": 6.678659582479188e-06, + "loss": 0.6177, + "step": 15922 + }, + { + "epoch": 2.0, + "grad_norm": 10.728100776672363, + "learning_rate": 6.6778228674225e-06, + "loss": 1.957, + "step": 15923 + }, + { + "epoch": 2.0, + "grad_norm": 11.2260103225708, + "learning_rate": 6.676986152365812e-06, + "loss": 0.4722, + "step": 15924 + }, + { + "epoch": 2.0, + "grad_norm": 14.812954902648926, + "learning_rate": 6.6761494373091255e-06, + "loss": 1.6674, + "step": 15925 + }, + { + "epoch": 2.0, + "grad_norm": 20.64780044555664, + "learning_rate": 6.6753127222524375e-06, + "loss": 0.8833, + "step": 15926 + }, + { + "epoch": 2.0, + "grad_norm": 4.755573272705078, + "learning_rate": 6.67447600719575e-06, + "loss": 0.4229, + "step": 15927 + }, + { + "epoch": 2.0, + "grad_norm": 16.041831970214844, + "learning_rate": 6.673639292139062e-06, + "loss": 0.8347, + "step": 15928 + }, + { + "epoch": 2.0, + "grad_norm": 17.680612564086914, + "learning_rate": 6.672802577082376e-06, + "loss": 1.5199, + "step": 15929 + }, + { + "epoch": 2.0, + "grad_norm": 4.9551849365234375, + "learning_rate": 6.671965862025688e-06, + "loss": 0.1124, + "step": 15930 + }, + { + "epoch": 2.0, + "grad_norm": 64.43891906738281, + "learning_rate": 6.671129146969e-06, + "loss": 0.7415, + "step": 15931 + }, + { + "epoch": 2.0, + "grad_norm": 7.296156883239746, + "learning_rate": 6.6702924319123134e-06, + "loss": 0.6088, + "step": 15932 + }, + { + "epoch": 2.0, + "grad_norm": 21.48379898071289, + "learning_rate": 6.669455716855625e-06, + "loss": 1.7839, + "step": 15933 + }, + { + "epoch": 2.0, + "grad_norm": 8.890734672546387, + "learning_rate": 6.668619001798938e-06, + "loss": 2.0624, + "step": 15934 + }, + { + "epoch": 2.0, + "grad_norm": 87.4519271850586, + "learning_rate": 6.66778228674225e-06, + "loss": 1.0414, + "step": 15935 + }, + { + "epoch": 2.0, + "grad_norm": 20.54428482055664, + "learning_rate": 6.666945571685564e-06, + "loss": 2.0353, + "step": 15936 + }, + { + "epoch": 2.0, + "grad_norm": 15.328015327453613, + "learning_rate": 6.666108856628876e-06, + "loss": 0.8762, + "step": 15937 + }, + { + "epoch": 2.0, + "grad_norm": 390.0461120605469, + "learning_rate": 6.665272141572188e-06, + "loss": 1.416, + "step": 15938 + }, + { + "epoch": 2.0, + "grad_norm": 14.488633155822754, + "learning_rate": 6.6644354265155005e-06, + "loss": 2.1592, + "step": 15939 + }, + { + "epoch": 2.0, + "grad_norm": 25.96225357055664, + "learning_rate": 6.663598711458813e-06, + "loss": 1.4226, + "step": 15940 + }, + { + "epoch": 2.0, + "grad_norm": 12.014482498168945, + "learning_rate": 6.662761996402126e-06, + "loss": 1.2372, + "step": 15941 + }, + { + "epoch": 2.0, + "grad_norm": 15.766691207885742, + "learning_rate": 6.661925281345438e-06, + "loss": 1.0284, + "step": 15942 + }, + { + "epoch": 2.0, + "grad_norm": 20.78445053100586, + "learning_rate": 6.661088566288752e-06, + "loss": 2.3436, + "step": 15943 + }, + { + "epoch": 2.0, + "grad_norm": 22.455188751220703, + "learning_rate": 6.660251851232064e-06, + "loss": 0.9648, + "step": 15944 + }, + { + "epoch": 2.0, + "grad_norm": 16.59456443786621, + "learning_rate": 6.659415136175376e-06, + "loss": 0.7957, + "step": 15945 + }, + { + "epoch": 2.0, + "grad_norm": 10.40251350402832, + "learning_rate": 6.658578421118688e-06, + "loss": 0.9505, + "step": 15946 + }, + { + "epoch": 2.0, + "grad_norm": 20.96028709411621, + "learning_rate": 6.657741706062001e-06, + "loss": 1.5892, + "step": 15947 + }, + { + "epoch": 2.0, + "grad_norm": 23.296125411987305, + "learning_rate": 6.656904991005314e-06, + "loss": 2.7671, + "step": 15948 + }, + { + "epoch": 2.0, + "grad_norm": 10.336874008178711, + "learning_rate": 6.656068275948626e-06, + "loss": 1.5377, + "step": 15949 + }, + { + "epoch": 2.0, + "grad_norm": 6.621896266937256, + "learning_rate": 6.65523156089194e-06, + "loss": 0.4338, + "step": 15950 + }, + { + "epoch": 2.0, + "grad_norm": 0.4018852412700653, + "learning_rate": 6.654394845835252e-06, + "loss": 0.0091, + "step": 15951 + }, + { + "epoch": 2.0, + "grad_norm": 8.823291778564453, + "learning_rate": 6.6535581307785635e-06, + "loss": 1.597, + "step": 15952 + }, + { + "epoch": 2.0, + "grad_norm": 47.25034713745117, + "learning_rate": 6.652721415721876e-06, + "loss": 1.5957, + "step": 15953 + }, + { + "epoch": 2.0, + "grad_norm": 14.00681209564209, + "learning_rate": 6.651884700665189e-06, + "loss": 0.5162, + "step": 15954 + }, + { + "epoch": 2.0, + "grad_norm": 50.61946487426758, + "learning_rate": 6.651047985608502e-06, + "loss": 1.0697, + "step": 15955 + }, + { + "epoch": 2.0, + "grad_norm": 7.162833213806152, + "learning_rate": 6.650211270551814e-06, + "loss": 1.5073, + "step": 15956 + }, + { + "epoch": 2.0, + "grad_norm": 22.026241302490234, + "learning_rate": 6.6493745554951275e-06, + "loss": 0.8263, + "step": 15957 + }, + { + "epoch": 2.0, + "grad_norm": 4.92219877243042, + "learning_rate": 6.6485378404384395e-06, + "loss": 0.2425, + "step": 15958 + }, + { + "epoch": 2.0, + "grad_norm": 3.8470804691314697, + "learning_rate": 6.6477011253817515e-06, + "loss": 0.3319, + "step": 15959 + }, + { + "epoch": 2.0, + "grad_norm": 14.090774536132812, + "learning_rate": 6.646864410325064e-06, + "loss": 1.4975, + "step": 15960 + }, + { + "epoch": 2.0, + "grad_norm": 20.857500076293945, + "learning_rate": 6.646027695268377e-06, + "loss": 0.9667, + "step": 15961 + }, + { + "epoch": 2.0, + "grad_norm": 2.1130306720733643, + "learning_rate": 6.64519098021169e-06, + "loss": 0.0662, + "step": 15962 + }, + { + "epoch": 2.0, + "grad_norm": 25.573955535888672, + "learning_rate": 6.644354265155002e-06, + "loss": 0.8767, + "step": 15963 + }, + { + "epoch": 2.0, + "grad_norm": 14.740697860717773, + "learning_rate": 6.6435175500983155e-06, + "loss": 1.8486, + "step": 15964 + }, + { + "epoch": 2.0, + "grad_norm": 10.59947681427002, + "learning_rate": 6.642680835041627e-06, + "loss": 0.6213, + "step": 15965 + }, + { + "epoch": 2.0, + "grad_norm": 14.851434707641602, + "learning_rate": 6.641844119984939e-06, + "loss": 0.6295, + "step": 15966 + }, + { + "epoch": 2.0, + "grad_norm": 16.237768173217773, + "learning_rate": 6.641007404928252e-06, + "loss": 1.1565, + "step": 15967 + }, + { + "epoch": 2.0, + "grad_norm": 10.23038387298584, + "learning_rate": 6.640170689871564e-06, + "loss": 0.4564, + "step": 15968 + }, + { + "epoch": 2.0, + "grad_norm": 10.985434532165527, + "learning_rate": 6.639333974814878e-06, + "loss": 0.7974, + "step": 15969 + }, + { + "epoch": 2.0, + "grad_norm": 67.11539459228516, + "learning_rate": 6.63849725975819e-06, + "loss": 1.8991, + "step": 15970 + }, + { + "epoch": 2.0, + "grad_norm": 74.15109252929688, + "learning_rate": 6.637660544701502e-06, + "loss": 0.4018, + "step": 15971 + }, + { + "epoch": 2.0, + "grad_norm": 21.274198532104492, + "learning_rate": 6.636823829644815e-06, + "loss": 1.4911, + "step": 15972 + }, + { + "epoch": 2.0, + "grad_norm": 26.153928756713867, + "learning_rate": 6.635987114588127e-06, + "loss": 2.2189, + "step": 15973 + }, + { + "epoch": 2.0, + "grad_norm": 46.78422546386719, + "learning_rate": 6.63515039953144e-06, + "loss": 2.7653, + "step": 15974 + }, + { + "epoch": 2.0, + "grad_norm": 11.140388488769531, + "learning_rate": 6.634313684474752e-06, + "loss": 0.5892, + "step": 15975 + }, + { + "epoch": 2.0, + "grad_norm": 9.493162155151367, + "learning_rate": 6.633476969418066e-06, + "loss": 0.8846, + "step": 15976 + }, + { + "epoch": 2.01, + "grad_norm": 8.969060897827148, + "learning_rate": 6.632640254361378e-06, + "loss": 1.0295, + "step": 15977 + }, + { + "epoch": 2.01, + "grad_norm": 24.31886100769043, + "learning_rate": 6.63180353930469e-06, + "loss": 1.8822, + "step": 15978 + }, + { + "epoch": 2.01, + "grad_norm": 18.47286605834961, + "learning_rate": 6.630966824248003e-06, + "loss": 1.128, + "step": 15979 + }, + { + "epoch": 2.01, + "grad_norm": 8.595629692077637, + "learning_rate": 6.630130109191315e-06, + "loss": 1.3727, + "step": 15980 + }, + { + "epoch": 2.01, + "grad_norm": 10.602871894836426, + "learning_rate": 6.629293394134628e-06, + "loss": 2.0915, + "step": 15981 + }, + { + "epoch": 2.01, + "grad_norm": 5.2133660316467285, + "learning_rate": 6.62845667907794e-06, + "loss": 0.2119, + "step": 15982 + }, + { + "epoch": 2.01, + "grad_norm": 20.664487838745117, + "learning_rate": 6.627619964021254e-06, + "loss": 1.0672, + "step": 15983 + }, + { + "epoch": 2.01, + "grad_norm": 15.849124908447266, + "learning_rate": 6.6267832489645656e-06, + "loss": 3.5109, + "step": 15984 + }, + { + "epoch": 2.01, + "grad_norm": 5.37014627456665, + "learning_rate": 6.6259465339078775e-06, + "loss": 0.2146, + "step": 15985 + }, + { + "epoch": 2.01, + "grad_norm": 16.57113265991211, + "learning_rate": 6.625109818851191e-06, + "loss": 1.0359, + "step": 15986 + }, + { + "epoch": 2.01, + "grad_norm": 14.121919631958008, + "learning_rate": 6.624273103794503e-06, + "loss": 0.7774, + "step": 15987 + }, + { + "epoch": 2.01, + "grad_norm": 4.6017680168151855, + "learning_rate": 6.623436388737816e-06, + "loss": 0.3663, + "step": 15988 + }, + { + "epoch": 2.01, + "grad_norm": 11.929272651672363, + "learning_rate": 6.622599673681128e-06, + "loss": 1.2326, + "step": 15989 + }, + { + "epoch": 2.01, + "grad_norm": 24.052127838134766, + "learning_rate": 6.6217629586244415e-06, + "loss": 1.5549, + "step": 15990 + }, + { + "epoch": 2.01, + "grad_norm": 10.859208106994629, + "learning_rate": 6.6209262435677535e-06, + "loss": 0.5273, + "step": 15991 + }, + { + "epoch": 2.01, + "grad_norm": 17.65408706665039, + "learning_rate": 6.6200895285110654e-06, + "loss": 0.4234, + "step": 15992 + }, + { + "epoch": 2.01, + "grad_norm": 11.088404655456543, + "learning_rate": 6.619252813454379e-06, + "loss": 0.4453, + "step": 15993 + }, + { + "epoch": 2.01, + "grad_norm": 23.68683433532715, + "learning_rate": 6.618416098397691e-06, + "loss": 0.4521, + "step": 15994 + }, + { + "epoch": 2.01, + "grad_norm": 9.509688377380371, + "learning_rate": 6.617579383341004e-06, + "loss": 0.5721, + "step": 15995 + }, + { + "epoch": 2.01, + "grad_norm": 11.537019729614258, + "learning_rate": 6.616742668284316e-06, + "loss": 0.7805, + "step": 15996 + }, + { + "epoch": 2.01, + "grad_norm": 16.062152862548828, + "learning_rate": 6.6159059532276295e-06, + "loss": 0.8361, + "step": 15997 + }, + { + "epoch": 2.01, + "grad_norm": 8.554738998413086, + "learning_rate": 6.615069238170941e-06, + "loss": 1.0527, + "step": 15998 + }, + { + "epoch": 2.01, + "grad_norm": 27.2183895111084, + "learning_rate": 6.614232523114253e-06, + "loss": 0.4897, + "step": 15999 + }, + { + "epoch": 2.01, + "grad_norm": 8.592169761657715, + "learning_rate": 6.613395808057567e-06, + "loss": 0.9188, + "step": 16000 + }, + { + "epoch": 2.01, + "eval_loss": 0.08059647679328918, + "eval_runtime": 95.2297, + "eval_samples_per_second": 37.194, + "eval_steps_per_second": 37.194, + "step": 16000 + }, + { + "epoch": 2.01, + "grad_norm": 19.152080535888672, + "learning_rate": 6.612559093000879e-06, + "loss": 0.5195, + "step": 16001 + }, + { + "epoch": 2.01, + "grad_norm": 22.343149185180664, + "learning_rate": 6.611722377944192e-06, + "loss": 1.7367, + "step": 16002 + }, + { + "epoch": 2.01, + "grad_norm": 8.451386451721191, + "learning_rate": 6.610885662887504e-06, + "loss": 1.1314, + "step": 16003 + }, + { + "epoch": 2.01, + "grad_norm": 13.740487098693848, + "learning_rate": 6.610048947830817e-06, + "loss": 1.0626, + "step": 16004 + }, + { + "epoch": 2.01, + "grad_norm": 8.336570739746094, + "learning_rate": 6.609212232774129e-06, + "loss": 1.1301, + "step": 16005 + }, + { + "epoch": 2.01, + "grad_norm": 25.304929733276367, + "learning_rate": 6.608375517717441e-06, + "loss": 1.0049, + "step": 16006 + }, + { + "epoch": 2.01, + "grad_norm": 9.847472190856934, + "learning_rate": 6.607538802660755e-06, + "loss": 0.7213, + "step": 16007 + }, + { + "epoch": 2.01, + "grad_norm": 9.159112930297852, + "learning_rate": 6.606702087604067e-06, + "loss": 1.4592, + "step": 16008 + }, + { + "epoch": 2.01, + "grad_norm": 4.6362738609313965, + "learning_rate": 6.60586537254738e-06, + "loss": 0.2961, + "step": 16009 + }, + { + "epoch": 2.01, + "grad_norm": 117.4923095703125, + "learning_rate": 6.605028657490692e-06, + "loss": 1.3204, + "step": 16010 + }, + { + "epoch": 2.01, + "grad_norm": 18.80195426940918, + "learning_rate": 6.604191942434005e-06, + "loss": 1.7741, + "step": 16011 + }, + { + "epoch": 2.01, + "grad_norm": 11.38399887084961, + "learning_rate": 6.603355227377317e-06, + "loss": 0.3152, + "step": 16012 + }, + { + "epoch": 2.01, + "grad_norm": 10.227343559265137, + "learning_rate": 6.602518512320629e-06, + "loss": 0.4564, + "step": 16013 + }, + { + "epoch": 2.01, + "grad_norm": 17.74940299987793, + "learning_rate": 6.601681797263942e-06, + "loss": 1.1158, + "step": 16014 + }, + { + "epoch": 2.01, + "grad_norm": 10.877930641174316, + "learning_rate": 6.600845082207255e-06, + "loss": 0.3915, + "step": 16015 + }, + { + "epoch": 2.01, + "grad_norm": 8.845476150512695, + "learning_rate": 6.600008367150568e-06, + "loss": 1.0693, + "step": 16016 + }, + { + "epoch": 2.01, + "grad_norm": 13.181829452514648, + "learning_rate": 6.5991716520938796e-06, + "loss": 0.8126, + "step": 16017 + }, + { + "epoch": 2.01, + "grad_norm": 27.765037536621094, + "learning_rate": 6.598334937037193e-06, + "loss": 1.3239, + "step": 16018 + }, + { + "epoch": 2.01, + "grad_norm": 12.452431678771973, + "learning_rate": 6.597498221980505e-06, + "loss": 0.5701, + "step": 16019 + }, + { + "epoch": 2.01, + "grad_norm": 15.18510627746582, + "learning_rate": 6.596661506923817e-06, + "loss": 1.2167, + "step": 16020 + }, + { + "epoch": 2.01, + "grad_norm": 2.89917254447937, + "learning_rate": 6.59582479186713e-06, + "loss": 0.079, + "step": 16021 + }, + { + "epoch": 2.01, + "grad_norm": 45.87964630126953, + "learning_rate": 6.594988076810443e-06, + "loss": 0.3751, + "step": 16022 + }, + { + "epoch": 2.01, + "grad_norm": 6.7308173179626465, + "learning_rate": 6.5941513617537555e-06, + "loss": 0.2547, + "step": 16023 + }, + { + "epoch": 2.01, + "grad_norm": 6.030806064605713, + "learning_rate": 6.5933146466970675e-06, + "loss": 0.4948, + "step": 16024 + }, + { + "epoch": 2.01, + "grad_norm": 20.833017349243164, + "learning_rate": 6.592477931640381e-06, + "loss": 1.7271, + "step": 16025 + }, + { + "epoch": 2.01, + "grad_norm": 4.376020431518555, + "learning_rate": 6.591641216583693e-06, + "loss": 1.2232, + "step": 16026 + }, + { + "epoch": 2.01, + "grad_norm": 12.35608196258545, + "learning_rate": 6.590804501527005e-06, + "loss": 0.3531, + "step": 16027 + }, + { + "epoch": 2.01, + "grad_norm": 29.041461944580078, + "learning_rate": 6.589967786470318e-06, + "loss": 1.8665, + "step": 16028 + }, + { + "epoch": 2.01, + "grad_norm": 24.14327049255371, + "learning_rate": 6.589131071413631e-06, + "loss": 1.0578, + "step": 16029 + }, + { + "epoch": 2.01, + "grad_norm": 6.061190605163574, + "learning_rate": 6.5882943563569434e-06, + "loss": 0.237, + "step": 16030 + }, + { + "epoch": 2.01, + "grad_norm": 8.493118286132812, + "learning_rate": 6.587457641300255e-06, + "loss": 0.6232, + "step": 16031 + }, + { + "epoch": 2.01, + "grad_norm": 46.43001937866211, + "learning_rate": 6.586620926243569e-06, + "loss": 2.7755, + "step": 16032 + }, + { + "epoch": 2.01, + "grad_norm": 20.77440643310547, + "learning_rate": 6.585784211186881e-06, + "loss": 2.6907, + "step": 16033 + }, + { + "epoch": 2.01, + "grad_norm": 18.526092529296875, + "learning_rate": 6.584947496130193e-06, + "loss": 1.2859, + "step": 16034 + }, + { + "epoch": 2.01, + "grad_norm": 16.359085083007812, + "learning_rate": 6.584110781073506e-06, + "loss": 0.5416, + "step": 16035 + }, + { + "epoch": 2.01, + "grad_norm": 13.703718185424805, + "learning_rate": 6.5832740660168186e-06, + "loss": 2.1605, + "step": 16036 + }, + { + "epoch": 2.01, + "grad_norm": 26.011127471923828, + "learning_rate": 6.582437350960131e-06, + "loss": 1.2199, + "step": 16037 + }, + { + "epoch": 2.01, + "grad_norm": 12.94687557220459, + "learning_rate": 6.581600635903443e-06, + "loss": 1.2709, + "step": 16038 + }, + { + "epoch": 2.01, + "grad_norm": 4.392760276794434, + "learning_rate": 6.580763920846757e-06, + "loss": 0.3088, + "step": 16039 + }, + { + "epoch": 2.01, + "grad_norm": 28.460010528564453, + "learning_rate": 6.579927205790069e-06, + "loss": 0.3051, + "step": 16040 + }, + { + "epoch": 2.01, + "grad_norm": 6.393258094787598, + "learning_rate": 6.579090490733381e-06, + "loss": 0.2671, + "step": 16041 + }, + { + "epoch": 2.01, + "grad_norm": 19.735218048095703, + "learning_rate": 6.578253775676694e-06, + "loss": 0.9901, + "step": 16042 + }, + { + "epoch": 2.01, + "grad_norm": 7.071054458618164, + "learning_rate": 6.5774170606200065e-06, + "loss": 0.5651, + "step": 16043 + }, + { + "epoch": 2.01, + "grad_norm": 102.0082015991211, + "learning_rate": 6.576580345563319e-06, + "loss": 2.522, + "step": 16044 + }, + { + "epoch": 2.01, + "grad_norm": 40.931800842285156, + "learning_rate": 6.575743630506631e-06, + "loss": 0.9531, + "step": 16045 + }, + { + "epoch": 2.01, + "grad_norm": 10.436454772949219, + "learning_rate": 6.574906915449945e-06, + "loss": 0.5927, + "step": 16046 + }, + { + "epoch": 2.01, + "grad_norm": 10.416304588317871, + "learning_rate": 6.574070200393257e-06, + "loss": 0.6314, + "step": 16047 + }, + { + "epoch": 2.01, + "grad_norm": 13.067681312561035, + "learning_rate": 6.573233485336569e-06, + "loss": 0.65, + "step": 16048 + }, + { + "epoch": 2.01, + "grad_norm": 15.980208396911621, + "learning_rate": 6.572396770279882e-06, + "loss": 2.3473, + "step": 16049 + }, + { + "epoch": 2.01, + "grad_norm": 16.0473690032959, + "learning_rate": 6.5715600552231936e-06, + "loss": 1.1509, + "step": 16050 + }, + { + "epoch": 2.01, + "grad_norm": 8.895906448364258, + "learning_rate": 6.570723340166507e-06, + "loss": 0.8638, + "step": 16051 + }, + { + "epoch": 2.01, + "grad_norm": 8.34984016418457, + "learning_rate": 6.569886625109819e-06, + "loss": 0.5495, + "step": 16052 + }, + { + "epoch": 2.01, + "grad_norm": 9.876906394958496, + "learning_rate": 6.569049910053133e-06, + "loss": 1.6489, + "step": 16053 + }, + { + "epoch": 2.01, + "grad_norm": 107.1396713256836, + "learning_rate": 6.568213194996445e-06, + "loss": 1.6347, + "step": 16054 + }, + { + "epoch": 2.01, + "grad_norm": 55.78226089477539, + "learning_rate": 6.567376479939757e-06, + "loss": 1.3686, + "step": 16055 + }, + { + "epoch": 2.01, + "grad_norm": 17.969043731689453, + "learning_rate": 6.5665397648830695e-06, + "loss": 1.5541, + "step": 16056 + }, + { + "epoch": 2.02, + "grad_norm": 8.708976745605469, + "learning_rate": 6.5657030498263815e-06, + "loss": 1.0385, + "step": 16057 + }, + { + "epoch": 2.02, + "grad_norm": 9.779619216918945, + "learning_rate": 6.564866334769695e-06, + "loss": 0.4743, + "step": 16058 + }, + { + "epoch": 2.02, + "grad_norm": 6.712967395782471, + "learning_rate": 6.564029619713007e-06, + "loss": 0.381, + "step": 16059 + }, + { + "epoch": 2.02, + "grad_norm": 6.852048873901367, + "learning_rate": 6.563192904656321e-06, + "loss": 1.7019, + "step": 16060 + }, + { + "epoch": 2.02, + "grad_norm": 7.393848896026611, + "learning_rate": 6.562356189599633e-06, + "loss": 0.4451, + "step": 16061 + }, + { + "epoch": 2.02, + "grad_norm": 10.314332008361816, + "learning_rate": 6.561519474542945e-06, + "loss": 0.8183, + "step": 16062 + }, + { + "epoch": 2.02, + "grad_norm": 24.86186408996582, + "learning_rate": 6.5606827594862574e-06, + "loss": 1.3703, + "step": 16063 + }, + { + "epoch": 2.02, + "grad_norm": 5.029012680053711, + "learning_rate": 6.559846044429569e-06, + "loss": 0.0475, + "step": 16064 + }, + { + "epoch": 2.02, + "grad_norm": 12.916887283325195, + "learning_rate": 6.559009329372883e-06, + "loss": 0.6797, + "step": 16065 + }, + { + "epoch": 2.02, + "grad_norm": 7.170773029327393, + "learning_rate": 6.558172614316195e-06, + "loss": 0.7314, + "step": 16066 + }, + { + "epoch": 2.02, + "grad_norm": 11.910627365112305, + "learning_rate": 6.557335899259508e-06, + "loss": 0.3921, + "step": 16067 + }, + { + "epoch": 2.02, + "grad_norm": 40.78810501098633, + "learning_rate": 6.556499184202821e-06, + "loss": 1.2843, + "step": 16068 + }, + { + "epoch": 2.02, + "grad_norm": 35.77200698852539, + "learning_rate": 6.5556624691461325e-06, + "loss": 1.6284, + "step": 16069 + }, + { + "epoch": 2.02, + "grad_norm": 11.998297691345215, + "learning_rate": 6.554825754089445e-06, + "loss": 1.7564, + "step": 16070 + }, + { + "epoch": 2.02, + "grad_norm": 79.4481201171875, + "learning_rate": 6.553989039032757e-06, + "loss": 1.7479, + "step": 16071 + }, + { + "epoch": 2.02, + "grad_norm": 21.77023696899414, + "learning_rate": 6.553152323976071e-06, + "loss": 2.0174, + "step": 16072 + }, + { + "epoch": 2.02, + "grad_norm": 44.96331787109375, + "learning_rate": 6.552315608919383e-06, + "loss": 0.5996, + "step": 16073 + }, + { + "epoch": 2.02, + "grad_norm": 11.881548881530762, + "learning_rate": 6.551478893862696e-06, + "loss": 0.5137, + "step": 16074 + }, + { + "epoch": 2.02, + "grad_norm": 25.157161712646484, + "learning_rate": 6.5506421788060085e-06, + "loss": 1.3631, + "step": 16075 + }, + { + "epoch": 2.02, + "grad_norm": 7.388939380645752, + "learning_rate": 6.5498054637493205e-06, + "loss": 0.5749, + "step": 16076 + }, + { + "epoch": 2.02, + "grad_norm": 13.814678192138672, + "learning_rate": 6.548968748692633e-06, + "loss": 0.5423, + "step": 16077 + }, + { + "epoch": 2.02, + "grad_norm": 12.471866607666016, + "learning_rate": 6.548132033635945e-06, + "loss": 1.0481, + "step": 16078 + }, + { + "epoch": 2.02, + "grad_norm": 5.258547306060791, + "learning_rate": 6.547295318579259e-06, + "loss": 0.262, + "step": 16079 + }, + { + "epoch": 2.02, + "grad_norm": 14.908158302307129, + "learning_rate": 6.546458603522571e-06, + "loss": 1.0873, + "step": 16080 + }, + { + "epoch": 2.02, + "grad_norm": 11.934511184692383, + "learning_rate": 6.545621888465884e-06, + "loss": 0.7848, + "step": 16081 + }, + { + "epoch": 2.02, + "grad_norm": 19.13489532470703, + "learning_rate": 6.5447851734091964e-06, + "loss": 1.4351, + "step": 16082 + }, + { + "epoch": 2.02, + "grad_norm": 20.378049850463867, + "learning_rate": 6.543948458352508e-06, + "loss": 1.1822, + "step": 16083 + }, + { + "epoch": 2.02, + "grad_norm": 11.956856727600098, + "learning_rate": 6.543111743295821e-06, + "loss": 0.5044, + "step": 16084 + }, + { + "epoch": 2.02, + "grad_norm": 11.374585151672363, + "learning_rate": 6.542275028239133e-06, + "loss": 1.4749, + "step": 16085 + }, + { + "epoch": 2.02, + "grad_norm": 12.470864295959473, + "learning_rate": 6.541438313182447e-06, + "loss": 0.4487, + "step": 16086 + }, + { + "epoch": 2.02, + "grad_norm": 11.731393814086914, + "learning_rate": 6.540601598125759e-06, + "loss": 1.7252, + "step": 16087 + }, + { + "epoch": 2.02, + "grad_norm": 19.13770866394043, + "learning_rate": 6.5397648830690715e-06, + "loss": 0.5294, + "step": 16088 + }, + { + "epoch": 2.02, + "grad_norm": 9.0031156539917, + "learning_rate": 6.538928168012384e-06, + "loss": 0.3526, + "step": 16089 + }, + { + "epoch": 2.02, + "grad_norm": 8.142091751098633, + "learning_rate": 6.538091452955696e-06, + "loss": 0.4971, + "step": 16090 + }, + { + "epoch": 2.02, + "grad_norm": 10.265229225158691, + "learning_rate": 6.537254737899009e-06, + "loss": 0.6881, + "step": 16091 + }, + { + "epoch": 2.02, + "grad_norm": 9.502202987670898, + "learning_rate": 6.536418022842321e-06, + "loss": 0.3887, + "step": 16092 + }, + { + "epoch": 2.02, + "grad_norm": 11.04644775390625, + "learning_rate": 6.535581307785635e-06, + "loss": 0.411, + "step": 16093 + }, + { + "epoch": 2.02, + "grad_norm": 127.50508117675781, + "learning_rate": 6.534744592728947e-06, + "loss": 1.6468, + "step": 16094 + }, + { + "epoch": 2.02, + "grad_norm": 21.08706283569336, + "learning_rate": 6.5339078776722595e-06, + "loss": 1.0562, + "step": 16095 + }, + { + "epoch": 2.02, + "grad_norm": 22.2269344329834, + "learning_rate": 6.533071162615571e-06, + "loss": 0.7309, + "step": 16096 + }, + { + "epoch": 2.02, + "grad_norm": 16.021883010864258, + "learning_rate": 6.532234447558884e-06, + "loss": 0.8293, + "step": 16097 + }, + { + "epoch": 2.02, + "grad_norm": 22.984682083129883, + "learning_rate": 6.531397732502197e-06, + "loss": 1.6267, + "step": 16098 + }, + { + "epoch": 2.02, + "grad_norm": 8.846332550048828, + "learning_rate": 6.530561017445509e-06, + "loss": 0.7814, + "step": 16099 + }, + { + "epoch": 2.02, + "grad_norm": 13.226693153381348, + "learning_rate": 6.529724302388823e-06, + "loss": 0.393, + "step": 16100 + }, + { + "epoch": 2.02, + "grad_norm": 29.962265014648438, + "learning_rate": 6.528887587332135e-06, + "loss": 1.4385, + "step": 16101 + }, + { + "epoch": 2.02, + "grad_norm": 5.677152633666992, + "learning_rate": 6.528050872275447e-06, + "loss": 0.2706, + "step": 16102 + }, + { + "epoch": 2.02, + "grad_norm": 125.49024963378906, + "learning_rate": 6.527214157218759e-06, + "loss": 4.2126, + "step": 16103 + }, + { + "epoch": 2.02, + "grad_norm": 9.701367378234863, + "learning_rate": 6.526377442162072e-06, + "loss": 0.3329, + "step": 16104 + }, + { + "epoch": 2.02, + "grad_norm": 5.124175548553467, + "learning_rate": 6.525540727105385e-06, + "loss": 0.5724, + "step": 16105 + }, + { + "epoch": 2.02, + "grad_norm": 19.317773818969727, + "learning_rate": 6.524704012048697e-06, + "loss": 1.0071, + "step": 16106 + }, + { + "epoch": 2.02, + "grad_norm": 32.84058380126953, + "learning_rate": 6.5238672969920105e-06, + "loss": 1.9674, + "step": 16107 + }, + { + "epoch": 2.02, + "grad_norm": 60.97601318359375, + "learning_rate": 6.5230305819353225e-06, + "loss": 0.9619, + "step": 16108 + }, + { + "epoch": 2.02, + "grad_norm": 9.599262237548828, + "learning_rate": 6.522193866878635e-06, + "loss": 0.5553, + "step": 16109 + }, + { + "epoch": 2.02, + "grad_norm": 5.903003215789795, + "learning_rate": 6.521357151821947e-06, + "loss": 0.5936, + "step": 16110 + }, + { + "epoch": 2.02, + "grad_norm": 19.20045280456543, + "learning_rate": 6.52052043676526e-06, + "loss": 1.0966, + "step": 16111 + }, + { + "epoch": 2.02, + "grad_norm": 32.570472717285156, + "learning_rate": 6.519683721708573e-06, + "loss": 0.6197, + "step": 16112 + }, + { + "epoch": 2.02, + "grad_norm": 16.27768325805664, + "learning_rate": 6.518847006651885e-06, + "loss": 0.7894, + "step": 16113 + }, + { + "epoch": 2.02, + "grad_norm": 27.70846176147461, + "learning_rate": 6.5180102915951985e-06, + "loss": 1.7573, + "step": 16114 + }, + { + "epoch": 2.02, + "grad_norm": 96.64350128173828, + "learning_rate": 6.51717357653851e-06, + "loss": 1.8776, + "step": 16115 + }, + { + "epoch": 2.02, + "grad_norm": 10.895058631896973, + "learning_rate": 6.516336861481823e-06, + "loss": 2.8214, + "step": 16116 + }, + { + "epoch": 2.02, + "grad_norm": 23.313756942749023, + "learning_rate": 6.515500146425135e-06, + "loss": 2.5583, + "step": 16117 + }, + { + "epoch": 2.02, + "grad_norm": 30.129369735717773, + "learning_rate": 6.514663431368448e-06, + "loss": 2.4679, + "step": 16118 + }, + { + "epoch": 2.02, + "grad_norm": 18.207834243774414, + "learning_rate": 6.513826716311761e-06, + "loss": 1.3371, + "step": 16119 + }, + { + "epoch": 2.02, + "grad_norm": 17.993852615356445, + "learning_rate": 6.512990001255073e-06, + "loss": 1.3252, + "step": 16120 + }, + { + "epoch": 2.02, + "grad_norm": 13.623716354370117, + "learning_rate": 6.512153286198386e-06, + "loss": 0.4988, + "step": 16121 + }, + { + "epoch": 2.02, + "grad_norm": 5.604955196380615, + "learning_rate": 6.511316571141698e-06, + "loss": 0.1058, + "step": 16122 + }, + { + "epoch": 2.02, + "grad_norm": 9.774563789367676, + "learning_rate": 6.510479856085011e-06, + "loss": 0.8261, + "step": 16123 + }, + { + "epoch": 2.02, + "grad_norm": 9.117918014526367, + "learning_rate": 6.509643141028323e-06, + "loss": 1.2245, + "step": 16124 + }, + { + "epoch": 2.02, + "grad_norm": 14.01237964630127, + "learning_rate": 6.508806425971636e-06, + "loss": 0.8158, + "step": 16125 + }, + { + "epoch": 2.02, + "grad_norm": 259.6254577636719, + "learning_rate": 6.507969710914949e-06, + "loss": 1.7877, + "step": 16126 + }, + { + "epoch": 2.02, + "grad_norm": 63.59457778930664, + "learning_rate": 6.507132995858261e-06, + "loss": 1.2412, + "step": 16127 + }, + { + "epoch": 2.02, + "grad_norm": 10.758515357971191, + "learning_rate": 6.506296280801574e-06, + "loss": 0.8092, + "step": 16128 + }, + { + "epoch": 2.02, + "grad_norm": 19.523914337158203, + "learning_rate": 6.505459565744886e-06, + "loss": 1.5905, + "step": 16129 + }, + { + "epoch": 2.02, + "grad_norm": 5.093863010406494, + "learning_rate": 6.504622850688199e-06, + "loss": 0.2513, + "step": 16130 + }, + { + "epoch": 2.02, + "grad_norm": 4.924009799957275, + "learning_rate": 6.503786135631511e-06, + "loss": 0.279, + "step": 16131 + }, + { + "epoch": 2.02, + "grad_norm": 8.36949634552002, + "learning_rate": 6.502949420574823e-06, + "loss": 0.1637, + "step": 16132 + }, + { + "epoch": 2.02, + "grad_norm": 9.036155700683594, + "learning_rate": 6.502112705518137e-06, + "loss": 1.0372, + "step": 16133 + }, + { + "epoch": 2.02, + "grad_norm": 13.477315902709961, + "learning_rate": 6.5012759904614486e-06, + "loss": 1.1656, + "step": 16134 + }, + { + "epoch": 2.02, + "grad_norm": 5.973839282989502, + "learning_rate": 6.500439275404762e-06, + "loss": 0.16, + "step": 16135 + }, + { + "epoch": 2.03, + "grad_norm": 54.16847610473633, + "learning_rate": 6.499602560348074e-06, + "loss": 2.3118, + "step": 16136 + }, + { + "epoch": 2.03, + "grad_norm": 110.89966583251953, + "learning_rate": 6.498765845291387e-06, + "loss": 0.6806, + "step": 16137 + }, + { + "epoch": 2.03, + "grad_norm": 11.091172218322754, + "learning_rate": 6.497929130234699e-06, + "loss": 0.5446, + "step": 16138 + }, + { + "epoch": 2.03, + "grad_norm": 13.850701332092285, + "learning_rate": 6.497092415178011e-06, + "loss": 1.1616, + "step": 16139 + }, + { + "epoch": 2.03, + "grad_norm": 24.403318405151367, + "learning_rate": 6.4962557001213245e-06, + "loss": 1.1152, + "step": 16140 + }, + { + "epoch": 2.03, + "grad_norm": 1.9210470914840698, + "learning_rate": 6.4954189850646365e-06, + "loss": 0.0214, + "step": 16141 + }, + { + "epoch": 2.03, + "grad_norm": 6.3548784255981445, + "learning_rate": 6.49458227000795e-06, + "loss": 0.9924, + "step": 16142 + }, + { + "epoch": 2.03, + "grad_norm": 30.173181533813477, + "learning_rate": 6.493745554951262e-06, + "loss": 1.5111, + "step": 16143 + }, + { + "epoch": 2.03, + "grad_norm": 398.6379089355469, + "learning_rate": 6.492908839894575e-06, + "loss": 3.3468, + "step": 16144 + }, + { + "epoch": 2.03, + "grad_norm": 11.111824989318848, + "learning_rate": 6.492072124837887e-06, + "loss": 0.7169, + "step": 16145 + }, + { + "epoch": 2.03, + "grad_norm": 15.077082633972168, + "learning_rate": 6.491235409781199e-06, + "loss": 0.8733, + "step": 16146 + }, + { + "epoch": 2.03, + "grad_norm": 4.078538417816162, + "learning_rate": 6.4903986947245125e-06, + "loss": 0.2718, + "step": 16147 + }, + { + "epoch": 2.03, + "grad_norm": 7.101211071014404, + "learning_rate": 6.489561979667824e-06, + "loss": 0.5178, + "step": 16148 + }, + { + "epoch": 2.03, + "grad_norm": 5.255901336669922, + "learning_rate": 6.488725264611137e-06, + "loss": 0.8095, + "step": 16149 + }, + { + "epoch": 2.03, + "grad_norm": 8.656655311584473, + "learning_rate": 6.48788854955445e-06, + "loss": 0.6101, + "step": 16150 + }, + { + "epoch": 2.03, + "grad_norm": 14.041111946105957, + "learning_rate": 6.487051834497762e-06, + "loss": 1.4424, + "step": 16151 + }, + { + "epoch": 2.03, + "grad_norm": 9.727947235107422, + "learning_rate": 6.486215119441075e-06, + "loss": 2.4062, + "step": 16152 + }, + { + "epoch": 2.03, + "grad_norm": 19.479961395263672, + "learning_rate": 6.485378404384387e-06, + "loss": 2.2615, + "step": 16153 + }, + { + "epoch": 2.03, + "grad_norm": 6.5831732749938965, + "learning_rate": 6.4845416893277e-06, + "loss": 0.4318, + "step": 16154 + }, + { + "epoch": 2.03, + "grad_norm": 4.607565402984619, + "learning_rate": 6.483704974271012e-06, + "loss": 0.3844, + "step": 16155 + }, + { + "epoch": 2.03, + "grad_norm": 8.164737701416016, + "learning_rate": 6.482868259214325e-06, + "loss": 0.362, + "step": 16156 + }, + { + "epoch": 2.03, + "grad_norm": 9.221551895141602, + "learning_rate": 6.482031544157638e-06, + "loss": 0.8428, + "step": 16157 + }, + { + "epoch": 2.03, + "grad_norm": 9.317448616027832, + "learning_rate": 6.48119482910095e-06, + "loss": 0.5997, + "step": 16158 + }, + { + "epoch": 2.03, + "grad_norm": 19.246498107910156, + "learning_rate": 6.480358114044263e-06, + "loss": 1.2734, + "step": 16159 + }, + { + "epoch": 2.03, + "grad_norm": 8.3816499710083, + "learning_rate": 6.479521398987575e-06, + "loss": 0.4112, + "step": 16160 + }, + { + "epoch": 2.03, + "grad_norm": 4.023574352264404, + "learning_rate": 6.478684683930888e-06, + "loss": 0.1879, + "step": 16161 + }, + { + "epoch": 2.03, + "grad_norm": 21.623899459838867, + "learning_rate": 6.4778479688742e-06, + "loss": 1.7094, + "step": 16162 + }, + { + "epoch": 2.03, + "grad_norm": 12.556909561157227, + "learning_rate": 6.477011253817513e-06, + "loss": 1.462, + "step": 16163 + }, + { + "epoch": 2.03, + "grad_norm": 18.379074096679688, + "learning_rate": 6.476174538760826e-06, + "loss": 1.4132, + "step": 16164 + }, + { + "epoch": 2.03, + "grad_norm": 6.25236177444458, + "learning_rate": 6.475337823704138e-06, + "loss": 0.8684, + "step": 16165 + }, + { + "epoch": 2.03, + "grad_norm": 5.661009311676025, + "learning_rate": 6.474501108647451e-06, + "loss": 0.3523, + "step": 16166 + }, + { + "epoch": 2.03, + "grad_norm": 20.906024932861328, + "learning_rate": 6.4736643935907626e-06, + "loss": 1.8841, + "step": 16167 + }, + { + "epoch": 2.03, + "grad_norm": 20.141586303710938, + "learning_rate": 6.472827678534076e-06, + "loss": 0.3814, + "step": 16168 + }, + { + "epoch": 2.03, + "grad_norm": 11.476037979125977, + "learning_rate": 6.471990963477388e-06, + "loss": 0.5897, + "step": 16169 + }, + { + "epoch": 2.03, + "grad_norm": 8.756049156188965, + "learning_rate": 6.471154248420701e-06, + "loss": 0.6763, + "step": 16170 + }, + { + "epoch": 2.03, + "grad_norm": 11.013794898986816, + "learning_rate": 6.470317533364014e-06, + "loss": 0.8109, + "step": 16171 + }, + { + "epoch": 2.03, + "grad_norm": 3.914527177810669, + "learning_rate": 6.469480818307326e-06, + "loss": 0.1943, + "step": 16172 + }, + { + "epoch": 2.03, + "grad_norm": 10.756383895874023, + "learning_rate": 6.4686441032506385e-06, + "loss": 0.267, + "step": 16173 + }, + { + "epoch": 2.03, + "grad_norm": 12.76132869720459, + "learning_rate": 6.4678073881939505e-06, + "loss": 0.7225, + "step": 16174 + }, + { + "epoch": 2.03, + "grad_norm": 8.393716812133789, + "learning_rate": 6.466970673137264e-06, + "loss": 1.6324, + "step": 16175 + }, + { + "epoch": 2.03, + "grad_norm": 32.774295806884766, + "learning_rate": 6.466133958080576e-06, + "loss": 1.6458, + "step": 16176 + }, + { + "epoch": 2.03, + "grad_norm": 16.518552780151367, + "learning_rate": 6.465297243023889e-06, + "loss": 1.0371, + "step": 16177 + }, + { + "epoch": 2.03, + "grad_norm": 18.92826271057129, + "learning_rate": 6.464460527967201e-06, + "loss": 0.6249, + "step": 16178 + }, + { + "epoch": 2.03, + "grad_norm": 17.560800552368164, + "learning_rate": 6.463623812910514e-06, + "loss": 0.7807, + "step": 16179 + }, + { + "epoch": 2.03, + "grad_norm": 24.438623428344727, + "learning_rate": 6.4627870978538264e-06, + "loss": 1.0112, + "step": 16180 + }, + { + "epoch": 2.03, + "grad_norm": 6.223598957061768, + "learning_rate": 6.461950382797138e-06, + "loss": 0.3599, + "step": 16181 + }, + { + "epoch": 2.03, + "grad_norm": 7.577473163604736, + "learning_rate": 6.461113667740452e-06, + "loss": 0.3013, + "step": 16182 + }, + { + "epoch": 2.03, + "grad_norm": 20.38938331604004, + "learning_rate": 6.460276952683764e-06, + "loss": 0.8726, + "step": 16183 + }, + { + "epoch": 2.03, + "grad_norm": 124.90731811523438, + "learning_rate": 6.459440237627077e-06, + "loss": 3.3834, + "step": 16184 + }, + { + "epoch": 2.03, + "grad_norm": 77.51597595214844, + "learning_rate": 6.458603522570389e-06, + "loss": 2.6717, + "step": 16185 + }, + { + "epoch": 2.03, + "grad_norm": 2.382199764251709, + "learning_rate": 6.4577668075137016e-06, + "loss": 0.0814, + "step": 16186 + }, + { + "epoch": 2.03, + "grad_norm": 14.492742538452148, + "learning_rate": 6.456930092457014e-06, + "loss": 0.9572, + "step": 16187 + }, + { + "epoch": 2.03, + "grad_norm": 8.434473037719727, + "learning_rate": 6.456093377400326e-06, + "loss": 0.3047, + "step": 16188 + }, + { + "epoch": 2.03, + "grad_norm": 6.0500569343566895, + "learning_rate": 6.45525666234364e-06, + "loss": 0.4383, + "step": 16189 + }, + { + "epoch": 2.03, + "grad_norm": 4.993636608123779, + "learning_rate": 6.454419947286952e-06, + "loss": 1.2767, + "step": 16190 + }, + { + "epoch": 2.03, + "grad_norm": 14.795984268188477, + "learning_rate": 6.453583232230265e-06, + "loss": 0.6547, + "step": 16191 + }, + { + "epoch": 2.03, + "grad_norm": 27.816919326782227, + "learning_rate": 6.452746517173577e-06, + "loss": 3.139, + "step": 16192 + }, + { + "epoch": 2.03, + "grad_norm": 13.740135192871094, + "learning_rate": 6.4519098021168895e-06, + "loss": 1.9539, + "step": 16193 + }, + { + "epoch": 2.03, + "grad_norm": 22.25745391845703, + "learning_rate": 6.451073087060202e-06, + "loss": 0.6469, + "step": 16194 + }, + { + "epoch": 2.03, + "grad_norm": 20.07583236694336, + "learning_rate": 6.450236372003514e-06, + "loss": 1.3022, + "step": 16195 + }, + { + "epoch": 2.03, + "grad_norm": 36.75003433227539, + "learning_rate": 6.449399656946828e-06, + "loss": 3.4121, + "step": 16196 + }, + { + "epoch": 2.03, + "grad_norm": 9.228172302246094, + "learning_rate": 6.44856294189014e-06, + "loss": 0.6371, + "step": 16197 + }, + { + "epoch": 2.03, + "grad_norm": 34.50830078125, + "learning_rate": 6.447726226833453e-06, + "loss": 1.3297, + "step": 16198 + }, + { + "epoch": 2.03, + "grad_norm": 11.33741569519043, + "learning_rate": 6.446889511776765e-06, + "loss": 0.3887, + "step": 16199 + }, + { + "epoch": 2.03, + "grad_norm": 12.4901123046875, + "learning_rate": 6.446052796720077e-06, + "loss": 1.7242, + "step": 16200 + }, + { + "epoch": 2.03, + "grad_norm": 7.714563846588135, + "learning_rate": 6.44521608166339e-06, + "loss": 0.3376, + "step": 16201 + }, + { + "epoch": 2.03, + "grad_norm": 9.506186485290527, + "learning_rate": 6.444379366606702e-06, + "loss": 0.7857, + "step": 16202 + }, + { + "epoch": 2.03, + "grad_norm": 162.11468505859375, + "learning_rate": 6.443542651550016e-06, + "loss": 2.5083, + "step": 16203 + }, + { + "epoch": 2.03, + "grad_norm": 23.76825714111328, + "learning_rate": 6.442705936493328e-06, + "loss": 1.1915, + "step": 16204 + }, + { + "epoch": 2.03, + "grad_norm": 21.995790481567383, + "learning_rate": 6.4418692214366406e-06, + "loss": 0.8841, + "step": 16205 + }, + { + "epoch": 2.03, + "grad_norm": 21.455398559570312, + "learning_rate": 6.4410325063799525e-06, + "loss": 1.2645, + "step": 16206 + }, + { + "epoch": 2.03, + "grad_norm": 6.0819830894470215, + "learning_rate": 6.4401957913232645e-06, + "loss": 1.8939, + "step": 16207 + }, + { + "epoch": 2.03, + "grad_norm": 21.00604820251465, + "learning_rate": 6.439359076266578e-06, + "loss": 1.5358, + "step": 16208 + }, + { + "epoch": 2.03, + "grad_norm": 19.687803268432617, + "learning_rate": 6.43852236120989e-06, + "loss": 2.1922, + "step": 16209 + }, + { + "epoch": 2.03, + "grad_norm": 18.701154708862305, + "learning_rate": 6.437685646153204e-06, + "loss": 0.8948, + "step": 16210 + }, + { + "epoch": 2.03, + "grad_norm": 17.60835075378418, + "learning_rate": 6.436848931096516e-06, + "loss": 0.7196, + "step": 16211 + }, + { + "epoch": 2.03, + "grad_norm": 16.8372859954834, + "learning_rate": 6.4360122160398285e-06, + "loss": 0.7158, + "step": 16212 + }, + { + "epoch": 2.03, + "grad_norm": 4.943900108337402, + "learning_rate": 6.4351755009831404e-06, + "loss": 0.1508, + "step": 16213 + }, + { + "epoch": 2.03, + "grad_norm": 68.16217803955078, + "learning_rate": 6.434338785926452e-06, + "loss": 1.7712, + "step": 16214 + }, + { + "epoch": 2.03, + "grad_norm": 16.084169387817383, + "learning_rate": 6.433502070869766e-06, + "loss": 0.2979, + "step": 16215 + }, + { + "epoch": 2.04, + "grad_norm": 51.093727111816406, + "learning_rate": 6.432665355813078e-06, + "loss": 1.1893, + "step": 16216 + }, + { + "epoch": 2.04, + "grad_norm": 10.752718925476074, + "learning_rate": 6.431828640756392e-06, + "loss": 0.8647, + "step": 16217 + }, + { + "epoch": 2.04, + "grad_norm": 13.057160377502441, + "learning_rate": 6.430991925699704e-06, + "loss": 1.4833, + "step": 16218 + }, + { + "epoch": 2.04, + "grad_norm": 13.10116195678711, + "learning_rate": 6.430155210643016e-06, + "loss": 2.2656, + "step": 16219 + }, + { + "epoch": 2.04, + "grad_norm": 8.28217601776123, + "learning_rate": 6.429318495586328e-06, + "loss": 0.629, + "step": 16220 + }, + { + "epoch": 2.04, + "grad_norm": 9.03894329071045, + "learning_rate": 6.42848178052964e-06, + "loss": 1.5399, + "step": 16221 + }, + { + "epoch": 2.04, + "grad_norm": 4.5642266273498535, + "learning_rate": 6.427645065472954e-06, + "loss": 0.0701, + "step": 16222 + }, + { + "epoch": 2.04, + "grad_norm": 14.697591781616211, + "learning_rate": 6.426808350416266e-06, + "loss": 1.04, + "step": 16223 + }, + { + "epoch": 2.04, + "grad_norm": 11.729602813720703, + "learning_rate": 6.425971635359579e-06, + "loss": 1.0618, + "step": 16224 + }, + { + "epoch": 2.04, + "grad_norm": 51.51275634765625, + "learning_rate": 6.4251349203028915e-06, + "loss": 3.2666, + "step": 16225 + }, + { + "epoch": 2.04, + "grad_norm": 16.761516571044922, + "learning_rate": 6.424298205246204e-06, + "loss": 1.0147, + "step": 16226 + }, + { + "epoch": 2.04, + "grad_norm": 10.326380729675293, + "learning_rate": 6.423461490189516e-06, + "loss": 1.275, + "step": 16227 + }, + { + "epoch": 2.04, + "grad_norm": 6.452572822570801, + "learning_rate": 6.422624775132828e-06, + "loss": 0.7351, + "step": 16228 + }, + { + "epoch": 2.04, + "grad_norm": 37.921119689941406, + "learning_rate": 6.421788060076142e-06, + "loss": 1.6725, + "step": 16229 + }, + { + "epoch": 2.04, + "grad_norm": 14.596298217773438, + "learning_rate": 6.420951345019454e-06, + "loss": 2.1811, + "step": 16230 + }, + { + "epoch": 2.04, + "grad_norm": 69.2381820678711, + "learning_rate": 6.420114629962767e-06, + "loss": 1.6541, + "step": 16231 + }, + { + "epoch": 2.04, + "grad_norm": 1.9978963136672974, + "learning_rate": 6.4192779149060794e-06, + "loss": 0.0773, + "step": 16232 + }, + { + "epoch": 2.04, + "grad_norm": 18.001026153564453, + "learning_rate": 6.418441199849392e-06, + "loss": 1.02, + "step": 16233 + }, + { + "epoch": 2.04, + "grad_norm": 17.160947799682617, + "learning_rate": 6.417604484792704e-06, + "loss": 0.6446, + "step": 16234 + }, + { + "epoch": 2.04, + "grad_norm": 13.894946098327637, + "learning_rate": 6.416767769736016e-06, + "loss": 1.539, + "step": 16235 + }, + { + "epoch": 2.04, + "grad_norm": 325.5096435546875, + "learning_rate": 6.41593105467933e-06, + "loss": 1.2966, + "step": 16236 + }, + { + "epoch": 2.04, + "grad_norm": 6.154231071472168, + "learning_rate": 6.415094339622642e-06, + "loss": 0.6995, + "step": 16237 + }, + { + "epoch": 2.04, + "grad_norm": 14.135480880737305, + "learning_rate": 6.4142576245659545e-06, + "loss": 1.6384, + "step": 16238 + }, + { + "epoch": 2.04, + "grad_norm": 7.413065433502197, + "learning_rate": 6.413420909509267e-06, + "loss": 0.8911, + "step": 16239 + }, + { + "epoch": 2.04, + "grad_norm": 12.196990013122559, + "learning_rate": 6.41258419445258e-06, + "loss": 1.6185, + "step": 16240 + }, + { + "epoch": 2.04, + "grad_norm": 16.50975227355957, + "learning_rate": 6.411747479395892e-06, + "loss": 2.359, + "step": 16241 + }, + { + "epoch": 2.04, + "grad_norm": 15.77778148651123, + "learning_rate": 6.410910764339204e-06, + "loss": 1.4863, + "step": 16242 + }, + { + "epoch": 2.04, + "grad_norm": 13.257022857666016, + "learning_rate": 6.410074049282518e-06, + "loss": 0.2781, + "step": 16243 + }, + { + "epoch": 2.04, + "grad_norm": 27.746564865112305, + "learning_rate": 6.40923733422583e-06, + "loss": 1.7557, + "step": 16244 + }, + { + "epoch": 2.04, + "grad_norm": 16.584228515625, + "learning_rate": 6.4084006191691425e-06, + "loss": 0.9563, + "step": 16245 + }, + { + "epoch": 2.04, + "grad_norm": 13.610138893127441, + "learning_rate": 6.407563904112455e-06, + "loss": 0.6585, + "step": 16246 + }, + { + "epoch": 2.04, + "grad_norm": 9.752397537231445, + "learning_rate": 6.406727189055768e-06, + "loss": 0.2989, + "step": 16247 + }, + { + "epoch": 2.04, + "grad_norm": 9.259270668029785, + "learning_rate": 6.40589047399908e-06, + "loss": 0.2724, + "step": 16248 + }, + { + "epoch": 2.04, + "grad_norm": 17.941730499267578, + "learning_rate": 6.405053758942392e-06, + "loss": 0.6028, + "step": 16249 + }, + { + "epoch": 2.04, + "grad_norm": 7.572376728057861, + "learning_rate": 6.404217043885706e-06, + "loss": 0.273, + "step": 16250 + }, + { + "epoch": 2.04, + "grad_norm": 110.7977066040039, + "learning_rate": 6.403380328829018e-06, + "loss": 1.3644, + "step": 16251 + }, + { + "epoch": 2.04, + "grad_norm": 5.024694442749023, + "learning_rate": 6.40254361377233e-06, + "loss": 0.5423, + "step": 16252 + }, + { + "epoch": 2.04, + "grad_norm": 36.58980178833008, + "learning_rate": 6.401706898715643e-06, + "loss": 1.0159, + "step": 16253 + }, + { + "epoch": 2.04, + "grad_norm": 30.15936851501465, + "learning_rate": 6.400870183658956e-06, + "loss": 0.9333, + "step": 16254 + }, + { + "epoch": 2.04, + "grad_norm": 21.83393669128418, + "learning_rate": 6.400033468602268e-06, + "loss": 2.7222, + "step": 16255 + }, + { + "epoch": 2.04, + "grad_norm": 20.019332885742188, + "learning_rate": 6.39919675354558e-06, + "loss": 1.6089, + "step": 16256 + }, + { + "epoch": 2.04, + "grad_norm": 16.850595474243164, + "learning_rate": 6.3983600384888935e-06, + "loss": 1.8224, + "step": 16257 + }, + { + "epoch": 2.04, + "grad_norm": 9.293508529663086, + "learning_rate": 6.3975233234322055e-06, + "loss": 0.6111, + "step": 16258 + }, + { + "epoch": 2.04, + "grad_norm": 7.602249622344971, + "learning_rate": 6.396686608375518e-06, + "loss": 0.5085, + "step": 16259 + }, + { + "epoch": 2.04, + "grad_norm": 11.73695182800293, + "learning_rate": 6.39584989331883e-06, + "loss": 1.1122, + "step": 16260 + }, + { + "epoch": 2.04, + "grad_norm": 24.734926223754883, + "learning_rate": 6.395013178262144e-06, + "loss": 0.6938, + "step": 16261 + }, + { + "epoch": 2.04, + "grad_norm": 28.150827407836914, + "learning_rate": 6.394176463205456e-06, + "loss": 0.8636, + "step": 16262 + }, + { + "epoch": 2.04, + "grad_norm": 21.256317138671875, + "learning_rate": 6.393339748148768e-06, + "loss": 0.9196, + "step": 16263 + }, + { + "epoch": 2.04, + "grad_norm": 46.79572296142578, + "learning_rate": 6.3925030330920815e-06, + "loss": 1.6798, + "step": 16264 + }, + { + "epoch": 2.04, + "grad_norm": 8.550714492797852, + "learning_rate": 6.391666318035393e-06, + "loss": 0.7154, + "step": 16265 + }, + { + "epoch": 2.04, + "grad_norm": 5.677887439727783, + "learning_rate": 6.390829602978706e-06, + "loss": 0.3291, + "step": 16266 + }, + { + "epoch": 2.04, + "grad_norm": 8.84570026397705, + "learning_rate": 6.389992887922018e-06, + "loss": 0.6039, + "step": 16267 + }, + { + "epoch": 2.04, + "grad_norm": 14.745457649230957, + "learning_rate": 6.389156172865332e-06, + "loss": 1.8272, + "step": 16268 + }, + { + "epoch": 2.04, + "grad_norm": 13.759382247924805, + "learning_rate": 6.388319457808644e-06, + "loss": 0.6211, + "step": 16269 + }, + { + "epoch": 2.04, + "grad_norm": 19.65152931213379, + "learning_rate": 6.387482742751956e-06, + "loss": 0.6657, + "step": 16270 + }, + { + "epoch": 2.04, + "grad_norm": 16.69057846069336, + "learning_rate": 6.386646027695269e-06, + "loss": 0.7727, + "step": 16271 + }, + { + "epoch": 2.04, + "grad_norm": 11.009669303894043, + "learning_rate": 6.385809312638581e-06, + "loss": 0.8503, + "step": 16272 + }, + { + "epoch": 2.04, + "grad_norm": 14.906228065490723, + "learning_rate": 6.384972597581894e-06, + "loss": 1.5865, + "step": 16273 + }, + { + "epoch": 2.04, + "grad_norm": 52.21102523803711, + "learning_rate": 6.384135882525206e-06, + "loss": 1.8425, + "step": 16274 + }, + { + "epoch": 2.04, + "grad_norm": 14.800554275512695, + "learning_rate": 6.38329916746852e-06, + "loss": 0.6739, + "step": 16275 + }, + { + "epoch": 2.04, + "grad_norm": 11.220170974731445, + "learning_rate": 6.382462452411832e-06, + "loss": 0.4152, + "step": 16276 + }, + { + "epoch": 2.04, + "grad_norm": 26.87870216369629, + "learning_rate": 6.381625737355144e-06, + "loss": 0.6924, + "step": 16277 + }, + { + "epoch": 2.04, + "grad_norm": 14.723917007446289, + "learning_rate": 6.380789022298457e-06, + "loss": 1.1726, + "step": 16278 + }, + { + "epoch": 2.04, + "grad_norm": 17.075979232788086, + "learning_rate": 6.379952307241769e-06, + "loss": 0.7418, + "step": 16279 + }, + { + "epoch": 2.04, + "grad_norm": 4.230533599853516, + "learning_rate": 6.379115592185082e-06, + "loss": 0.4674, + "step": 16280 + }, + { + "epoch": 2.04, + "grad_norm": 10.00238037109375, + "learning_rate": 6.378278877128394e-06, + "loss": 0.2023, + "step": 16281 + }, + { + "epoch": 2.04, + "grad_norm": 10.267596244812012, + "learning_rate": 6.377442162071708e-06, + "loss": 1.499, + "step": 16282 + }, + { + "epoch": 2.04, + "grad_norm": 7.407226085662842, + "learning_rate": 6.37660544701502e-06, + "loss": 0.361, + "step": 16283 + }, + { + "epoch": 2.04, + "grad_norm": 14.427005767822266, + "learning_rate": 6.3757687319583316e-06, + "loss": 1.1869, + "step": 16284 + }, + { + "epoch": 2.04, + "grad_norm": 24.61405372619629, + "learning_rate": 6.374932016901645e-06, + "loss": 0.8257, + "step": 16285 + }, + { + "epoch": 2.04, + "grad_norm": 9.572637557983398, + "learning_rate": 6.374095301844957e-06, + "loss": 0.8062, + "step": 16286 + }, + { + "epoch": 2.04, + "grad_norm": 167.83787536621094, + "learning_rate": 6.37325858678827e-06, + "loss": 1.2678, + "step": 16287 + }, + { + "epoch": 2.04, + "grad_norm": 11.481160163879395, + "learning_rate": 6.372421871731582e-06, + "loss": 1.8921, + "step": 16288 + }, + { + "epoch": 2.04, + "grad_norm": 58.42168045043945, + "learning_rate": 6.371585156674896e-06, + "loss": 1.0874, + "step": 16289 + }, + { + "epoch": 2.04, + "grad_norm": 12.236416816711426, + "learning_rate": 6.3707484416182075e-06, + "loss": 0.5991, + "step": 16290 + }, + { + "epoch": 2.04, + "grad_norm": 1.3818658590316772, + "learning_rate": 6.3699117265615195e-06, + "loss": 0.0391, + "step": 16291 + }, + { + "epoch": 2.04, + "grad_norm": 38.054527282714844, + "learning_rate": 6.369075011504833e-06, + "loss": 1.0768, + "step": 16292 + }, + { + "epoch": 2.04, + "grad_norm": 11.2073974609375, + "learning_rate": 6.368238296448145e-06, + "loss": 0.9246, + "step": 16293 + }, + { + "epoch": 2.04, + "grad_norm": 12.246166229248047, + "learning_rate": 6.367401581391458e-06, + "loss": 0.547, + "step": 16294 + }, + { + "epoch": 2.04, + "grad_norm": 12.671717643737793, + "learning_rate": 6.36656486633477e-06, + "loss": 1.6035, + "step": 16295 + }, + { + "epoch": 2.05, + "grad_norm": 7.1969380378723145, + "learning_rate": 6.3657281512780835e-06, + "loss": 0.483, + "step": 16296 + }, + { + "epoch": 2.05, + "grad_norm": 40.141963958740234, + "learning_rate": 6.3648914362213955e-06, + "loss": 1.3982, + "step": 16297 + }, + { + "epoch": 2.05, + "grad_norm": 38.6901741027832, + "learning_rate": 6.364054721164707e-06, + "loss": 1.5976, + "step": 16298 + }, + { + "epoch": 2.05, + "grad_norm": 12.444235801696777, + "learning_rate": 6.363218006108021e-06, + "loss": 0.8923, + "step": 16299 + }, + { + "epoch": 2.05, + "grad_norm": 33.04055404663086, + "learning_rate": 6.362381291051333e-06, + "loss": 1.8274, + "step": 16300 + }, + { + "epoch": 2.05, + "grad_norm": 17.85938262939453, + "learning_rate": 6.361544575994646e-06, + "loss": 2.887, + "step": 16301 + }, + { + "epoch": 2.05, + "grad_norm": 15.372467041015625, + "learning_rate": 6.360707860937958e-06, + "loss": 1.3028, + "step": 16302 + }, + { + "epoch": 2.05, + "grad_norm": 19.73614501953125, + "learning_rate": 6.359871145881271e-06, + "loss": 1.0338, + "step": 16303 + }, + { + "epoch": 2.05, + "grad_norm": 13.654969215393066, + "learning_rate": 6.359034430824583e-06, + "loss": 2.9795, + "step": 16304 + }, + { + "epoch": 2.05, + "grad_norm": 19.09703826904297, + "learning_rate": 6.358197715767895e-06, + "loss": 1.8351, + "step": 16305 + }, + { + "epoch": 2.05, + "grad_norm": 5.372680187225342, + "learning_rate": 6.357361000711208e-06, + "loss": 1.0544, + "step": 16306 + }, + { + "epoch": 2.05, + "grad_norm": 6.331582546234131, + "learning_rate": 6.356524285654521e-06, + "loss": 0.7461, + "step": 16307 + }, + { + "epoch": 2.05, + "grad_norm": 13.04987907409668, + "learning_rate": 6.355687570597834e-06, + "loss": 0.7377, + "step": 16308 + }, + { + "epoch": 2.05, + "grad_norm": 13.980428695678711, + "learning_rate": 6.354850855541146e-06, + "loss": 0.9329, + "step": 16309 + }, + { + "epoch": 2.05, + "grad_norm": 19.196767807006836, + "learning_rate": 6.354014140484459e-06, + "loss": 0.8305, + "step": 16310 + }, + { + "epoch": 2.05, + "grad_norm": 18.332805633544922, + "learning_rate": 6.353177425427771e-06, + "loss": 1.2122, + "step": 16311 + }, + { + "epoch": 2.05, + "grad_norm": 13.444842338562012, + "learning_rate": 6.352340710371083e-06, + "loss": 1.1295, + "step": 16312 + }, + { + "epoch": 2.05, + "grad_norm": 67.86763000488281, + "learning_rate": 6.351503995314396e-06, + "loss": 1.1505, + "step": 16313 + }, + { + "epoch": 2.05, + "grad_norm": 17.4975643157959, + "learning_rate": 6.350667280257709e-06, + "loss": 1.8383, + "step": 16314 + }, + { + "epoch": 2.05, + "grad_norm": 8.981825828552246, + "learning_rate": 6.349830565201022e-06, + "loss": 0.3658, + "step": 16315 + }, + { + "epoch": 2.05, + "grad_norm": 9.776616096496582, + "learning_rate": 6.348993850144334e-06, + "loss": 0.5528, + "step": 16316 + }, + { + "epoch": 2.05, + "grad_norm": 63.22365188598633, + "learning_rate": 6.348157135087647e-06, + "loss": 1.9502, + "step": 16317 + }, + { + "epoch": 2.05, + "grad_norm": 77.63703155517578, + "learning_rate": 6.347320420030959e-06, + "loss": 0.7079, + "step": 16318 + }, + { + "epoch": 2.05, + "grad_norm": 4.89634370803833, + "learning_rate": 6.346483704974271e-06, + "loss": 0.2386, + "step": 16319 + }, + { + "epoch": 2.05, + "grad_norm": 15.108169555664062, + "learning_rate": 6.345646989917584e-06, + "loss": 0.5373, + "step": 16320 + }, + { + "epoch": 2.05, + "grad_norm": 12.114225387573242, + "learning_rate": 6.344810274860897e-06, + "loss": 0.7184, + "step": 16321 + }, + { + "epoch": 2.05, + "grad_norm": 6.531534671783447, + "learning_rate": 6.3439735598042096e-06, + "loss": 0.3666, + "step": 16322 + }, + { + "epoch": 2.05, + "grad_norm": 24.10402488708496, + "learning_rate": 6.3431368447475215e-06, + "loss": 0.578, + "step": 16323 + }, + { + "epoch": 2.05, + "grad_norm": 25.831274032592773, + "learning_rate": 6.3423001296908335e-06, + "loss": 0.9635, + "step": 16324 + }, + { + "epoch": 2.05, + "grad_norm": 20.153762817382812, + "learning_rate": 6.341463414634147e-06, + "loss": 0.6292, + "step": 16325 + }, + { + "epoch": 2.05, + "grad_norm": 21.914087295532227, + "learning_rate": 6.340626699577459e-06, + "loss": 0.7087, + "step": 16326 + }, + { + "epoch": 2.05, + "grad_norm": 19.947664260864258, + "learning_rate": 6.339789984520772e-06, + "loss": 0.5721, + "step": 16327 + }, + { + "epoch": 2.05, + "grad_norm": 7.1366376876831055, + "learning_rate": 6.338953269464085e-06, + "loss": 0.5614, + "step": 16328 + }, + { + "epoch": 2.05, + "grad_norm": 179.075927734375, + "learning_rate": 6.3381165544073975e-06, + "loss": 4.9287, + "step": 16329 + }, + { + "epoch": 2.05, + "grad_norm": 20.651126861572266, + "learning_rate": 6.3372798393507094e-06, + "loss": 1.4351, + "step": 16330 + }, + { + "epoch": 2.05, + "grad_norm": 26.97290802001953, + "learning_rate": 6.336443124294021e-06, + "loss": 2.8531, + "step": 16331 + }, + { + "epoch": 2.05, + "grad_norm": 8.706089973449707, + "learning_rate": 6.335606409237335e-06, + "loss": 0.8824, + "step": 16332 + }, + { + "epoch": 2.05, + "grad_norm": 5.200913429260254, + "learning_rate": 6.334769694180647e-06, + "loss": 0.1274, + "step": 16333 + }, + { + "epoch": 2.05, + "grad_norm": 28.518836975097656, + "learning_rate": 6.33393297912396e-06, + "loss": 1.2004, + "step": 16334 + }, + { + "epoch": 2.05, + "grad_norm": 7.6518025398254395, + "learning_rate": 6.333096264067272e-06, + "loss": 0.528, + "step": 16335 + }, + { + "epoch": 2.05, + "grad_norm": 39.421993255615234, + "learning_rate": 6.332259549010585e-06, + "loss": 3.1023, + "step": 16336 + }, + { + "epoch": 2.05, + "grad_norm": 9.441398620605469, + "learning_rate": 6.331422833953897e-06, + "loss": 1.2269, + "step": 16337 + }, + { + "epoch": 2.05, + "grad_norm": 10.061051368713379, + "learning_rate": 6.330586118897209e-06, + "loss": 1.3231, + "step": 16338 + }, + { + "epoch": 2.05, + "grad_norm": 7.994561195373535, + "learning_rate": 6.329749403840523e-06, + "loss": 0.4762, + "step": 16339 + }, + { + "epoch": 2.05, + "grad_norm": 30.543291091918945, + "learning_rate": 6.328912688783835e-06, + "loss": 1.2392, + "step": 16340 + }, + { + "epoch": 2.05, + "grad_norm": 11.687193870544434, + "learning_rate": 6.328075973727148e-06, + "loss": 2.0838, + "step": 16341 + }, + { + "epoch": 2.05, + "grad_norm": 11.96678352355957, + "learning_rate": 6.32723925867046e-06, + "loss": 0.4478, + "step": 16342 + }, + { + "epoch": 2.05, + "grad_norm": 5.491357803344727, + "learning_rate": 6.326402543613773e-06, + "loss": 0.9369, + "step": 16343 + }, + { + "epoch": 2.05, + "grad_norm": 9.527076721191406, + "learning_rate": 6.325565828557085e-06, + "loss": 0.6563, + "step": 16344 + }, + { + "epoch": 2.05, + "grad_norm": 31.517492294311523, + "learning_rate": 6.324729113500397e-06, + "loss": 1.1611, + "step": 16345 + }, + { + "epoch": 2.05, + "grad_norm": 9.010356903076172, + "learning_rate": 6.323892398443711e-06, + "loss": 0.5052, + "step": 16346 + }, + { + "epoch": 2.05, + "grad_norm": 28.53644561767578, + "learning_rate": 6.323055683387023e-06, + "loss": 2.9225, + "step": 16347 + }, + { + "epoch": 2.05, + "grad_norm": 35.999961853027344, + "learning_rate": 6.322218968330336e-06, + "loss": 1.7751, + "step": 16348 + }, + { + "epoch": 2.05, + "grad_norm": 19.59505844116211, + "learning_rate": 6.321382253273648e-06, + "loss": 0.9192, + "step": 16349 + }, + { + "epoch": 2.05, + "grad_norm": 19.588050842285156, + "learning_rate": 6.320545538216961e-06, + "loss": 0.5096, + "step": 16350 + }, + { + "epoch": 2.05, + "grad_norm": 6.057505130767822, + "learning_rate": 6.319708823160273e-06, + "loss": 0.2356, + "step": 16351 + }, + { + "epoch": 2.05, + "grad_norm": 5.174546718597412, + "learning_rate": 6.318872108103585e-06, + "loss": 0.3658, + "step": 16352 + }, + { + "epoch": 2.05, + "grad_norm": 11.382641792297363, + "learning_rate": 6.318035393046899e-06, + "loss": 1.3706, + "step": 16353 + }, + { + "epoch": 2.05, + "grad_norm": 8.718911170959473, + "learning_rate": 6.317198677990211e-06, + "loss": 0.762, + "step": 16354 + }, + { + "epoch": 2.05, + "grad_norm": 15.025772094726562, + "learning_rate": 6.3163619629335236e-06, + "loss": 0.956, + "step": 16355 + }, + { + "epoch": 2.05, + "grad_norm": 20.396451950073242, + "learning_rate": 6.3155252478768355e-06, + "loss": 2.0995, + "step": 16356 + }, + { + "epoch": 2.05, + "grad_norm": 16.324954986572266, + "learning_rate": 6.314688532820149e-06, + "loss": 0.6291, + "step": 16357 + }, + { + "epoch": 2.05, + "grad_norm": 11.061717987060547, + "learning_rate": 6.313851817763461e-06, + "loss": 1.262, + "step": 16358 + }, + { + "epoch": 2.05, + "grad_norm": 12.975667953491211, + "learning_rate": 6.313015102706773e-06, + "loss": 0.5564, + "step": 16359 + }, + { + "epoch": 2.05, + "grad_norm": 53.27537155151367, + "learning_rate": 6.312178387650087e-06, + "loss": 0.4769, + "step": 16360 + }, + { + "epoch": 2.05, + "grad_norm": 5.7657904624938965, + "learning_rate": 6.311341672593399e-06, + "loss": 0.3185, + "step": 16361 + }, + { + "epoch": 2.05, + "grad_norm": 20.796558380126953, + "learning_rate": 6.3105049575367115e-06, + "loss": 1.8968, + "step": 16362 + }, + { + "epoch": 2.05, + "grad_norm": 49.964481353759766, + "learning_rate": 6.3096682424800234e-06, + "loss": 0.6971, + "step": 16363 + }, + { + "epoch": 2.05, + "grad_norm": 17.00904655456543, + "learning_rate": 6.308831527423337e-06, + "loss": 0.9146, + "step": 16364 + }, + { + "epoch": 2.05, + "grad_norm": 13.459504127502441, + "learning_rate": 6.307994812366649e-06, + "loss": 2.1822, + "step": 16365 + }, + { + "epoch": 2.05, + "grad_norm": 22.19635772705078, + "learning_rate": 6.307158097309961e-06, + "loss": 1.0316, + "step": 16366 + }, + { + "epoch": 2.05, + "grad_norm": 17.777423858642578, + "learning_rate": 6.306321382253275e-06, + "loss": 0.6407, + "step": 16367 + }, + { + "epoch": 2.05, + "grad_norm": 29.012802124023438, + "learning_rate": 6.305484667196587e-06, + "loss": 0.9657, + "step": 16368 + }, + { + "epoch": 2.05, + "grad_norm": 14.237957954406738, + "learning_rate": 6.304647952139899e-06, + "loss": 0.9197, + "step": 16369 + }, + { + "epoch": 2.05, + "grad_norm": 12.470968246459961, + "learning_rate": 6.303811237083211e-06, + "loss": 0.5394, + "step": 16370 + }, + { + "epoch": 2.05, + "grad_norm": 14.259575843811035, + "learning_rate": 6.302974522026525e-06, + "loss": 0.7527, + "step": 16371 + }, + { + "epoch": 2.05, + "grad_norm": 166.35455322265625, + "learning_rate": 6.302137806969837e-06, + "loss": 1.5979, + "step": 16372 + }, + { + "epoch": 2.05, + "grad_norm": 7.62042760848999, + "learning_rate": 6.301301091913149e-06, + "loss": 0.6344, + "step": 16373 + }, + { + "epoch": 2.05, + "grad_norm": 16.454174041748047, + "learning_rate": 6.3004643768564626e-06, + "loss": 1.7882, + "step": 16374 + }, + { + "epoch": 2.06, + "grad_norm": 27.83538818359375, + "learning_rate": 6.2996276617997745e-06, + "loss": 1.9177, + "step": 16375 + }, + { + "epoch": 2.06, + "grad_norm": 270.7811584472656, + "learning_rate": 6.298790946743087e-06, + "loss": 1.335, + "step": 16376 + }, + { + "epoch": 2.06, + "grad_norm": 27.523265838623047, + "learning_rate": 6.297954231686399e-06, + "loss": 1.2153, + "step": 16377 + }, + { + "epoch": 2.06, + "grad_norm": 47.53352737426758, + "learning_rate": 6.297117516629713e-06, + "loss": 1.9278, + "step": 16378 + }, + { + "epoch": 2.06, + "grad_norm": 30.873390197753906, + "learning_rate": 6.296280801573025e-06, + "loss": 0.7341, + "step": 16379 + }, + { + "epoch": 2.06, + "grad_norm": 14.739265441894531, + "learning_rate": 6.295444086516337e-06, + "loss": 1.216, + "step": 16380 + }, + { + "epoch": 2.06, + "grad_norm": 23.66485595703125, + "learning_rate": 6.2946073714596505e-06, + "loss": 4.2067, + "step": 16381 + }, + { + "epoch": 2.06, + "grad_norm": 11.664165496826172, + "learning_rate": 6.2937706564029624e-06, + "loss": 0.6, + "step": 16382 + }, + { + "epoch": 2.06, + "grad_norm": 44.6564826965332, + "learning_rate": 6.292933941346275e-06, + "loss": 1.4229, + "step": 16383 + }, + { + "epoch": 2.06, + "grad_norm": 5.448591232299805, + "learning_rate": 6.292097226289587e-06, + "loss": 0.4441, + "step": 16384 + }, + { + "epoch": 2.06, + "grad_norm": 18.735206604003906, + "learning_rate": 6.291260511232901e-06, + "loss": 0.7372, + "step": 16385 + }, + { + "epoch": 2.06, + "grad_norm": 55.1180534362793, + "learning_rate": 6.290423796176213e-06, + "loss": 2.4001, + "step": 16386 + }, + { + "epoch": 2.06, + "grad_norm": 13.985992431640625, + "learning_rate": 6.289587081119525e-06, + "loss": 0.9724, + "step": 16387 + }, + { + "epoch": 2.06, + "grad_norm": 12.777236938476562, + "learning_rate": 6.2887503660628375e-06, + "loss": 0.6601, + "step": 16388 + }, + { + "epoch": 2.06, + "grad_norm": 9.22414493560791, + "learning_rate": 6.28791365100615e-06, + "loss": 0.85, + "step": 16389 + }, + { + "epoch": 2.06, + "grad_norm": 4.828909873962402, + "learning_rate": 6.287076935949463e-06, + "loss": 0.2522, + "step": 16390 + }, + { + "epoch": 2.06, + "grad_norm": 6.063150405883789, + "learning_rate": 6.286240220892775e-06, + "loss": 0.3322, + "step": 16391 + }, + { + "epoch": 2.06, + "grad_norm": 24.44740104675293, + "learning_rate": 6.285403505836089e-06, + "loss": 1.1274, + "step": 16392 + }, + { + "epoch": 2.06, + "grad_norm": 4.613438129425049, + "learning_rate": 6.284566790779401e-06, + "loss": 0.5376, + "step": 16393 + }, + { + "epoch": 2.06, + "grad_norm": 6.197526454925537, + "learning_rate": 6.283730075722713e-06, + "loss": 0.3193, + "step": 16394 + }, + { + "epoch": 2.06, + "grad_norm": 12.503471374511719, + "learning_rate": 6.2828933606660255e-06, + "loss": 0.7365, + "step": 16395 + }, + { + "epoch": 2.06, + "grad_norm": 4.639339923858643, + "learning_rate": 6.282056645609338e-06, + "loss": 0.3307, + "step": 16396 + }, + { + "epoch": 2.06, + "grad_norm": 9.989636421203613, + "learning_rate": 6.281219930552651e-06, + "loss": 0.6942, + "step": 16397 + }, + { + "epoch": 2.06, + "grad_norm": 8.763270378112793, + "learning_rate": 6.280383215495963e-06, + "loss": 0.5278, + "step": 16398 + }, + { + "epoch": 2.06, + "grad_norm": 28.691089630126953, + "learning_rate": 6.279546500439277e-06, + "loss": 2.196, + "step": 16399 + }, + { + "epoch": 2.06, + "grad_norm": 24.95621681213379, + "learning_rate": 6.278709785382589e-06, + "loss": 1.4303, + "step": 16400 + }, + { + "epoch": 2.06, + "eval_loss": 0.08141389489173889, + "eval_runtime": 95.1344, + "eval_samples_per_second": 37.232, + "eval_steps_per_second": 37.232, + "step": 16400 + }, + { + "epoch": 2.06, + "grad_norm": 85.77784729003906, + "learning_rate": 6.277873070325901e-06, + "loss": 5.2265, + "step": 16401 + }, + { + "epoch": 2.06, + "grad_norm": 18.586429595947266, + "learning_rate": 6.277036355269213e-06, + "loss": 1.6389, + "step": 16402 + }, + { + "epoch": 2.06, + "grad_norm": 26.477685928344727, + "learning_rate": 6.276199640212526e-06, + "loss": 1.7303, + "step": 16403 + }, + { + "epoch": 2.06, + "grad_norm": 8.007719039916992, + "learning_rate": 6.275362925155839e-06, + "loss": 1.2385, + "step": 16404 + }, + { + "epoch": 2.06, + "grad_norm": 13.247702598571777, + "learning_rate": 6.274526210099151e-06, + "loss": 2.375, + "step": 16405 + }, + { + "epoch": 2.06, + "grad_norm": 5.825837135314941, + "learning_rate": 6.273689495042465e-06, + "loss": 0.2625, + "step": 16406 + }, + { + "epoch": 2.06, + "grad_norm": 15.234128952026367, + "learning_rate": 6.2728527799857765e-06, + "loss": 1.317, + "step": 16407 + }, + { + "epoch": 2.06, + "grad_norm": 13.170249938964844, + "learning_rate": 6.2720160649290885e-06, + "loss": 1.2648, + "step": 16408 + }, + { + "epoch": 2.06, + "grad_norm": 5.402313232421875, + "learning_rate": 6.271179349872401e-06, + "loss": 0.1905, + "step": 16409 + }, + { + "epoch": 2.06, + "grad_norm": 16.913536071777344, + "learning_rate": 6.270342634815714e-06, + "loss": 0.4488, + "step": 16410 + }, + { + "epoch": 2.06, + "grad_norm": 10.359716415405273, + "learning_rate": 6.269505919759027e-06, + "loss": 0.7147, + "step": 16411 + }, + { + "epoch": 2.06, + "grad_norm": 27.842947006225586, + "learning_rate": 6.268669204702339e-06, + "loss": 0.3549, + "step": 16412 + }, + { + "epoch": 2.06, + "grad_norm": 47.443660736083984, + "learning_rate": 6.2678324896456525e-06, + "loss": 1.9468, + "step": 16413 + }, + { + "epoch": 2.06, + "grad_norm": 20.700239181518555, + "learning_rate": 6.2669957745889645e-06, + "loss": 0.3877, + "step": 16414 + }, + { + "epoch": 2.06, + "grad_norm": 9.135885238647461, + "learning_rate": 6.266159059532276e-06, + "loss": 0.5151, + "step": 16415 + }, + { + "epoch": 2.06, + "grad_norm": 5.4493727684021, + "learning_rate": 6.265322344475589e-06, + "loss": 0.8757, + "step": 16416 + }, + { + "epoch": 2.06, + "grad_norm": 17.44921875, + "learning_rate": 6.264485629418901e-06, + "loss": 1.7656, + "step": 16417 + }, + { + "epoch": 2.06, + "grad_norm": 5.254001617431641, + "learning_rate": 6.263648914362215e-06, + "loss": 0.4605, + "step": 16418 + }, + { + "epoch": 2.06, + "grad_norm": 5.9415082931518555, + "learning_rate": 6.262812199305527e-06, + "loss": 0.2035, + "step": 16419 + }, + { + "epoch": 2.06, + "grad_norm": 27.944520950317383, + "learning_rate": 6.2619754842488404e-06, + "loss": 2.6041, + "step": 16420 + }, + { + "epoch": 2.06, + "grad_norm": 6.755660533905029, + "learning_rate": 6.261138769192152e-06, + "loss": 0.1938, + "step": 16421 + }, + { + "epoch": 2.06, + "grad_norm": 20.80733299255371, + "learning_rate": 6.260302054135464e-06, + "loss": 1.573, + "step": 16422 + }, + { + "epoch": 2.06, + "grad_norm": 16.19023895263672, + "learning_rate": 6.259465339078777e-06, + "loss": 1.1644, + "step": 16423 + }, + { + "epoch": 2.06, + "grad_norm": 6.79762601852417, + "learning_rate": 6.258628624022089e-06, + "loss": 0.312, + "step": 16424 + }, + { + "epoch": 2.06, + "grad_norm": 14.102092742919922, + "learning_rate": 6.257791908965403e-06, + "loss": 1.3472, + "step": 16425 + }, + { + "epoch": 2.06, + "grad_norm": 9.547575950622559, + "learning_rate": 6.256955193908715e-06, + "loss": 0.4151, + "step": 16426 + }, + { + "epoch": 2.06, + "grad_norm": 7.325801849365234, + "learning_rate": 6.256118478852028e-06, + "loss": 1.101, + "step": 16427 + }, + { + "epoch": 2.06, + "grad_norm": 8.511363983154297, + "learning_rate": 6.25528176379534e-06, + "loss": 0.5515, + "step": 16428 + }, + { + "epoch": 2.06, + "grad_norm": 34.52033233642578, + "learning_rate": 6.254445048738652e-06, + "loss": 2.2967, + "step": 16429 + }, + { + "epoch": 2.06, + "grad_norm": 15.332392692565918, + "learning_rate": 6.253608333681965e-06, + "loss": 1.3402, + "step": 16430 + }, + { + "epoch": 2.06, + "grad_norm": 43.08570098876953, + "learning_rate": 6.252771618625277e-06, + "loss": 0.66, + "step": 16431 + }, + { + "epoch": 2.06, + "grad_norm": 10.303796768188477, + "learning_rate": 6.251934903568591e-06, + "loss": 0.4777, + "step": 16432 + }, + { + "epoch": 2.06, + "grad_norm": 13.234012603759766, + "learning_rate": 6.251098188511903e-06, + "loss": 0.8158, + "step": 16433 + }, + { + "epoch": 2.06, + "grad_norm": 14.474245071411133, + "learning_rate": 6.250261473455215e-06, + "loss": 0.6216, + "step": 16434 + }, + { + "epoch": 2.06, + "grad_norm": 80.25108337402344, + "learning_rate": 6.249424758398528e-06, + "loss": 1.3335, + "step": 16435 + }, + { + "epoch": 2.06, + "grad_norm": 2.7391889095306396, + "learning_rate": 6.24858804334184e-06, + "loss": 0.0587, + "step": 16436 + }, + { + "epoch": 2.06, + "grad_norm": 8.63875961303711, + "learning_rate": 6.247751328285153e-06, + "loss": 0.7776, + "step": 16437 + }, + { + "epoch": 2.06, + "grad_norm": 19.064735412597656, + "learning_rate": 6.246914613228465e-06, + "loss": 0.6267, + "step": 16438 + }, + { + "epoch": 2.06, + "grad_norm": 28.71219825744629, + "learning_rate": 6.246077898171779e-06, + "loss": 0.9808, + "step": 16439 + }, + { + "epoch": 2.06, + "grad_norm": 132.15008544921875, + "learning_rate": 6.2452411831150905e-06, + "loss": 3.8898, + "step": 16440 + }, + { + "epoch": 2.06, + "grad_norm": 67.4249496459961, + "learning_rate": 6.244404468058403e-06, + "loss": 1.7181, + "step": 16441 + }, + { + "epoch": 2.06, + "grad_norm": 27.999494552612305, + "learning_rate": 6.243567753001716e-06, + "loss": 0.3562, + "step": 16442 + }, + { + "epoch": 2.06, + "grad_norm": 2.9176435470581055, + "learning_rate": 6.242731037945028e-06, + "loss": 0.2938, + "step": 16443 + }, + { + "epoch": 2.06, + "grad_norm": 15.687005996704102, + "learning_rate": 6.241894322888341e-06, + "loss": 1.2571, + "step": 16444 + }, + { + "epoch": 2.06, + "grad_norm": 7.77381706237793, + "learning_rate": 6.241057607831653e-06, + "loss": 1.7553, + "step": 16445 + }, + { + "epoch": 2.06, + "grad_norm": 4.529983043670654, + "learning_rate": 6.2402208927749665e-06, + "loss": 0.2274, + "step": 16446 + }, + { + "epoch": 2.06, + "grad_norm": 13.678400039672852, + "learning_rate": 6.2393841777182785e-06, + "loss": 1.8998, + "step": 16447 + }, + { + "epoch": 2.06, + "grad_norm": 13.9041748046875, + "learning_rate": 6.238547462661591e-06, + "loss": 2.8395, + "step": 16448 + }, + { + "epoch": 2.06, + "grad_norm": 31.396278381347656, + "learning_rate": 6.237710747604904e-06, + "loss": 0.3692, + "step": 16449 + }, + { + "epoch": 2.06, + "grad_norm": 8.526172637939453, + "learning_rate": 6.236874032548216e-06, + "loss": 0.8206, + "step": 16450 + }, + { + "epoch": 2.06, + "grad_norm": 7.275291442871094, + "learning_rate": 6.236037317491529e-06, + "loss": 0.3681, + "step": 16451 + }, + { + "epoch": 2.06, + "grad_norm": 15.358269691467285, + "learning_rate": 6.235200602434841e-06, + "loss": 1.2854, + "step": 16452 + }, + { + "epoch": 2.06, + "grad_norm": 1.6213921308517456, + "learning_rate": 6.234363887378154e-06, + "loss": 0.0466, + "step": 16453 + }, + { + "epoch": 2.06, + "grad_norm": 9.204558372497559, + "learning_rate": 6.233527172321466e-06, + "loss": 0.4216, + "step": 16454 + }, + { + "epoch": 2.07, + "grad_norm": 17.904741287231445, + "learning_rate": 6.232690457264779e-06, + "loss": 0.8174, + "step": 16455 + }, + { + "epoch": 2.07, + "grad_norm": 31.63105583190918, + "learning_rate": 6.231853742208092e-06, + "loss": 1.1253, + "step": 16456 + }, + { + "epoch": 2.07, + "grad_norm": 12.756241798400879, + "learning_rate": 6.231017027151404e-06, + "loss": 0.3802, + "step": 16457 + }, + { + "epoch": 2.07, + "grad_norm": 12.611263275146484, + "learning_rate": 6.230180312094717e-06, + "loss": 1.9548, + "step": 16458 + }, + { + "epoch": 2.07, + "grad_norm": 27.46123695373535, + "learning_rate": 6.229343597038029e-06, + "loss": 2.2183, + "step": 16459 + }, + { + "epoch": 2.07, + "grad_norm": 28.820478439331055, + "learning_rate": 6.228506881981342e-06, + "loss": 3.7846, + "step": 16460 + }, + { + "epoch": 2.07, + "grad_norm": 13.83780574798584, + "learning_rate": 6.227670166924654e-06, + "loss": 0.4726, + "step": 16461 + }, + { + "epoch": 2.07, + "grad_norm": 11.4133939743042, + "learning_rate": 6.226833451867967e-06, + "loss": 0.9803, + "step": 16462 + }, + { + "epoch": 2.07, + "grad_norm": 10.623127937316895, + "learning_rate": 6.22599673681128e-06, + "loss": 1.0674, + "step": 16463 + }, + { + "epoch": 2.07, + "grad_norm": 47.73965072631836, + "learning_rate": 6.225160021754592e-06, + "loss": 0.5572, + "step": 16464 + }, + { + "epoch": 2.07, + "grad_norm": 26.700315475463867, + "learning_rate": 6.224323306697905e-06, + "loss": 1.035, + "step": 16465 + }, + { + "epoch": 2.07, + "grad_norm": 9.991043090820312, + "learning_rate": 6.223486591641217e-06, + "loss": 0.4435, + "step": 16466 + }, + { + "epoch": 2.07, + "grad_norm": 7.870398044586182, + "learning_rate": 6.22264987658453e-06, + "loss": 0.3138, + "step": 16467 + }, + { + "epoch": 2.07, + "grad_norm": 45.63023376464844, + "learning_rate": 6.221813161527842e-06, + "loss": 0.3552, + "step": 16468 + }, + { + "epoch": 2.07, + "grad_norm": 49.17428970336914, + "learning_rate": 6.220976446471155e-06, + "loss": 3.2582, + "step": 16469 + }, + { + "epoch": 2.07, + "grad_norm": 10.028911590576172, + "learning_rate": 6.220139731414467e-06, + "loss": 1.1186, + "step": 16470 + }, + { + "epoch": 2.07, + "grad_norm": 12.706177711486816, + "learning_rate": 6.21930301635778e-06, + "loss": 1.1401, + "step": 16471 + }, + { + "epoch": 2.07, + "grad_norm": 2.1864635944366455, + "learning_rate": 6.2184663013010926e-06, + "loss": 0.1269, + "step": 16472 + }, + { + "epoch": 2.07, + "grad_norm": 14.817275047302246, + "learning_rate": 6.2176295862444045e-06, + "loss": 0.8975, + "step": 16473 + }, + { + "epoch": 2.07, + "grad_norm": 18.514623641967773, + "learning_rate": 6.216792871187718e-06, + "loss": 0.542, + "step": 16474 + }, + { + "epoch": 2.07, + "grad_norm": 21.610883712768555, + "learning_rate": 6.21595615613103e-06, + "loss": 2.3149, + "step": 16475 + }, + { + "epoch": 2.07, + "grad_norm": 21.595613479614258, + "learning_rate": 6.215119441074343e-06, + "loss": 2.1623, + "step": 16476 + }, + { + "epoch": 2.07, + "grad_norm": 18.9511661529541, + "learning_rate": 6.214282726017655e-06, + "loss": 0.5788, + "step": 16477 + }, + { + "epoch": 2.07, + "grad_norm": 8.170537948608398, + "learning_rate": 6.213446010960968e-06, + "loss": 1.1928, + "step": 16478 + }, + { + "epoch": 2.07, + "grad_norm": 10.969879150390625, + "learning_rate": 6.2126092959042805e-06, + "loss": 0.5808, + "step": 16479 + }, + { + "epoch": 2.07, + "grad_norm": 22.910619735717773, + "learning_rate": 6.2117725808475924e-06, + "loss": 1.5677, + "step": 16480 + }, + { + "epoch": 2.07, + "grad_norm": 11.913558006286621, + "learning_rate": 6.210935865790906e-06, + "loss": 0.3098, + "step": 16481 + }, + { + "epoch": 2.07, + "grad_norm": 13.480531692504883, + "learning_rate": 6.210099150734218e-06, + "loss": 1.3747, + "step": 16482 + }, + { + "epoch": 2.07, + "grad_norm": 14.577484130859375, + "learning_rate": 6.209262435677531e-06, + "loss": 0.4669, + "step": 16483 + }, + { + "epoch": 2.07, + "grad_norm": 6.817378044128418, + "learning_rate": 6.208425720620843e-06, + "loss": 0.2599, + "step": 16484 + }, + { + "epoch": 2.07, + "grad_norm": 5.815977096557617, + "learning_rate": 6.207589005564156e-06, + "loss": 0.6524, + "step": 16485 + }, + { + "epoch": 2.07, + "grad_norm": 44.233131408691406, + "learning_rate": 6.206752290507468e-06, + "loss": 1.8451, + "step": 16486 + }, + { + "epoch": 2.07, + "grad_norm": 15.90046215057373, + "learning_rate": 6.20591557545078e-06, + "loss": 2.2867, + "step": 16487 + }, + { + "epoch": 2.07, + "grad_norm": 54.03504180908203, + "learning_rate": 6.205078860394094e-06, + "loss": 1.2996, + "step": 16488 + }, + { + "epoch": 2.07, + "grad_norm": 17.677053451538086, + "learning_rate": 6.204242145337406e-06, + "loss": 1.2908, + "step": 16489 + }, + { + "epoch": 2.07, + "grad_norm": 7.055785655975342, + "learning_rate": 6.203405430280719e-06, + "loss": 0.3546, + "step": 16490 + }, + { + "epoch": 2.07, + "grad_norm": 27.47056007385254, + "learning_rate": 6.202568715224031e-06, + "loss": 1.6954, + "step": 16491 + }, + { + "epoch": 2.07, + "grad_norm": 11.763664245605469, + "learning_rate": 6.2017320001673435e-06, + "loss": 0.8556, + "step": 16492 + }, + { + "epoch": 2.07, + "grad_norm": 14.156816482543945, + "learning_rate": 6.200895285110656e-06, + "loss": 1.5798, + "step": 16493 + }, + { + "epoch": 2.07, + "grad_norm": 5.979343891143799, + "learning_rate": 6.200058570053968e-06, + "loss": 0.1109, + "step": 16494 + }, + { + "epoch": 2.07, + "grad_norm": 11.397286415100098, + "learning_rate": 6.199221854997282e-06, + "loss": 1.6852, + "step": 16495 + }, + { + "epoch": 2.07, + "grad_norm": 8.399728775024414, + "learning_rate": 6.198385139940594e-06, + "loss": 0.4494, + "step": 16496 + }, + { + "epoch": 2.07, + "grad_norm": 11.568540573120117, + "learning_rate": 6.197548424883907e-06, + "loss": 0.4989, + "step": 16497 + }, + { + "epoch": 2.07, + "grad_norm": 6.6547112464904785, + "learning_rate": 6.196711709827219e-06, + "loss": 0.2911, + "step": 16498 + }, + { + "epoch": 2.07, + "grad_norm": 8.213935852050781, + "learning_rate": 6.195874994770531e-06, + "loss": 0.4342, + "step": 16499 + }, + { + "epoch": 2.07, + "grad_norm": 63.703330993652344, + "learning_rate": 6.195038279713844e-06, + "loss": 1.4025, + "step": 16500 + }, + { + "epoch": 2.07, + "grad_norm": 41.80888366699219, + "learning_rate": 6.194201564657156e-06, + "loss": 0.7504, + "step": 16501 + }, + { + "epoch": 2.07, + "grad_norm": 14.639147758483887, + "learning_rate": 6.19336484960047e-06, + "loss": 0.9503, + "step": 16502 + }, + { + "epoch": 2.07, + "grad_norm": 11.767823219299316, + "learning_rate": 6.192528134543782e-06, + "loss": 0.492, + "step": 16503 + }, + { + "epoch": 2.07, + "grad_norm": 13.004864692687988, + "learning_rate": 6.191691419487094e-06, + "loss": 1.4874, + "step": 16504 + }, + { + "epoch": 2.07, + "grad_norm": 7.3194074630737305, + "learning_rate": 6.1908547044304066e-06, + "loss": 0.3932, + "step": 16505 + }, + { + "epoch": 2.07, + "grad_norm": 1.1155903339385986, + "learning_rate": 6.1900179893737185e-06, + "loss": 0.0421, + "step": 16506 + }, + { + "epoch": 2.07, + "grad_norm": 10.562871932983398, + "learning_rate": 6.189181274317032e-06, + "loss": 0.2833, + "step": 16507 + }, + { + "epoch": 2.07, + "grad_norm": 51.60620880126953, + "learning_rate": 6.188344559260344e-06, + "loss": 0.9758, + "step": 16508 + }, + { + "epoch": 2.07, + "grad_norm": 22.08502960205078, + "learning_rate": 6.187507844203658e-06, + "loss": 1.843, + "step": 16509 + }, + { + "epoch": 2.07, + "grad_norm": 21.697383880615234, + "learning_rate": 6.18667112914697e-06, + "loss": 2.187, + "step": 16510 + }, + { + "epoch": 2.07, + "grad_norm": 23.2861328125, + "learning_rate": 6.185834414090282e-06, + "loss": 0.6558, + "step": 16511 + }, + { + "epoch": 2.07, + "grad_norm": 10.984660148620605, + "learning_rate": 6.1849976990335945e-06, + "loss": 0.549, + "step": 16512 + }, + { + "epoch": 2.07, + "grad_norm": 17.949138641357422, + "learning_rate": 6.1841609839769064e-06, + "loss": 1.1071, + "step": 16513 + }, + { + "epoch": 2.07, + "grad_norm": 18.823471069335938, + "learning_rate": 6.18332426892022e-06, + "loss": 0.9926, + "step": 16514 + }, + { + "epoch": 2.07, + "grad_norm": 18.417369842529297, + "learning_rate": 6.182487553863532e-06, + "loss": 1.7676, + "step": 16515 + }, + { + "epoch": 2.07, + "grad_norm": 55.80198669433594, + "learning_rate": 6.181650838806845e-06, + "loss": 1.1355, + "step": 16516 + }, + { + "epoch": 2.07, + "grad_norm": 14.4066743850708, + "learning_rate": 6.180814123750158e-06, + "loss": 2.8706, + "step": 16517 + }, + { + "epoch": 2.07, + "grad_norm": 30.233991622924805, + "learning_rate": 6.17997740869347e-06, + "loss": 1.1455, + "step": 16518 + }, + { + "epoch": 2.07, + "grad_norm": 19.445171356201172, + "learning_rate": 6.179140693636782e-06, + "loss": 1.1413, + "step": 16519 + }, + { + "epoch": 2.07, + "grad_norm": 24.63876724243164, + "learning_rate": 6.178303978580094e-06, + "loss": 0.8298, + "step": 16520 + }, + { + "epoch": 2.07, + "grad_norm": 18.23326301574707, + "learning_rate": 6.177467263523408e-06, + "loss": 2.423, + "step": 16521 + }, + { + "epoch": 2.07, + "grad_norm": 100.80715942382812, + "learning_rate": 6.17663054846672e-06, + "loss": 1.3618, + "step": 16522 + }, + { + "epoch": 2.07, + "grad_norm": 15.919983863830566, + "learning_rate": 6.175793833410033e-06, + "loss": 0.9049, + "step": 16523 + }, + { + "epoch": 2.07, + "grad_norm": 20.7061710357666, + "learning_rate": 6.1749571183533456e-06, + "loss": 0.5704, + "step": 16524 + }, + { + "epoch": 2.07, + "grad_norm": 29.360403060913086, + "learning_rate": 6.1741204032966575e-06, + "loss": 1.7679, + "step": 16525 + }, + { + "epoch": 2.07, + "grad_norm": 22.493640899658203, + "learning_rate": 6.17328368823997e-06, + "loss": 1.6815, + "step": 16526 + }, + { + "epoch": 2.07, + "grad_norm": 26.56145477294922, + "learning_rate": 6.172446973183282e-06, + "loss": 0.378, + "step": 16527 + }, + { + "epoch": 2.07, + "grad_norm": 18.64031410217285, + "learning_rate": 6.171610258126596e-06, + "loss": 0.4292, + "step": 16528 + }, + { + "epoch": 2.07, + "grad_norm": 6.236285209655762, + "learning_rate": 6.170773543069908e-06, + "loss": 0.7544, + "step": 16529 + }, + { + "epoch": 2.07, + "grad_norm": 54.25901412963867, + "learning_rate": 6.169936828013221e-06, + "loss": 0.6793, + "step": 16530 + }, + { + "epoch": 2.07, + "grad_norm": 6.192501068115234, + "learning_rate": 6.1691001129565335e-06, + "loss": 1.6053, + "step": 16531 + }, + { + "epoch": 2.07, + "grad_norm": 4.473605632781982, + "learning_rate": 6.1682633978998454e-06, + "loss": 1.059, + "step": 16532 + }, + { + "epoch": 2.07, + "grad_norm": 38.38534164428711, + "learning_rate": 6.167426682843158e-06, + "loss": 2.0188, + "step": 16533 + }, + { + "epoch": 2.07, + "grad_norm": 5.745693206787109, + "learning_rate": 6.16658996778647e-06, + "loss": 2.0387, + "step": 16534 + }, + { + "epoch": 2.08, + "grad_norm": 9.371700286865234, + "learning_rate": 6.165753252729784e-06, + "loss": 2.7258, + "step": 16535 + }, + { + "epoch": 2.08, + "grad_norm": 14.374646186828613, + "learning_rate": 6.164916537673096e-06, + "loss": 1.3601, + "step": 16536 + }, + { + "epoch": 2.08, + "grad_norm": 45.54530334472656, + "learning_rate": 6.164079822616409e-06, + "loss": 3.1562, + "step": 16537 + }, + { + "epoch": 2.08, + "grad_norm": 12.899748802185059, + "learning_rate": 6.163243107559721e-06, + "loss": 0.9303, + "step": 16538 + }, + { + "epoch": 2.08, + "grad_norm": 8.409440040588379, + "learning_rate": 6.162406392503033e-06, + "loss": 1.0218, + "step": 16539 + }, + { + "epoch": 2.08, + "grad_norm": 16.851974487304688, + "learning_rate": 6.161569677446346e-06, + "loss": 0.3433, + "step": 16540 + }, + { + "epoch": 2.08, + "grad_norm": 15.8661470413208, + "learning_rate": 6.160732962389658e-06, + "loss": 1.2453, + "step": 16541 + }, + { + "epoch": 2.08, + "grad_norm": 33.72930908203125, + "learning_rate": 6.159896247332972e-06, + "loss": 1.4199, + "step": 16542 + }, + { + "epoch": 2.08, + "grad_norm": 99.52818298339844, + "learning_rate": 6.159059532276284e-06, + "loss": 2.0069, + "step": 16543 + }, + { + "epoch": 2.08, + "grad_norm": 6.886445045471191, + "learning_rate": 6.1582228172195965e-06, + "loss": 0.8724, + "step": 16544 + }, + { + "epoch": 2.08, + "grad_norm": 12.840018272399902, + "learning_rate": 6.1573861021629085e-06, + "loss": 1.1763, + "step": 16545 + }, + { + "epoch": 2.08, + "grad_norm": 15.798508644104004, + "learning_rate": 6.156549387106221e-06, + "loss": 0.8332, + "step": 16546 + }, + { + "epoch": 2.08, + "grad_norm": 13.992999076843262, + "learning_rate": 6.155712672049534e-06, + "loss": 0.3119, + "step": 16547 + }, + { + "epoch": 2.08, + "grad_norm": 16.383399963378906, + "learning_rate": 6.154875956992846e-06, + "loss": 0.6404, + "step": 16548 + }, + { + "epoch": 2.08, + "grad_norm": 12.821876525878906, + "learning_rate": 6.15403924193616e-06, + "loss": 1.9272, + "step": 16549 + }, + { + "epoch": 2.08, + "grad_norm": 11.50905704498291, + "learning_rate": 6.153202526879472e-06, + "loss": 0.5468, + "step": 16550 + }, + { + "epoch": 2.08, + "grad_norm": 72.6382827758789, + "learning_rate": 6.1523658118227844e-06, + "loss": 1.3661, + "step": 16551 + }, + { + "epoch": 2.08, + "grad_norm": 12.334877967834473, + "learning_rate": 6.151529096766096e-06, + "loss": 1.9228, + "step": 16552 + }, + { + "epoch": 2.08, + "grad_norm": 14.781194686889648, + "learning_rate": 6.150692381709409e-06, + "loss": 0.684, + "step": 16553 + }, + { + "epoch": 2.08, + "grad_norm": 48.85089111328125, + "learning_rate": 6.149855666652722e-06, + "loss": 2.3371, + "step": 16554 + }, + { + "epoch": 2.08, + "grad_norm": 18.68092155456543, + "learning_rate": 6.149018951596034e-06, + "loss": 0.6029, + "step": 16555 + }, + { + "epoch": 2.08, + "grad_norm": 10.330491065979004, + "learning_rate": 6.148182236539348e-06, + "loss": 0.611, + "step": 16556 + }, + { + "epoch": 2.08, + "grad_norm": 14.230433464050293, + "learning_rate": 6.1473455214826595e-06, + "loss": 0.798, + "step": 16557 + }, + { + "epoch": 2.08, + "grad_norm": 8.532289505004883, + "learning_rate": 6.146508806425972e-06, + "loss": 1.5137, + "step": 16558 + }, + { + "epoch": 2.08, + "grad_norm": 10.542137145996094, + "learning_rate": 6.145672091369284e-06, + "loss": 0.5654, + "step": 16559 + }, + { + "epoch": 2.08, + "grad_norm": 16.967517852783203, + "learning_rate": 6.144835376312597e-06, + "loss": 0.8737, + "step": 16560 + }, + { + "epoch": 2.08, + "grad_norm": 23.06812286376953, + "learning_rate": 6.14399866125591e-06, + "loss": 1.518, + "step": 16561 + }, + { + "epoch": 2.08, + "grad_norm": 47.58055114746094, + "learning_rate": 6.143161946199222e-06, + "loss": 1.056, + "step": 16562 + }, + { + "epoch": 2.08, + "grad_norm": 4.630557060241699, + "learning_rate": 6.1423252311425355e-06, + "loss": 0.1811, + "step": 16563 + }, + { + "epoch": 2.08, + "grad_norm": 17.748130798339844, + "learning_rate": 6.1414885160858475e-06, + "loss": 0.6627, + "step": 16564 + }, + { + "epoch": 2.08, + "grad_norm": 14.95833969116211, + "learning_rate": 6.14065180102916e-06, + "loss": 0.5508, + "step": 16565 + }, + { + "epoch": 2.08, + "grad_norm": 10.208039283752441, + "learning_rate": 6.139815085972472e-06, + "loss": 1.4903, + "step": 16566 + }, + { + "epoch": 2.08, + "grad_norm": 13.743385314941406, + "learning_rate": 6.138978370915785e-06, + "loss": 0.672, + "step": 16567 + }, + { + "epoch": 2.08, + "grad_norm": 9.239084243774414, + "learning_rate": 6.138141655859098e-06, + "loss": 0.3315, + "step": 16568 + }, + { + "epoch": 2.08, + "grad_norm": 41.48976135253906, + "learning_rate": 6.13730494080241e-06, + "loss": 1.9272, + "step": 16569 + }, + { + "epoch": 2.08, + "grad_norm": 95.03357696533203, + "learning_rate": 6.1364682257457234e-06, + "loss": 1.5247, + "step": 16570 + }, + { + "epoch": 2.08, + "grad_norm": 63.016475677490234, + "learning_rate": 6.135631510689035e-06, + "loss": 1.0691, + "step": 16571 + }, + { + "epoch": 2.08, + "grad_norm": 6.475004196166992, + "learning_rate": 6.134794795632348e-06, + "loss": 0.4216, + "step": 16572 + }, + { + "epoch": 2.08, + "grad_norm": 15.221724510192871, + "learning_rate": 6.13395808057566e-06, + "loss": 0.7941, + "step": 16573 + }, + { + "epoch": 2.08, + "grad_norm": 17.43982696533203, + "learning_rate": 6.133121365518973e-06, + "loss": 2.5072, + "step": 16574 + }, + { + "epoch": 2.08, + "grad_norm": 8.709810256958008, + "learning_rate": 6.132284650462286e-06, + "loss": 0.5433, + "step": 16575 + }, + { + "epoch": 2.08, + "grad_norm": 22.730981826782227, + "learning_rate": 6.131447935405598e-06, + "loss": 1.9044, + "step": 16576 + }, + { + "epoch": 2.08, + "grad_norm": 28.671415328979492, + "learning_rate": 6.130611220348911e-06, + "loss": 1.8637, + "step": 16577 + }, + { + "epoch": 2.08, + "grad_norm": 6.897902011871338, + "learning_rate": 6.129774505292223e-06, + "loss": 0.3145, + "step": 16578 + }, + { + "epoch": 2.08, + "grad_norm": 57.18122863769531, + "learning_rate": 6.128937790235536e-06, + "loss": 1.7443, + "step": 16579 + }, + { + "epoch": 2.08, + "grad_norm": 38.12550735473633, + "learning_rate": 6.128101075178848e-06, + "loss": 2.8525, + "step": 16580 + }, + { + "epoch": 2.08, + "grad_norm": 9.102025985717773, + "learning_rate": 6.12726436012216e-06, + "loss": 0.8997, + "step": 16581 + }, + { + "epoch": 2.08, + "grad_norm": 14.379570960998535, + "learning_rate": 6.126427645065474e-06, + "loss": 1.4295, + "step": 16582 + }, + { + "epoch": 2.08, + "grad_norm": 21.59740447998047, + "learning_rate": 6.125590930008786e-06, + "loss": 2.4355, + "step": 16583 + }, + { + "epoch": 2.08, + "grad_norm": 9.414067268371582, + "learning_rate": 6.124754214952099e-06, + "loss": 0.7416, + "step": 16584 + }, + { + "epoch": 2.08, + "grad_norm": 9.886018753051758, + "learning_rate": 6.123917499895411e-06, + "loss": 1.4918, + "step": 16585 + }, + { + "epoch": 2.08, + "grad_norm": 14.882733345031738, + "learning_rate": 6.123080784838724e-06, + "loss": 1.1041, + "step": 16586 + }, + { + "epoch": 2.08, + "grad_norm": 17.257169723510742, + "learning_rate": 6.122244069782036e-06, + "loss": 0.7118, + "step": 16587 + }, + { + "epoch": 2.08, + "grad_norm": 16.37677574157715, + "learning_rate": 6.121407354725348e-06, + "loss": 1.1891, + "step": 16588 + }, + { + "epoch": 2.08, + "grad_norm": 9.790069580078125, + "learning_rate": 6.120570639668662e-06, + "loss": 0.5618, + "step": 16589 + }, + { + "epoch": 2.08, + "grad_norm": 27.5288143157959, + "learning_rate": 6.1197339246119735e-06, + "loss": 1.4248, + "step": 16590 + }, + { + "epoch": 2.08, + "grad_norm": 13.253369331359863, + "learning_rate": 6.118897209555287e-06, + "loss": 0.355, + "step": 16591 + }, + { + "epoch": 2.08, + "grad_norm": 1.6081290245056152, + "learning_rate": 6.118060494498599e-06, + "loss": 0.0221, + "step": 16592 + }, + { + "epoch": 2.08, + "grad_norm": 11.606402397155762, + "learning_rate": 6.117223779441912e-06, + "loss": 0.6794, + "step": 16593 + }, + { + "epoch": 2.08, + "grad_norm": 25.089847564697266, + "learning_rate": 6.116387064385224e-06, + "loss": 0.7521, + "step": 16594 + }, + { + "epoch": 2.08, + "grad_norm": 9.566123008728027, + "learning_rate": 6.115550349328536e-06, + "loss": 1.3233, + "step": 16595 + }, + { + "epoch": 2.08, + "grad_norm": 8.801780700683594, + "learning_rate": 6.1147136342718495e-06, + "loss": 0.8277, + "step": 16596 + }, + { + "epoch": 2.08, + "grad_norm": 36.346614837646484, + "learning_rate": 6.1138769192151615e-06, + "loss": 2.524, + "step": 16597 + }, + { + "epoch": 2.08, + "grad_norm": 14.323728561401367, + "learning_rate": 6.113040204158474e-06, + "loss": 1.3023, + "step": 16598 + }, + { + "epoch": 2.08, + "grad_norm": 13.32658576965332, + "learning_rate": 6.112203489101787e-06, + "loss": 0.8885, + "step": 16599 + }, + { + "epoch": 2.08, + "grad_norm": 8.776103019714355, + "learning_rate": 6.1113667740451e-06, + "loss": 1.3184, + "step": 16600 + }, + { + "epoch": 2.08, + "grad_norm": 5.627133846282959, + "learning_rate": 6.110530058988412e-06, + "loss": 0.9762, + "step": 16601 + }, + { + "epoch": 2.08, + "grad_norm": 25.990951538085938, + "learning_rate": 6.109693343931724e-06, + "loss": 1.3927, + "step": 16602 + }, + { + "epoch": 2.08, + "grad_norm": 13.433871269226074, + "learning_rate": 6.108856628875037e-06, + "loss": 0.6496, + "step": 16603 + }, + { + "epoch": 2.08, + "grad_norm": 11.072653770446777, + "learning_rate": 6.108019913818349e-06, + "loss": 0.7007, + "step": 16604 + }, + { + "epoch": 2.08, + "grad_norm": 15.921674728393555, + "learning_rate": 6.107183198761662e-06, + "loss": 0.774, + "step": 16605 + }, + { + "epoch": 2.08, + "grad_norm": 3.8428282737731934, + "learning_rate": 6.106346483704975e-06, + "loss": 0.6958, + "step": 16606 + }, + { + "epoch": 2.08, + "grad_norm": 10.573344230651855, + "learning_rate": 6.105509768648288e-06, + "loss": 0.8632, + "step": 16607 + }, + { + "epoch": 2.08, + "grad_norm": 13.61486530303955, + "learning_rate": 6.1046730535916e-06, + "loss": 0.2684, + "step": 16608 + }, + { + "epoch": 2.08, + "grad_norm": 2.86008620262146, + "learning_rate": 6.103836338534912e-06, + "loss": 0.0893, + "step": 16609 + }, + { + "epoch": 2.08, + "grad_norm": 12.860692024230957, + "learning_rate": 6.102999623478225e-06, + "loss": 2.0543, + "step": 16610 + }, + { + "epoch": 2.08, + "grad_norm": 13.402647018432617, + "learning_rate": 6.102162908421537e-06, + "loss": 1.267, + "step": 16611 + }, + { + "epoch": 2.08, + "grad_norm": 11.44622802734375, + "learning_rate": 6.10132619336485e-06, + "loss": 1.4332, + "step": 16612 + }, + { + "epoch": 2.08, + "grad_norm": 6.49962854385376, + "learning_rate": 6.100489478308163e-06, + "loss": 0.2258, + "step": 16613 + }, + { + "epoch": 2.09, + "grad_norm": 6.8343987464904785, + "learning_rate": 6.099652763251476e-06, + "loss": 0.4795, + "step": 16614 + }, + { + "epoch": 2.09, + "grad_norm": 19.482908248901367, + "learning_rate": 6.098816048194788e-06, + "loss": 0.9212, + "step": 16615 + }, + { + "epoch": 2.09, + "grad_norm": 10.905943870544434, + "learning_rate": 6.0979793331381e-06, + "loss": 1.0125, + "step": 16616 + }, + { + "epoch": 2.09, + "grad_norm": 13.229659080505371, + "learning_rate": 6.097142618081413e-06, + "loss": 0.8366, + "step": 16617 + }, + { + "epoch": 2.09, + "grad_norm": 28.90032386779785, + "learning_rate": 6.096305903024725e-06, + "loss": 2.8562, + "step": 16618 + }, + { + "epoch": 2.09, + "grad_norm": 11.121617317199707, + "learning_rate": 6.095469187968038e-06, + "loss": 0.6452, + "step": 16619 + }, + { + "epoch": 2.09, + "grad_norm": 14.468332290649414, + "learning_rate": 6.094632472911351e-06, + "loss": 0.642, + "step": 16620 + }, + { + "epoch": 2.09, + "grad_norm": 17.694711685180664, + "learning_rate": 6.093795757854664e-06, + "loss": 1.0717, + "step": 16621 + }, + { + "epoch": 2.09, + "grad_norm": 17.3740234375, + "learning_rate": 6.0929590427979756e-06, + "loss": 0.4831, + "step": 16622 + }, + { + "epoch": 2.09, + "grad_norm": 9.99255084991455, + "learning_rate": 6.0921223277412875e-06, + "loss": 0.2145, + "step": 16623 + }, + { + "epoch": 2.09, + "grad_norm": 13.872404098510742, + "learning_rate": 6.091285612684601e-06, + "loss": 1.1057, + "step": 16624 + }, + { + "epoch": 2.09, + "grad_norm": 16.23759651184082, + "learning_rate": 6.090448897627913e-06, + "loss": 1.049, + "step": 16625 + }, + { + "epoch": 2.09, + "grad_norm": 8.336773872375488, + "learning_rate": 6.089612182571226e-06, + "loss": 1.2625, + "step": 16626 + }, + { + "epoch": 2.09, + "grad_norm": 22.109317779541016, + "learning_rate": 6.088775467514538e-06, + "loss": 1.6373, + "step": 16627 + }, + { + "epoch": 2.09, + "grad_norm": 3.8382434844970703, + "learning_rate": 6.0879387524578515e-06, + "loss": 0.2619, + "step": 16628 + }, + { + "epoch": 2.09, + "grad_norm": 13.31667709350586, + "learning_rate": 6.0871020374011635e-06, + "loss": 0.43, + "step": 16629 + }, + { + "epoch": 2.09, + "grad_norm": 16.8389892578125, + "learning_rate": 6.0862653223444754e-06, + "loss": 2.5586, + "step": 16630 + }, + { + "epoch": 2.09, + "grad_norm": 32.07686996459961, + "learning_rate": 6.085428607287789e-06, + "loss": 1.9517, + "step": 16631 + }, + { + "epoch": 2.09, + "grad_norm": 98.96281433105469, + "learning_rate": 6.084591892231101e-06, + "loss": 2.1242, + "step": 16632 + }, + { + "epoch": 2.09, + "grad_norm": 28.124950408935547, + "learning_rate": 6.083755177174414e-06, + "loss": 1.0739, + "step": 16633 + }, + { + "epoch": 2.09, + "grad_norm": 11.089404106140137, + "learning_rate": 6.082918462117726e-06, + "loss": 0.789, + "step": 16634 + }, + { + "epoch": 2.09, + "grad_norm": 63.404911041259766, + "learning_rate": 6.0820817470610395e-06, + "loss": 2.2756, + "step": 16635 + }, + { + "epoch": 2.09, + "grad_norm": 26.401927947998047, + "learning_rate": 6.081245032004351e-06, + "loss": 2.0859, + "step": 16636 + }, + { + "epoch": 2.09, + "grad_norm": 10.558520317077637, + "learning_rate": 6.080408316947663e-06, + "loss": 0.891, + "step": 16637 + }, + { + "epoch": 2.09, + "grad_norm": 33.86064910888672, + "learning_rate": 6.079571601890977e-06, + "loss": 1.4469, + "step": 16638 + }, + { + "epoch": 2.09, + "grad_norm": 17.923425674438477, + "learning_rate": 6.078734886834289e-06, + "loss": 0.7863, + "step": 16639 + }, + { + "epoch": 2.09, + "grad_norm": 16.607006072998047, + "learning_rate": 6.077898171777602e-06, + "loss": 0.5759, + "step": 16640 + }, + { + "epoch": 2.09, + "grad_norm": 18.633817672729492, + "learning_rate": 6.077061456720914e-06, + "loss": 1.5332, + "step": 16641 + }, + { + "epoch": 2.09, + "grad_norm": 16.57147789001465, + "learning_rate": 6.076224741664227e-06, + "loss": 1.6609, + "step": 16642 + }, + { + "epoch": 2.09, + "grad_norm": 14.349488258361816, + "learning_rate": 6.075388026607539e-06, + "loss": 2.4245, + "step": 16643 + }, + { + "epoch": 2.09, + "grad_norm": 10.078313827514648, + "learning_rate": 6.074551311550851e-06, + "loss": 0.6327, + "step": 16644 + }, + { + "epoch": 2.09, + "grad_norm": 36.451690673828125, + "learning_rate": 6.073714596494165e-06, + "loss": 3.0654, + "step": 16645 + }, + { + "epoch": 2.09, + "grad_norm": 80.59728240966797, + "learning_rate": 6.072877881437477e-06, + "loss": 1.1205, + "step": 16646 + }, + { + "epoch": 2.09, + "grad_norm": 36.045108795166016, + "learning_rate": 6.07204116638079e-06, + "loss": 1.4382, + "step": 16647 + }, + { + "epoch": 2.09, + "grad_norm": 9.137932777404785, + "learning_rate": 6.071204451324102e-06, + "loss": 0.7327, + "step": 16648 + }, + { + "epoch": 2.09, + "grad_norm": 17.748319625854492, + "learning_rate": 6.070367736267415e-06, + "loss": 0.7783, + "step": 16649 + }, + { + "epoch": 2.09, + "grad_norm": 21.247251510620117, + "learning_rate": 6.069531021210727e-06, + "loss": 1.3124, + "step": 16650 + }, + { + "epoch": 2.09, + "grad_norm": 26.821157455444336, + "learning_rate": 6.068694306154039e-06, + "loss": 1.7285, + "step": 16651 + }, + { + "epoch": 2.09, + "grad_norm": 6.4322285652160645, + "learning_rate": 6.067857591097353e-06, + "loss": 1.6197, + "step": 16652 + }, + { + "epoch": 2.09, + "grad_norm": 6.7288055419921875, + "learning_rate": 6.067020876040665e-06, + "loss": 0.6719, + "step": 16653 + }, + { + "epoch": 2.09, + "grad_norm": 97.98516845703125, + "learning_rate": 6.066184160983978e-06, + "loss": 2.367, + "step": 16654 + }, + { + "epoch": 2.09, + "grad_norm": 10.283223152160645, + "learning_rate": 6.0653474459272896e-06, + "loss": 1.7968, + "step": 16655 + }, + { + "epoch": 2.09, + "grad_norm": 6.09956169128418, + "learning_rate": 6.064510730870603e-06, + "loss": 0.6225, + "step": 16656 + }, + { + "epoch": 2.09, + "grad_norm": 18.431461334228516, + "learning_rate": 6.063674015813915e-06, + "loss": 0.7375, + "step": 16657 + }, + { + "epoch": 2.09, + "grad_norm": 8.284697532653809, + "learning_rate": 6.062837300757227e-06, + "loss": 1.4054, + "step": 16658 + }, + { + "epoch": 2.09, + "grad_norm": 9.18532943725586, + "learning_rate": 6.062000585700541e-06, + "loss": 0.6537, + "step": 16659 + }, + { + "epoch": 2.09, + "grad_norm": 8.965328216552734, + "learning_rate": 6.061163870643853e-06, + "loss": 0.6235, + "step": 16660 + }, + { + "epoch": 2.09, + "grad_norm": 100.433837890625, + "learning_rate": 6.0603271555871655e-06, + "loss": 2.0296, + "step": 16661 + }, + { + "epoch": 2.09, + "grad_norm": 17.930927276611328, + "learning_rate": 6.0594904405304775e-06, + "loss": 1.2623, + "step": 16662 + }, + { + "epoch": 2.09, + "grad_norm": 4.940665245056152, + "learning_rate": 6.058653725473791e-06, + "loss": 0.1586, + "step": 16663 + }, + { + "epoch": 2.09, + "grad_norm": 26.91118621826172, + "learning_rate": 6.057817010417103e-06, + "loss": 1.7924, + "step": 16664 + }, + { + "epoch": 2.09, + "grad_norm": 23.53251838684082, + "learning_rate": 6.056980295360415e-06, + "loss": 1.4451, + "step": 16665 + }, + { + "epoch": 2.09, + "grad_norm": 10.728267669677734, + "learning_rate": 6.056143580303729e-06, + "loss": 0.7361, + "step": 16666 + }, + { + "epoch": 2.09, + "grad_norm": 2.9758715629577637, + "learning_rate": 6.055306865247041e-06, + "loss": 0.1412, + "step": 16667 + }, + { + "epoch": 2.09, + "grad_norm": 18.519758224487305, + "learning_rate": 6.0544701501903534e-06, + "loss": 0.8254, + "step": 16668 + }, + { + "epoch": 2.09, + "grad_norm": 6.1864519119262695, + "learning_rate": 6.053633435133665e-06, + "loss": 0.5522, + "step": 16669 + }, + { + "epoch": 2.09, + "grad_norm": 22.56398582458496, + "learning_rate": 6.052796720076979e-06, + "loss": 2.1626, + "step": 16670 + }, + { + "epoch": 2.09, + "grad_norm": 5.48407506942749, + "learning_rate": 6.051960005020291e-06, + "loss": 0.2158, + "step": 16671 + }, + { + "epoch": 2.09, + "grad_norm": 29.247690200805664, + "learning_rate": 6.051123289963603e-06, + "loss": 1.0602, + "step": 16672 + }, + { + "epoch": 2.09, + "grad_norm": 5.660470962524414, + "learning_rate": 6.050286574906916e-06, + "loss": 0.1487, + "step": 16673 + }, + { + "epoch": 2.09, + "grad_norm": 17.81192970275879, + "learning_rate": 6.0494498598502286e-06, + "loss": 1.1854, + "step": 16674 + }, + { + "epoch": 2.09, + "grad_norm": 11.670249938964844, + "learning_rate": 6.048613144793541e-06, + "loss": 1.5447, + "step": 16675 + }, + { + "epoch": 2.09, + "grad_norm": 7.811960697174072, + "learning_rate": 6.047776429736853e-06, + "loss": 0.7241, + "step": 16676 + }, + { + "epoch": 2.09, + "grad_norm": 13.082968711853027, + "learning_rate": 6.046939714680167e-06, + "loss": 0.6176, + "step": 16677 + }, + { + "epoch": 2.09, + "grad_norm": 13.379081726074219, + "learning_rate": 6.046102999623479e-06, + "loss": 0.5995, + "step": 16678 + }, + { + "epoch": 2.09, + "grad_norm": 10.199109077453613, + "learning_rate": 6.045266284566791e-06, + "loss": 0.6131, + "step": 16679 + }, + { + "epoch": 2.09, + "grad_norm": 11.826645851135254, + "learning_rate": 6.044429569510104e-06, + "loss": 0.6941, + "step": 16680 + }, + { + "epoch": 2.09, + "grad_norm": 20.454917907714844, + "learning_rate": 6.0435928544534165e-06, + "loss": 0.8341, + "step": 16681 + }, + { + "epoch": 2.09, + "grad_norm": 13.764884948730469, + "learning_rate": 6.042756139396729e-06, + "loss": 1.596, + "step": 16682 + }, + { + "epoch": 2.09, + "grad_norm": 37.32571792602539, + "learning_rate": 6.041919424340041e-06, + "loss": 2.5734, + "step": 16683 + }, + { + "epoch": 2.09, + "grad_norm": 5.9186553955078125, + "learning_rate": 6.041082709283353e-06, + "loss": 0.3749, + "step": 16684 + }, + { + "epoch": 2.09, + "grad_norm": 7.147922039031982, + "learning_rate": 6.040245994226667e-06, + "loss": 1.8393, + "step": 16685 + }, + { + "epoch": 2.09, + "grad_norm": 13.041387557983398, + "learning_rate": 6.039409279169979e-06, + "loss": 0.3931, + "step": 16686 + }, + { + "epoch": 2.09, + "grad_norm": 12.86512279510498, + "learning_rate": 6.038572564113292e-06, + "loss": 0.3173, + "step": 16687 + }, + { + "epoch": 2.09, + "grad_norm": 5.978394031524658, + "learning_rate": 6.037735849056604e-06, + "loss": 0.4025, + "step": 16688 + }, + { + "epoch": 2.09, + "grad_norm": 58.6747932434082, + "learning_rate": 6.036899133999917e-06, + "loss": 1.4673, + "step": 16689 + }, + { + "epoch": 2.09, + "grad_norm": 35.88507080078125, + "learning_rate": 6.036062418943229e-06, + "loss": 0.612, + "step": 16690 + }, + { + "epoch": 2.09, + "grad_norm": 10.869979858398438, + "learning_rate": 6.035225703886541e-06, + "loss": 1.1676, + "step": 16691 + }, + { + "epoch": 2.09, + "grad_norm": 63.22040557861328, + "learning_rate": 6.034388988829855e-06, + "loss": 1.4376, + "step": 16692 + }, + { + "epoch": 2.09, + "grad_norm": 10.475271224975586, + "learning_rate": 6.033552273773167e-06, + "loss": 1.3598, + "step": 16693 + }, + { + "epoch": 2.1, + "grad_norm": 5.576467514038086, + "learning_rate": 6.0327155587164795e-06, + "loss": 0.9442, + "step": 16694 + }, + { + "epoch": 2.1, + "grad_norm": 23.366558074951172, + "learning_rate": 6.031878843659792e-06, + "loss": 2.5683, + "step": 16695 + }, + { + "epoch": 2.1, + "grad_norm": 18.892393112182617, + "learning_rate": 6.031042128603105e-06, + "loss": 0.7045, + "step": 16696 + }, + { + "epoch": 2.1, + "grad_norm": 23.35431671142578, + "learning_rate": 6.030205413546417e-06, + "loss": 1.4328, + "step": 16697 + }, + { + "epoch": 2.1, + "grad_norm": 36.365665435791016, + "learning_rate": 6.029368698489729e-06, + "loss": 1.9751, + "step": 16698 + }, + { + "epoch": 2.1, + "grad_norm": 7.835363388061523, + "learning_rate": 6.028531983433043e-06, + "loss": 0.2088, + "step": 16699 + }, + { + "epoch": 2.1, + "grad_norm": 9.498435974121094, + "learning_rate": 6.027695268376355e-06, + "loss": 1.686, + "step": 16700 + }, + { + "epoch": 2.1, + "grad_norm": 22.374313354492188, + "learning_rate": 6.0268585533196674e-06, + "loss": 1.1463, + "step": 16701 + }, + { + "epoch": 2.1, + "grad_norm": 11.154014587402344, + "learning_rate": 6.02602183826298e-06, + "loss": 0.7849, + "step": 16702 + }, + { + "epoch": 2.1, + "grad_norm": 14.308752059936523, + "learning_rate": 6.025185123206293e-06, + "loss": 1.6134, + "step": 16703 + }, + { + "epoch": 2.1, + "grad_norm": 4.799342632293701, + "learning_rate": 6.024348408149605e-06, + "loss": 0.2157, + "step": 16704 + }, + { + "epoch": 2.1, + "grad_norm": 16.234066009521484, + "learning_rate": 6.023511693092917e-06, + "loss": 0.759, + "step": 16705 + }, + { + "epoch": 2.1, + "grad_norm": 10.211417198181152, + "learning_rate": 6.022674978036231e-06, + "loss": 0.4717, + "step": 16706 + }, + { + "epoch": 2.1, + "grad_norm": 15.918299674987793, + "learning_rate": 6.0218382629795425e-06, + "loss": 0.7842, + "step": 16707 + }, + { + "epoch": 2.1, + "grad_norm": 5.734321594238281, + "learning_rate": 6.021001547922855e-06, + "loss": 1.2201, + "step": 16708 + }, + { + "epoch": 2.1, + "grad_norm": 7.47617244720459, + "learning_rate": 6.020164832866167e-06, + "loss": 0.505, + "step": 16709 + }, + { + "epoch": 2.1, + "grad_norm": 10.15281867980957, + "learning_rate": 6.019328117809481e-06, + "loss": 0.5723, + "step": 16710 + }, + { + "epoch": 2.1, + "grad_norm": 1.6108498573303223, + "learning_rate": 6.018491402752793e-06, + "loss": 0.0858, + "step": 16711 + }, + { + "epoch": 2.1, + "grad_norm": 17.00050926208496, + "learning_rate": 6.017654687696105e-06, + "loss": 0.789, + "step": 16712 + }, + { + "epoch": 2.1, + "grad_norm": 11.504976272583008, + "learning_rate": 6.0168179726394185e-06, + "loss": 0.2506, + "step": 16713 + }, + { + "epoch": 2.1, + "grad_norm": 12.045175552368164, + "learning_rate": 6.0159812575827305e-06, + "loss": 0.8217, + "step": 16714 + }, + { + "epoch": 2.1, + "grad_norm": 8.361842155456543, + "learning_rate": 6.015144542526043e-06, + "loss": 1.6557, + "step": 16715 + }, + { + "epoch": 2.1, + "grad_norm": 27.015836715698242, + "learning_rate": 6.014307827469355e-06, + "loss": 1.4325, + "step": 16716 + }, + { + "epoch": 2.1, + "grad_norm": 11.400221824645996, + "learning_rate": 6.013471112412669e-06, + "loss": 0.731, + "step": 16717 + }, + { + "epoch": 2.1, + "grad_norm": 13.500144958496094, + "learning_rate": 6.012634397355981e-06, + "loss": 1.4767, + "step": 16718 + }, + { + "epoch": 2.1, + "grad_norm": 32.673580169677734, + "learning_rate": 6.011797682299293e-06, + "loss": 2.2346, + "step": 16719 + }, + { + "epoch": 2.1, + "grad_norm": 61.020599365234375, + "learning_rate": 6.0109609672426064e-06, + "loss": 1.4349, + "step": 16720 + }, + { + "epoch": 2.1, + "grad_norm": 20.405357360839844, + "learning_rate": 6.010124252185918e-06, + "loss": 0.7874, + "step": 16721 + }, + { + "epoch": 2.1, + "grad_norm": 24.97215461730957, + "learning_rate": 6.009287537129231e-06, + "loss": 1.1234, + "step": 16722 + }, + { + "epoch": 2.1, + "grad_norm": 18.511730194091797, + "learning_rate": 6.008450822072543e-06, + "loss": 2.2126, + "step": 16723 + }, + { + "epoch": 2.1, + "grad_norm": 20.64064598083496, + "learning_rate": 6.007614107015857e-06, + "loss": 1.0869, + "step": 16724 + }, + { + "epoch": 2.1, + "grad_norm": 7.057304859161377, + "learning_rate": 6.006777391959169e-06, + "loss": 0.5425, + "step": 16725 + }, + { + "epoch": 2.1, + "grad_norm": 25.64223289489746, + "learning_rate": 6.005940676902481e-06, + "loss": 1.1163, + "step": 16726 + }, + { + "epoch": 2.1, + "grad_norm": 21.548139572143555, + "learning_rate": 6.005103961845794e-06, + "loss": 2.5731, + "step": 16727 + }, + { + "epoch": 2.1, + "grad_norm": 35.97419738769531, + "learning_rate": 6.004267246789106e-06, + "loss": 0.6331, + "step": 16728 + }, + { + "epoch": 2.1, + "grad_norm": 18.08547592163086, + "learning_rate": 6.003430531732419e-06, + "loss": 0.988, + "step": 16729 + }, + { + "epoch": 2.1, + "grad_norm": 16.485050201416016, + "learning_rate": 6.002593816675731e-06, + "loss": 2.8046, + "step": 16730 + }, + { + "epoch": 2.1, + "grad_norm": 32.443904876708984, + "learning_rate": 6.001757101619045e-06, + "loss": 1.4081, + "step": 16731 + }, + { + "epoch": 2.1, + "grad_norm": 14.452461242675781, + "learning_rate": 6.000920386562357e-06, + "loss": 0.4725, + "step": 16732 + }, + { + "epoch": 2.1, + "grad_norm": 15.986902236938477, + "learning_rate": 6.000083671505669e-06, + "loss": 2.028, + "step": 16733 + }, + { + "epoch": 2.1, + "grad_norm": 15.833672523498535, + "learning_rate": 5.999246956448982e-06, + "loss": 3.3999, + "step": 16734 + }, + { + "epoch": 2.1, + "grad_norm": 9.893026351928711, + "learning_rate": 5.998410241392294e-06, + "loss": 0.439, + "step": 16735 + }, + { + "epoch": 2.1, + "grad_norm": 19.52991485595703, + "learning_rate": 5.997573526335607e-06, + "loss": 0.9038, + "step": 16736 + }, + { + "epoch": 2.1, + "grad_norm": 16.582826614379883, + "learning_rate": 5.996736811278919e-06, + "loss": 0.4433, + "step": 16737 + }, + { + "epoch": 2.1, + "grad_norm": 12.043107032775879, + "learning_rate": 5.995900096222233e-06, + "loss": 0.7207, + "step": 16738 + }, + { + "epoch": 2.1, + "grad_norm": 17.766197204589844, + "learning_rate": 5.995063381165545e-06, + "loss": 1.7718, + "step": 16739 + }, + { + "epoch": 2.1, + "grad_norm": 35.01610565185547, + "learning_rate": 5.9942266661088565e-06, + "loss": 1.124, + "step": 16740 + }, + { + "epoch": 2.1, + "grad_norm": 157.57369995117188, + "learning_rate": 5.99338995105217e-06, + "loss": 0.9192, + "step": 16741 + }, + { + "epoch": 2.1, + "grad_norm": 8.94036865234375, + "learning_rate": 5.992553235995482e-06, + "loss": 0.2372, + "step": 16742 + }, + { + "epoch": 2.1, + "grad_norm": 10.568174362182617, + "learning_rate": 5.991716520938795e-06, + "loss": 0.3193, + "step": 16743 + }, + { + "epoch": 2.1, + "grad_norm": 22.63112449645996, + "learning_rate": 5.990879805882107e-06, + "loss": 2.0692, + "step": 16744 + }, + { + "epoch": 2.1, + "grad_norm": 11.302915573120117, + "learning_rate": 5.9900430908254205e-06, + "loss": 0.8339, + "step": 16745 + }, + { + "epoch": 2.1, + "grad_norm": 23.451662063598633, + "learning_rate": 5.9892063757687325e-06, + "loss": 0.9134, + "step": 16746 + }, + { + "epoch": 2.1, + "grad_norm": 20.594402313232422, + "learning_rate": 5.9883696607120445e-06, + "loss": 1.7236, + "step": 16747 + }, + { + "epoch": 2.1, + "grad_norm": 18.51211929321289, + "learning_rate": 5.987532945655358e-06, + "loss": 0.8028, + "step": 16748 + }, + { + "epoch": 2.1, + "grad_norm": 93.31168365478516, + "learning_rate": 5.98669623059867e-06, + "loss": 2.9052, + "step": 16749 + }, + { + "epoch": 2.1, + "grad_norm": 15.28825569152832, + "learning_rate": 5.985859515541983e-06, + "loss": 2.9189, + "step": 16750 + }, + { + "epoch": 2.1, + "grad_norm": 21.951282501220703, + "learning_rate": 5.985022800485295e-06, + "loss": 1.1899, + "step": 16751 + }, + { + "epoch": 2.1, + "grad_norm": 21.420368194580078, + "learning_rate": 5.9841860854286085e-06, + "loss": 0.7361, + "step": 16752 + }, + { + "epoch": 2.1, + "grad_norm": 5.584796905517578, + "learning_rate": 5.98334937037192e-06, + "loss": 0.7013, + "step": 16753 + }, + { + "epoch": 2.1, + "grad_norm": 14.52383041381836, + "learning_rate": 5.982512655315232e-06, + "loss": 0.8589, + "step": 16754 + }, + { + "epoch": 2.1, + "grad_norm": 13.623851776123047, + "learning_rate": 5.981675940258545e-06, + "loss": 0.4374, + "step": 16755 + }, + { + "epoch": 2.1, + "grad_norm": 17.263608932495117, + "learning_rate": 5.980839225201858e-06, + "loss": 1.5883, + "step": 16756 + }, + { + "epoch": 2.1, + "grad_norm": 10.26650333404541, + "learning_rate": 5.980002510145171e-06, + "loss": 0.6201, + "step": 16757 + }, + { + "epoch": 2.1, + "grad_norm": 20.81313705444336, + "learning_rate": 5.979165795088483e-06, + "loss": 0.9087, + "step": 16758 + }, + { + "epoch": 2.1, + "grad_norm": 5.574885845184326, + "learning_rate": 5.978329080031796e-06, + "loss": 0.2811, + "step": 16759 + }, + { + "epoch": 2.1, + "grad_norm": 51.55418014526367, + "learning_rate": 5.977492364975108e-06, + "loss": 2.3663, + "step": 16760 + }, + { + "epoch": 2.1, + "grad_norm": 7.015112400054932, + "learning_rate": 5.97665564991842e-06, + "loss": 0.1887, + "step": 16761 + }, + { + "epoch": 2.1, + "grad_norm": 28.71893310546875, + "learning_rate": 5.975818934861733e-06, + "loss": 2.9956, + "step": 16762 + }, + { + "epoch": 2.1, + "grad_norm": 21.74225425720215, + "learning_rate": 5.974982219805046e-06, + "loss": 1.2374, + "step": 16763 + }, + { + "epoch": 2.1, + "grad_norm": 12.860550880432129, + "learning_rate": 5.974145504748359e-06, + "loss": 1.1596, + "step": 16764 + }, + { + "epoch": 2.1, + "grad_norm": 14.887338638305664, + "learning_rate": 5.973308789691671e-06, + "loss": 1.5552, + "step": 16765 + }, + { + "epoch": 2.1, + "grad_norm": 24.139747619628906, + "learning_rate": 5.972472074634984e-06, + "loss": 1.1814, + "step": 16766 + }, + { + "epoch": 2.1, + "grad_norm": 18.169158935546875, + "learning_rate": 5.971635359578296e-06, + "loss": 0.5871, + "step": 16767 + }, + { + "epoch": 2.1, + "grad_norm": 11.368548393249512, + "learning_rate": 5.970798644521608e-06, + "loss": 1.3178, + "step": 16768 + }, + { + "epoch": 2.1, + "grad_norm": 10.630375862121582, + "learning_rate": 5.969961929464921e-06, + "loss": 2.2562, + "step": 16769 + }, + { + "epoch": 2.1, + "grad_norm": 23.951719284057617, + "learning_rate": 5.969125214408234e-06, + "loss": 1.269, + "step": 16770 + }, + { + "epoch": 2.1, + "grad_norm": 4.449909687042236, + "learning_rate": 5.968288499351547e-06, + "loss": 1.3628, + "step": 16771 + }, + { + "epoch": 2.1, + "grad_norm": 35.25612258911133, + "learning_rate": 5.9674517842948586e-06, + "loss": 1.2419, + "step": 16772 + }, + { + "epoch": 2.1, + "grad_norm": 4.990248680114746, + "learning_rate": 5.966615069238172e-06, + "loss": 0.1098, + "step": 16773 + }, + { + "epoch": 2.11, + "grad_norm": 23.586139678955078, + "learning_rate": 5.965778354181484e-06, + "loss": 1.5185, + "step": 16774 + }, + { + "epoch": 2.11, + "grad_norm": 12.270256996154785, + "learning_rate": 5.964941639124796e-06, + "loss": 0.7521, + "step": 16775 + }, + { + "epoch": 2.11, + "grad_norm": 12.243736267089844, + "learning_rate": 5.964104924068109e-06, + "loss": 0.8228, + "step": 16776 + }, + { + "epoch": 2.11, + "grad_norm": 31.726364135742188, + "learning_rate": 5.963268209011422e-06, + "loss": 2.5089, + "step": 16777 + }, + { + "epoch": 2.11, + "grad_norm": 105.982421875, + "learning_rate": 5.9624314939547345e-06, + "loss": 1.3511, + "step": 16778 + }, + { + "epoch": 2.11, + "grad_norm": 9.604047775268555, + "learning_rate": 5.9615947788980465e-06, + "loss": 0.3661, + "step": 16779 + }, + { + "epoch": 2.11, + "grad_norm": 13.931992530822754, + "learning_rate": 5.96075806384136e-06, + "loss": 1.4841, + "step": 16780 + }, + { + "epoch": 2.11, + "grad_norm": 55.37815856933594, + "learning_rate": 5.959921348784672e-06, + "loss": 1.8066, + "step": 16781 + }, + { + "epoch": 2.11, + "grad_norm": 30.015426635742188, + "learning_rate": 5.959084633727984e-06, + "loss": 2.854, + "step": 16782 + }, + { + "epoch": 2.11, + "grad_norm": 18.840238571166992, + "learning_rate": 5.958247918671297e-06, + "loss": 1.0707, + "step": 16783 + }, + { + "epoch": 2.11, + "grad_norm": 5.141613006591797, + "learning_rate": 5.95741120361461e-06, + "loss": 0.6381, + "step": 16784 + }, + { + "epoch": 2.11, + "grad_norm": 44.44023132324219, + "learning_rate": 5.9565744885579225e-06, + "loss": 2.2979, + "step": 16785 + }, + { + "epoch": 2.11, + "grad_norm": 9.82348918914795, + "learning_rate": 5.955737773501234e-06, + "loss": 0.6119, + "step": 16786 + }, + { + "epoch": 2.11, + "grad_norm": 4.065723419189453, + "learning_rate": 5.954901058444548e-06, + "loss": 0.1256, + "step": 16787 + }, + { + "epoch": 2.11, + "grad_norm": 16.968774795532227, + "learning_rate": 5.95406434338786e-06, + "loss": 1.3901, + "step": 16788 + }, + { + "epoch": 2.11, + "grad_norm": 15.727312088012695, + "learning_rate": 5.953227628331172e-06, + "loss": 1.1866, + "step": 16789 + }, + { + "epoch": 2.11, + "grad_norm": 25.21003532409668, + "learning_rate": 5.952390913274485e-06, + "loss": 1.9156, + "step": 16790 + }, + { + "epoch": 2.11, + "grad_norm": 15.855225563049316, + "learning_rate": 5.951554198217797e-06, + "loss": 0.7767, + "step": 16791 + }, + { + "epoch": 2.11, + "grad_norm": 22.47800636291504, + "learning_rate": 5.95071748316111e-06, + "loss": 1.3269, + "step": 16792 + }, + { + "epoch": 2.11, + "grad_norm": 18.00648307800293, + "learning_rate": 5.949880768104422e-06, + "loss": 1.5794, + "step": 16793 + }, + { + "epoch": 2.11, + "grad_norm": 9.49789810180664, + "learning_rate": 5.949044053047736e-06, + "loss": 0.613, + "step": 16794 + }, + { + "epoch": 2.11, + "grad_norm": 1.8184410333633423, + "learning_rate": 5.948207337991048e-06, + "loss": 0.0311, + "step": 16795 + }, + { + "epoch": 2.11, + "grad_norm": 7.746805191040039, + "learning_rate": 5.94737062293436e-06, + "loss": 0.7932, + "step": 16796 + }, + { + "epoch": 2.11, + "grad_norm": 20.15648651123047, + "learning_rate": 5.946533907877673e-06, + "loss": 1.8011, + "step": 16797 + }, + { + "epoch": 2.11, + "grad_norm": 38.610687255859375, + "learning_rate": 5.945697192820985e-06, + "loss": 2.9669, + "step": 16798 + }, + { + "epoch": 2.11, + "grad_norm": 11.16541862487793, + "learning_rate": 5.944860477764298e-06, + "loss": 0.4904, + "step": 16799 + }, + { + "epoch": 2.11, + "grad_norm": 4.147881507873535, + "learning_rate": 5.94402376270761e-06, + "loss": 0.1599, + "step": 16800 + }, + { + "epoch": 2.11, + "eval_loss": 0.10721652209758759, + "eval_runtime": 95.8916, + "eval_samples_per_second": 36.938, + "eval_steps_per_second": 36.938, + "step": 16800 + }, + { + "epoch": 2.11, + "grad_norm": 20.84210968017578, + "learning_rate": 5.943187047650923e-06, + "loss": 1.1602, + "step": 16801 + }, + { + "epoch": 2.11, + "grad_norm": 14.796680450439453, + "learning_rate": 5.942350332594236e-06, + "loss": 0.8441, + "step": 16802 + }, + { + "epoch": 2.11, + "grad_norm": 18.466554641723633, + "learning_rate": 5.941513617537548e-06, + "loss": 0.6331, + "step": 16803 + }, + { + "epoch": 2.11, + "grad_norm": 5.697744846343994, + "learning_rate": 5.940676902480861e-06, + "loss": 0.2152, + "step": 16804 + }, + { + "epoch": 2.11, + "grad_norm": 46.678199768066406, + "learning_rate": 5.9398401874241726e-06, + "loss": 1.6034, + "step": 16805 + }, + { + "epoch": 2.11, + "grad_norm": 9.598953247070312, + "learning_rate": 5.939003472367486e-06, + "loss": 1.4654, + "step": 16806 + }, + { + "epoch": 2.11, + "grad_norm": 7.446468830108643, + "learning_rate": 5.938166757310798e-06, + "loss": 0.3296, + "step": 16807 + }, + { + "epoch": 2.11, + "grad_norm": 9.030505180358887, + "learning_rate": 5.937330042254111e-06, + "loss": 1.5311, + "step": 16808 + }, + { + "epoch": 2.11, + "grad_norm": 7.710502624511719, + "learning_rate": 5.936493327197424e-06, + "loss": 1.4079, + "step": 16809 + }, + { + "epoch": 2.11, + "grad_norm": 21.966482162475586, + "learning_rate": 5.935656612140736e-06, + "loss": 2.6049, + "step": 16810 + }, + { + "epoch": 2.11, + "grad_norm": 16.916112899780273, + "learning_rate": 5.9348198970840485e-06, + "loss": 1.845, + "step": 16811 + }, + { + "epoch": 2.11, + "grad_norm": 2.8744070529937744, + "learning_rate": 5.9339831820273605e-06, + "loss": 0.0495, + "step": 16812 + }, + { + "epoch": 2.11, + "grad_norm": 20.617137908935547, + "learning_rate": 5.933146466970674e-06, + "loss": 0.9642, + "step": 16813 + }, + { + "epoch": 2.11, + "grad_norm": 134.33131408691406, + "learning_rate": 5.932309751913986e-06, + "loss": 1.755, + "step": 16814 + }, + { + "epoch": 2.11, + "grad_norm": 5.855226516723633, + "learning_rate": 5.931473036857299e-06, + "loss": 0.2101, + "step": 16815 + }, + { + "epoch": 2.11, + "grad_norm": 10.540989875793457, + "learning_rate": 5.930636321800612e-06, + "loss": 0.3734, + "step": 16816 + }, + { + "epoch": 2.11, + "grad_norm": 4.118357181549072, + "learning_rate": 5.929799606743924e-06, + "loss": 1.5305, + "step": 16817 + }, + { + "epoch": 2.11, + "grad_norm": 5.998538970947266, + "learning_rate": 5.9289628916872364e-06, + "loss": 0.4855, + "step": 16818 + }, + { + "epoch": 2.11, + "grad_norm": 9.374042510986328, + "learning_rate": 5.928126176630548e-06, + "loss": 0.599, + "step": 16819 + }, + { + "epoch": 2.11, + "grad_norm": 21.988372802734375, + "learning_rate": 5.927289461573862e-06, + "loss": 0.656, + "step": 16820 + }, + { + "epoch": 2.11, + "grad_norm": 19.108642578125, + "learning_rate": 5.926452746517174e-06, + "loss": 0.8623, + "step": 16821 + }, + { + "epoch": 2.11, + "grad_norm": 9.66044807434082, + "learning_rate": 5.925616031460487e-06, + "loss": 0.7023, + "step": 16822 + }, + { + "epoch": 2.11, + "grad_norm": 10.207428932189941, + "learning_rate": 5.9247793164038e-06, + "loss": 0.6425, + "step": 16823 + }, + { + "epoch": 2.11, + "grad_norm": 2.872533082962036, + "learning_rate": 5.9239426013471116e-06, + "loss": 0.0445, + "step": 16824 + }, + { + "epoch": 2.11, + "grad_norm": 2.80725359916687, + "learning_rate": 5.923105886290424e-06, + "loss": 0.0798, + "step": 16825 + }, + { + "epoch": 2.11, + "grad_norm": 7.395112037658691, + "learning_rate": 5.922269171233736e-06, + "loss": 0.136, + "step": 16826 + }, + { + "epoch": 2.11, + "grad_norm": 24.68223762512207, + "learning_rate": 5.92143245617705e-06, + "loss": 1.9311, + "step": 16827 + }, + { + "epoch": 2.11, + "grad_norm": 16.709436416625977, + "learning_rate": 5.920595741120362e-06, + "loss": 1.2259, + "step": 16828 + }, + { + "epoch": 2.11, + "grad_norm": 37.04315948486328, + "learning_rate": 5.919759026063675e-06, + "loss": 0.8694, + "step": 16829 + }, + { + "epoch": 2.11, + "grad_norm": 38.659637451171875, + "learning_rate": 5.9189223110069875e-06, + "loss": 1.38, + "step": 16830 + }, + { + "epoch": 2.11, + "grad_norm": 100.3625717163086, + "learning_rate": 5.9180855959502995e-06, + "loss": 1.122, + "step": 16831 + }, + { + "epoch": 2.11, + "grad_norm": 84.5021743774414, + "learning_rate": 5.917248880893612e-06, + "loss": 0.9857, + "step": 16832 + }, + { + "epoch": 2.11, + "grad_norm": 11.296195983886719, + "learning_rate": 5.916412165836924e-06, + "loss": 0.6122, + "step": 16833 + }, + { + "epoch": 2.11, + "grad_norm": 6.25674295425415, + "learning_rate": 5.915575450780238e-06, + "loss": 1.6377, + "step": 16834 + }, + { + "epoch": 2.11, + "grad_norm": 12.446991920471191, + "learning_rate": 5.91473873572355e-06, + "loss": 2.3191, + "step": 16835 + }, + { + "epoch": 2.11, + "grad_norm": 83.3683853149414, + "learning_rate": 5.913902020666863e-06, + "loss": 2.8401, + "step": 16836 + }, + { + "epoch": 2.11, + "grad_norm": 16.350399017333984, + "learning_rate": 5.913065305610175e-06, + "loss": 0.758, + "step": 16837 + }, + { + "epoch": 2.11, + "grad_norm": 17.3767032623291, + "learning_rate": 5.912228590553487e-06, + "loss": 1.6186, + "step": 16838 + }, + { + "epoch": 2.11, + "grad_norm": 21.542068481445312, + "learning_rate": 5.9113918754968e-06, + "loss": 2.4041, + "step": 16839 + }, + { + "epoch": 2.11, + "grad_norm": 16.57794952392578, + "learning_rate": 5.910555160440112e-06, + "loss": 0.9186, + "step": 16840 + }, + { + "epoch": 2.11, + "grad_norm": 30.77682113647461, + "learning_rate": 5.909718445383426e-06, + "loss": 0.3596, + "step": 16841 + }, + { + "epoch": 2.11, + "grad_norm": 22.59938621520996, + "learning_rate": 5.908881730326738e-06, + "loss": 0.8914, + "step": 16842 + }, + { + "epoch": 2.11, + "grad_norm": 107.36776733398438, + "learning_rate": 5.9080450152700506e-06, + "loss": 1.1896, + "step": 16843 + }, + { + "epoch": 2.11, + "grad_norm": 65.2825698852539, + "learning_rate": 5.9072083002133625e-06, + "loss": 0.9863, + "step": 16844 + }, + { + "epoch": 2.11, + "grad_norm": 16.80819320678711, + "learning_rate": 5.906371585156675e-06, + "loss": 0.5168, + "step": 16845 + }, + { + "epoch": 2.11, + "grad_norm": 41.62934112548828, + "learning_rate": 5.905534870099988e-06, + "loss": 2.5369, + "step": 16846 + }, + { + "epoch": 2.11, + "grad_norm": 15.434104919433594, + "learning_rate": 5.9046981550433e-06, + "loss": 2.1379, + "step": 16847 + }, + { + "epoch": 2.11, + "grad_norm": 4.339593887329102, + "learning_rate": 5.903861439986614e-06, + "loss": 0.4649, + "step": 16848 + }, + { + "epoch": 2.11, + "grad_norm": 16.37318229675293, + "learning_rate": 5.903024724929926e-06, + "loss": 0.8275, + "step": 16849 + }, + { + "epoch": 2.11, + "grad_norm": 13.673310279846191, + "learning_rate": 5.9021880098732385e-06, + "loss": 0.9344, + "step": 16850 + }, + { + "epoch": 2.11, + "grad_norm": 11.147366523742676, + "learning_rate": 5.9013512948165504e-06, + "loss": 2.0417, + "step": 16851 + }, + { + "epoch": 2.11, + "grad_norm": 12.443256378173828, + "learning_rate": 5.900514579759863e-06, + "loss": 1.7393, + "step": 16852 + }, + { + "epoch": 2.12, + "grad_norm": 12.487602233886719, + "learning_rate": 5.899677864703176e-06, + "loss": 1.4971, + "step": 16853 + }, + { + "epoch": 2.12, + "grad_norm": 12.259377479553223, + "learning_rate": 5.898841149646488e-06, + "loss": 0.9677, + "step": 16854 + }, + { + "epoch": 2.12, + "grad_norm": 17.900497436523438, + "learning_rate": 5.898004434589802e-06, + "loss": 0.7282, + "step": 16855 + }, + { + "epoch": 2.12, + "grad_norm": 12.962688446044922, + "learning_rate": 5.897167719533114e-06, + "loss": 1.843, + "step": 16856 + }, + { + "epoch": 2.12, + "grad_norm": 15.524505615234375, + "learning_rate": 5.896331004476426e-06, + "loss": 0.54, + "step": 16857 + }, + { + "epoch": 2.12, + "grad_norm": 13.852781295776367, + "learning_rate": 5.895494289419738e-06, + "loss": 0.4138, + "step": 16858 + }, + { + "epoch": 2.12, + "grad_norm": 115.11148071289062, + "learning_rate": 5.894657574363051e-06, + "loss": 2.0038, + "step": 16859 + }, + { + "epoch": 2.12, + "grad_norm": 24.396728515625, + "learning_rate": 5.893820859306364e-06, + "loss": 1.4907, + "step": 16860 + }, + { + "epoch": 2.12, + "grad_norm": 17.910776138305664, + "learning_rate": 5.892984144249676e-06, + "loss": 1.9196, + "step": 16861 + }, + { + "epoch": 2.12, + "grad_norm": 16.473308563232422, + "learning_rate": 5.8921474291929896e-06, + "loss": 2.9123, + "step": 16862 + }, + { + "epoch": 2.12, + "grad_norm": 9.617788314819336, + "learning_rate": 5.8913107141363015e-06, + "loss": 0.4149, + "step": 16863 + }, + { + "epoch": 2.12, + "grad_norm": 147.6373291015625, + "learning_rate": 5.8904739990796135e-06, + "loss": 0.584, + "step": 16864 + }, + { + "epoch": 2.12, + "grad_norm": 7.813459873199463, + "learning_rate": 5.889637284022926e-06, + "loss": 0.715, + "step": 16865 + }, + { + "epoch": 2.12, + "grad_norm": 4.4829301834106445, + "learning_rate": 5.888800568966238e-06, + "loss": 0.4809, + "step": 16866 + }, + { + "epoch": 2.12, + "grad_norm": 26.605653762817383, + "learning_rate": 5.887963853909552e-06, + "loss": 1.7967, + "step": 16867 + }, + { + "epoch": 2.12, + "grad_norm": 13.526481628417969, + "learning_rate": 5.887127138852864e-06, + "loss": 0.9315, + "step": 16868 + }, + { + "epoch": 2.12, + "grad_norm": 4.498909950256348, + "learning_rate": 5.8862904237961775e-06, + "loss": 0.289, + "step": 16869 + }, + { + "epoch": 2.12, + "grad_norm": 10.260990142822266, + "learning_rate": 5.8854537087394894e-06, + "loss": 1.0975, + "step": 16870 + }, + { + "epoch": 2.12, + "grad_norm": 3.1513917446136475, + "learning_rate": 5.884616993682801e-06, + "loss": 0.172, + "step": 16871 + }, + { + "epoch": 2.12, + "grad_norm": 24.654848098754883, + "learning_rate": 5.883780278626114e-06, + "loss": 0.7593, + "step": 16872 + }, + { + "epoch": 2.12, + "grad_norm": 4.956201076507568, + "learning_rate": 5.882943563569426e-06, + "loss": 1.1053, + "step": 16873 + }, + { + "epoch": 2.12, + "grad_norm": 18.203449249267578, + "learning_rate": 5.88210684851274e-06, + "loss": 0.6404, + "step": 16874 + }, + { + "epoch": 2.12, + "grad_norm": 18.785099029541016, + "learning_rate": 5.881270133456052e-06, + "loss": 1.0799, + "step": 16875 + }, + { + "epoch": 2.12, + "grad_norm": 28.527324676513672, + "learning_rate": 5.880433418399365e-06, + "loss": 1.4479, + "step": 16876 + }, + { + "epoch": 2.12, + "grad_norm": 6.801303386688232, + "learning_rate": 5.879596703342677e-06, + "loss": 0.8484, + "step": 16877 + }, + { + "epoch": 2.12, + "grad_norm": 15.764060974121094, + "learning_rate": 5.878759988285989e-06, + "loss": 0.7352, + "step": 16878 + }, + { + "epoch": 2.12, + "grad_norm": 21.629823684692383, + "learning_rate": 5.877923273229302e-06, + "loss": 0.6906, + "step": 16879 + }, + { + "epoch": 2.12, + "grad_norm": 3.686356782913208, + "learning_rate": 5.877086558172614e-06, + "loss": 0.1907, + "step": 16880 + }, + { + "epoch": 2.12, + "grad_norm": 12.804296493530273, + "learning_rate": 5.876249843115928e-06, + "loss": 0.7735, + "step": 16881 + }, + { + "epoch": 2.12, + "grad_norm": 14.29104995727539, + "learning_rate": 5.87541312805924e-06, + "loss": 0.9907, + "step": 16882 + }, + { + "epoch": 2.12, + "grad_norm": 95.18588256835938, + "learning_rate": 5.8745764130025525e-06, + "loss": 2.5333, + "step": 16883 + }, + { + "epoch": 2.12, + "grad_norm": 14.519478797912598, + "learning_rate": 5.873739697945865e-06, + "loss": 0.5038, + "step": 16884 + }, + { + "epoch": 2.12, + "grad_norm": 110.04188537597656, + "learning_rate": 5.872902982889177e-06, + "loss": 3.6262, + "step": 16885 + }, + { + "epoch": 2.12, + "grad_norm": 242.27322387695312, + "learning_rate": 5.87206626783249e-06, + "loss": 0.5165, + "step": 16886 + }, + { + "epoch": 2.12, + "grad_norm": 1.291407585144043, + "learning_rate": 5.871229552775802e-06, + "loss": 0.0267, + "step": 16887 + }, + { + "epoch": 2.12, + "grad_norm": 85.1260986328125, + "learning_rate": 5.870392837719116e-06, + "loss": 0.9073, + "step": 16888 + }, + { + "epoch": 2.12, + "grad_norm": 10.375450134277344, + "learning_rate": 5.869556122662428e-06, + "loss": 1.2339, + "step": 16889 + }, + { + "epoch": 2.12, + "grad_norm": 5.949151992797852, + "learning_rate": 5.86871940760574e-06, + "loss": 0.2236, + "step": 16890 + }, + { + "epoch": 2.12, + "grad_norm": 12.326302528381348, + "learning_rate": 5.867882692549053e-06, + "loss": 1.0115, + "step": 16891 + }, + { + "epoch": 2.12, + "grad_norm": 9.207403182983398, + "learning_rate": 5.867045977492365e-06, + "loss": 0.4804, + "step": 16892 + }, + { + "epoch": 2.12, + "grad_norm": 7.769590377807617, + "learning_rate": 5.866209262435678e-06, + "loss": 0.6457, + "step": 16893 + }, + { + "epoch": 2.12, + "grad_norm": 23.980419158935547, + "learning_rate": 5.86537254737899e-06, + "loss": 1.0284, + "step": 16894 + }, + { + "epoch": 2.12, + "grad_norm": 20.030860900878906, + "learning_rate": 5.8645358323223035e-06, + "loss": 0.8965, + "step": 16895 + }, + { + "epoch": 2.12, + "grad_norm": 12.609292984008789, + "learning_rate": 5.8636991172656155e-06, + "loss": 0.9808, + "step": 16896 + }, + { + "epoch": 2.12, + "grad_norm": 9.369558334350586, + "learning_rate": 5.862862402208928e-06, + "loss": 3.3645, + "step": 16897 + }, + { + "epoch": 2.12, + "grad_norm": 21.751758575439453, + "learning_rate": 5.862025687152241e-06, + "loss": 2.034, + "step": 16898 + }, + { + "epoch": 2.12, + "grad_norm": 19.85375213623047, + "learning_rate": 5.861188972095553e-06, + "loss": 1.2635, + "step": 16899 + }, + { + "epoch": 2.12, + "grad_norm": 90.26131439208984, + "learning_rate": 5.860352257038866e-06, + "loss": 1.43, + "step": 16900 + }, + { + "epoch": 2.12, + "grad_norm": 18.804216384887695, + "learning_rate": 5.859515541982178e-06, + "loss": 0.8194, + "step": 16901 + }, + { + "epoch": 2.12, + "grad_norm": 9.671285629272461, + "learning_rate": 5.8586788269254915e-06, + "loss": 0.9071, + "step": 16902 + }, + { + "epoch": 2.12, + "grad_norm": 7.237061977386475, + "learning_rate": 5.857842111868803e-06, + "loss": 0.4013, + "step": 16903 + }, + { + "epoch": 2.12, + "grad_norm": 29.724605560302734, + "learning_rate": 5.857005396812116e-06, + "loss": 1.1553, + "step": 16904 + }, + { + "epoch": 2.12, + "grad_norm": 36.172664642333984, + "learning_rate": 5.856168681755429e-06, + "loss": 0.7432, + "step": 16905 + }, + { + "epoch": 2.12, + "grad_norm": 44.96842575073242, + "learning_rate": 5.855331966698741e-06, + "loss": 0.6883, + "step": 16906 + }, + { + "epoch": 2.12, + "grad_norm": 65.7833251953125, + "learning_rate": 5.854495251642054e-06, + "loss": 2.1407, + "step": 16907 + }, + { + "epoch": 2.12, + "grad_norm": 6.288467884063721, + "learning_rate": 5.853658536585366e-06, + "loss": 0.1448, + "step": 16908 + }, + { + "epoch": 2.12, + "grad_norm": 9.45230484008789, + "learning_rate": 5.852821821528679e-06, + "loss": 0.4311, + "step": 16909 + }, + { + "epoch": 2.12, + "grad_norm": 23.460378646850586, + "learning_rate": 5.851985106471991e-06, + "loss": 1.4705, + "step": 16910 + }, + { + "epoch": 2.12, + "grad_norm": 8.106704711914062, + "learning_rate": 5.851148391415304e-06, + "loss": 2.52, + "step": 16911 + }, + { + "epoch": 2.12, + "grad_norm": 95.96499633789062, + "learning_rate": 5.850311676358617e-06, + "loss": 1.4573, + "step": 16912 + }, + { + "epoch": 2.12, + "grad_norm": 19.770360946655273, + "learning_rate": 5.849474961301929e-06, + "loss": 0.7553, + "step": 16913 + }, + { + "epoch": 2.12, + "grad_norm": 10.828676223754883, + "learning_rate": 5.848638246245242e-06, + "loss": 0.1883, + "step": 16914 + }, + { + "epoch": 2.12, + "grad_norm": 11.930998802185059, + "learning_rate": 5.847801531188554e-06, + "loss": 0.6466, + "step": 16915 + }, + { + "epoch": 2.12, + "grad_norm": 17.37325096130371, + "learning_rate": 5.846964816131867e-06, + "loss": 1.3859, + "step": 16916 + }, + { + "epoch": 2.12, + "grad_norm": 5.8114213943481445, + "learning_rate": 5.846128101075179e-06, + "loss": 0.3087, + "step": 16917 + }, + { + "epoch": 2.12, + "grad_norm": 13.317852973937988, + "learning_rate": 5.845291386018492e-06, + "loss": 1.6657, + "step": 16918 + }, + { + "epoch": 2.12, + "grad_norm": 15.078089714050293, + "learning_rate": 5.844454670961804e-06, + "loss": 0.8716, + "step": 16919 + }, + { + "epoch": 2.12, + "grad_norm": 8.380730628967285, + "learning_rate": 5.843617955905117e-06, + "loss": 0.592, + "step": 16920 + }, + { + "epoch": 2.12, + "grad_norm": 9.53822135925293, + "learning_rate": 5.84278124084843e-06, + "loss": 1.629, + "step": 16921 + }, + { + "epoch": 2.12, + "grad_norm": 7.400240421295166, + "learning_rate": 5.8419445257917416e-06, + "loss": 0.4703, + "step": 16922 + }, + { + "epoch": 2.12, + "grad_norm": 11.016195297241211, + "learning_rate": 5.841107810735055e-06, + "loss": 0.6054, + "step": 16923 + }, + { + "epoch": 2.12, + "grad_norm": 188.6097869873047, + "learning_rate": 5.840271095678367e-06, + "loss": 1.6023, + "step": 16924 + }, + { + "epoch": 2.12, + "grad_norm": 7.374948978424072, + "learning_rate": 5.83943438062168e-06, + "loss": 1.0421, + "step": 16925 + }, + { + "epoch": 2.12, + "grad_norm": 11.707968711853027, + "learning_rate": 5.838597665564992e-06, + "loss": 0.514, + "step": 16926 + }, + { + "epoch": 2.12, + "grad_norm": 10.491926193237305, + "learning_rate": 5.837760950508305e-06, + "loss": 0.6111, + "step": 16927 + }, + { + "epoch": 2.12, + "grad_norm": 8.48752498626709, + "learning_rate": 5.8369242354516175e-06, + "loss": 0.2845, + "step": 16928 + }, + { + "epoch": 2.12, + "grad_norm": 23.22374725341797, + "learning_rate": 5.8360875203949295e-06, + "loss": 0.632, + "step": 16929 + }, + { + "epoch": 2.12, + "grad_norm": 9.480890274047852, + "learning_rate": 5.835250805338243e-06, + "loss": 0.9876, + "step": 16930 + }, + { + "epoch": 2.12, + "grad_norm": 25.120214462280273, + "learning_rate": 5.834414090281555e-06, + "loss": 1.0777, + "step": 16931 + }, + { + "epoch": 2.12, + "grad_norm": 18.795116424560547, + "learning_rate": 5.833577375224868e-06, + "loss": 2.1714, + "step": 16932 + }, + { + "epoch": 2.13, + "grad_norm": 37.169952392578125, + "learning_rate": 5.83274066016818e-06, + "loss": 1.6353, + "step": 16933 + }, + { + "epoch": 2.13, + "grad_norm": 12.385771751403809, + "learning_rate": 5.831903945111493e-06, + "loss": 0.9374, + "step": 16934 + }, + { + "epoch": 2.13, + "grad_norm": 7.301438331604004, + "learning_rate": 5.8310672300548055e-06, + "loss": 0.4351, + "step": 16935 + }, + { + "epoch": 2.13, + "grad_norm": 23.69075584411621, + "learning_rate": 5.830230514998117e-06, + "loss": 1.438, + "step": 16936 + }, + { + "epoch": 2.13, + "grad_norm": 8.792628288269043, + "learning_rate": 5.829393799941431e-06, + "loss": 0.4614, + "step": 16937 + }, + { + "epoch": 2.13, + "grad_norm": 35.86687088012695, + "learning_rate": 5.828557084884743e-06, + "loss": 1.3342, + "step": 16938 + }, + { + "epoch": 2.13, + "grad_norm": 12.236211776733398, + "learning_rate": 5.827720369828056e-06, + "loss": 0.7526, + "step": 16939 + }, + { + "epoch": 2.13, + "grad_norm": 8.944195747375488, + "learning_rate": 5.826883654771368e-06, + "loss": 0.834, + "step": 16940 + }, + { + "epoch": 2.13, + "grad_norm": 20.822357177734375, + "learning_rate": 5.8260469397146806e-06, + "loss": 2.4715, + "step": 16941 + }, + { + "epoch": 2.13, + "grad_norm": 17.76325035095215, + "learning_rate": 5.825210224657993e-06, + "loss": 1.6494, + "step": 16942 + }, + { + "epoch": 2.13, + "grad_norm": 19.366193771362305, + "learning_rate": 5.824373509601305e-06, + "loss": 2.7357, + "step": 16943 + }, + { + "epoch": 2.13, + "grad_norm": 3.908167600631714, + "learning_rate": 5.823536794544619e-06, + "loss": 0.1785, + "step": 16944 + }, + { + "epoch": 2.13, + "grad_norm": 11.59701156616211, + "learning_rate": 5.822700079487931e-06, + "loss": 1.4952, + "step": 16945 + }, + { + "epoch": 2.13, + "grad_norm": 15.114361763000488, + "learning_rate": 5.821863364431244e-06, + "loss": 3.0703, + "step": 16946 + }, + { + "epoch": 2.13, + "grad_norm": 18.187007904052734, + "learning_rate": 5.821026649374556e-06, + "loss": 0.8608, + "step": 16947 + }, + { + "epoch": 2.13, + "grad_norm": 4.346529006958008, + "learning_rate": 5.820189934317868e-06, + "loss": 0.6006, + "step": 16948 + }, + { + "epoch": 2.13, + "grad_norm": 12.202530860900879, + "learning_rate": 5.819353219261181e-06, + "loss": 0.4576, + "step": 16949 + }, + { + "epoch": 2.13, + "grad_norm": 13.990489959716797, + "learning_rate": 5.818516504204493e-06, + "loss": 1.1319, + "step": 16950 + }, + { + "epoch": 2.13, + "grad_norm": 11.866337776184082, + "learning_rate": 5.817679789147807e-06, + "loss": 1.7133, + "step": 16951 + }, + { + "epoch": 2.13, + "grad_norm": 14.335285186767578, + "learning_rate": 5.816843074091119e-06, + "loss": 0.4969, + "step": 16952 + }, + { + "epoch": 2.13, + "grad_norm": 23.00996971130371, + "learning_rate": 5.816006359034432e-06, + "loss": 0.8737, + "step": 16953 + }, + { + "epoch": 2.13, + "grad_norm": 29.987459182739258, + "learning_rate": 5.815169643977744e-06, + "loss": 1.7031, + "step": 16954 + }, + { + "epoch": 2.13, + "grad_norm": 16.34398651123047, + "learning_rate": 5.8143329289210556e-06, + "loss": 0.7954, + "step": 16955 + }, + { + "epoch": 2.13, + "grad_norm": 8.641459465026855, + "learning_rate": 5.813496213864369e-06, + "loss": 0.3977, + "step": 16956 + }, + { + "epoch": 2.13, + "grad_norm": 22.90567970275879, + "learning_rate": 5.812659498807681e-06, + "loss": 1.2081, + "step": 16957 + }, + { + "epoch": 2.13, + "grad_norm": 5.746809482574463, + "learning_rate": 5.811822783750995e-06, + "loss": 0.4631, + "step": 16958 + }, + { + "epoch": 2.13, + "grad_norm": 4.130385398864746, + "learning_rate": 5.810986068694307e-06, + "loss": 0.3609, + "step": 16959 + }, + { + "epoch": 2.13, + "grad_norm": 10.714170455932617, + "learning_rate": 5.8101493536376196e-06, + "loss": 0.6854, + "step": 16960 + }, + { + "epoch": 2.13, + "grad_norm": 16.156368255615234, + "learning_rate": 5.8093126385809315e-06, + "loss": 0.9861, + "step": 16961 + }, + { + "epoch": 2.13, + "grad_norm": 10.297343254089355, + "learning_rate": 5.8084759235242435e-06, + "loss": 0.4309, + "step": 16962 + }, + { + "epoch": 2.13, + "grad_norm": 62.99007034301758, + "learning_rate": 5.807639208467557e-06, + "loss": 1.1902, + "step": 16963 + }, + { + "epoch": 2.13, + "grad_norm": 61.34621810913086, + "learning_rate": 5.806802493410869e-06, + "loss": 2.0277, + "step": 16964 + }, + { + "epoch": 2.13, + "grad_norm": 10.976434707641602, + "learning_rate": 5.805965778354182e-06, + "loss": 1.5143, + "step": 16965 + }, + { + "epoch": 2.13, + "grad_norm": 24.809417724609375, + "learning_rate": 5.805129063297495e-06, + "loss": 1.0778, + "step": 16966 + }, + { + "epoch": 2.13, + "grad_norm": 27.914297103881836, + "learning_rate": 5.8042923482408075e-06, + "loss": 0.6749, + "step": 16967 + }, + { + "epoch": 2.13, + "grad_norm": 11.465100288391113, + "learning_rate": 5.8034556331841194e-06, + "loss": 0.7415, + "step": 16968 + }, + { + "epoch": 2.13, + "grad_norm": 8.154410362243652, + "learning_rate": 5.802618918127431e-06, + "loss": 1.5581, + "step": 16969 + }, + { + "epoch": 2.13, + "grad_norm": 15.883102416992188, + "learning_rate": 5.801782203070745e-06, + "loss": 1.1446, + "step": 16970 + }, + { + "epoch": 2.13, + "grad_norm": 23.598388671875, + "learning_rate": 5.800945488014057e-06, + "loss": 1.029, + "step": 16971 + }, + { + "epoch": 2.13, + "grad_norm": 7.744470596313477, + "learning_rate": 5.80010877295737e-06, + "loss": 0.2278, + "step": 16972 + }, + { + "epoch": 2.13, + "grad_norm": 9.633336067199707, + "learning_rate": 5.799272057900683e-06, + "loss": 0.7205, + "step": 16973 + }, + { + "epoch": 2.13, + "grad_norm": 13.613395690917969, + "learning_rate": 5.798435342843995e-06, + "loss": 2.3816, + "step": 16974 + }, + { + "epoch": 2.13, + "grad_norm": 27.28310203552246, + "learning_rate": 5.797598627787307e-06, + "loss": 1.708, + "step": 16975 + }, + { + "epoch": 2.13, + "grad_norm": 23.683673858642578, + "learning_rate": 5.796761912730619e-06, + "loss": 1.975, + "step": 16976 + }, + { + "epoch": 2.13, + "grad_norm": 17.8934268951416, + "learning_rate": 5.795925197673933e-06, + "loss": 0.2576, + "step": 16977 + }, + { + "epoch": 2.13, + "grad_norm": 18.01639175415039, + "learning_rate": 5.795088482617245e-06, + "loss": 1.0333, + "step": 16978 + }, + { + "epoch": 2.13, + "grad_norm": 6.331151485443115, + "learning_rate": 5.794251767560558e-06, + "loss": 0.6816, + "step": 16979 + }, + { + "epoch": 2.13, + "grad_norm": 21.129030227661133, + "learning_rate": 5.7934150525038705e-06, + "loss": 1.8913, + "step": 16980 + }, + { + "epoch": 2.13, + "grad_norm": 7.894270896911621, + "learning_rate": 5.792578337447183e-06, + "loss": 1.8436, + "step": 16981 + }, + { + "epoch": 2.13, + "grad_norm": 22.673084259033203, + "learning_rate": 5.791741622390495e-06, + "loss": 1.5624, + "step": 16982 + }, + { + "epoch": 2.13, + "grad_norm": 4.034121990203857, + "learning_rate": 5.790904907333807e-06, + "loss": 0.2669, + "step": 16983 + }, + { + "epoch": 2.13, + "grad_norm": 10.858423233032227, + "learning_rate": 5.790068192277121e-06, + "loss": 1.2583, + "step": 16984 + }, + { + "epoch": 2.13, + "grad_norm": 57.927940368652344, + "learning_rate": 5.789231477220433e-06, + "loss": 3.5892, + "step": 16985 + }, + { + "epoch": 2.13, + "grad_norm": 5.054805755615234, + "learning_rate": 5.788394762163746e-06, + "loss": 0.3993, + "step": 16986 + }, + { + "epoch": 2.13, + "grad_norm": 4.532777786254883, + "learning_rate": 5.7875580471070584e-06, + "loss": 0.5755, + "step": 16987 + }, + { + "epoch": 2.13, + "grad_norm": 25.14957046508789, + "learning_rate": 5.786721332050371e-06, + "loss": 2.0354, + "step": 16988 + }, + { + "epoch": 2.13, + "grad_norm": 24.648008346557617, + "learning_rate": 5.785884616993683e-06, + "loss": 0.8927, + "step": 16989 + }, + { + "epoch": 2.13, + "grad_norm": 19.69295310974121, + "learning_rate": 5.785047901936995e-06, + "loss": 0.7736, + "step": 16990 + }, + { + "epoch": 2.13, + "grad_norm": 3.650480031967163, + "learning_rate": 5.784211186880309e-06, + "loss": 0.1339, + "step": 16991 + }, + { + "epoch": 2.13, + "grad_norm": 8.340292930603027, + "learning_rate": 5.783374471823621e-06, + "loss": 1.4557, + "step": 16992 + }, + { + "epoch": 2.13, + "grad_norm": 8.41778564453125, + "learning_rate": 5.7825377567669336e-06, + "loss": 0.9915, + "step": 16993 + }, + { + "epoch": 2.13, + "grad_norm": 8.60283374786377, + "learning_rate": 5.7817010417102455e-06, + "loss": 0.5233, + "step": 16994 + }, + { + "epoch": 2.13, + "grad_norm": 90.2171859741211, + "learning_rate": 5.780864326653559e-06, + "loss": 1.8594, + "step": 16995 + }, + { + "epoch": 2.13, + "grad_norm": 14.10474967956543, + "learning_rate": 5.780027611596871e-06, + "loss": 0.4432, + "step": 16996 + }, + { + "epoch": 2.13, + "grad_norm": 26.008697509765625, + "learning_rate": 5.779190896540183e-06, + "loss": 1.4974, + "step": 16997 + }, + { + "epoch": 2.13, + "grad_norm": 15.756891250610352, + "learning_rate": 5.778354181483497e-06, + "loss": 0.9927, + "step": 16998 + }, + { + "epoch": 2.13, + "grad_norm": 17.05091667175293, + "learning_rate": 5.777517466426809e-06, + "loss": 2.1283, + "step": 16999 + }, + { + "epoch": 2.13, + "grad_norm": 4.916499137878418, + "learning_rate": 5.7766807513701215e-06, + "loss": 0.4318, + "step": 17000 + }, + { + "epoch": 2.13, + "grad_norm": 42.44375228881836, + "learning_rate": 5.7758440363134334e-06, + "loss": 1.2217, + "step": 17001 + }, + { + "epoch": 2.13, + "grad_norm": 15.128802299499512, + "learning_rate": 5.775007321256747e-06, + "loss": 0.6954, + "step": 17002 + }, + { + "epoch": 2.13, + "grad_norm": 30.88568878173828, + "learning_rate": 5.774170606200059e-06, + "loss": 1.633, + "step": 17003 + }, + { + "epoch": 2.13, + "grad_norm": 8.705619812011719, + "learning_rate": 5.773333891143371e-06, + "loss": 0.8603, + "step": 17004 + }, + { + "epoch": 2.13, + "grad_norm": 22.377500534057617, + "learning_rate": 5.772497176086685e-06, + "loss": 1.6253, + "step": 17005 + }, + { + "epoch": 2.13, + "grad_norm": 6.745636940002441, + "learning_rate": 5.771660461029997e-06, + "loss": 0.2458, + "step": 17006 + }, + { + "epoch": 2.13, + "grad_norm": 13.070367813110352, + "learning_rate": 5.770823745973309e-06, + "loss": 1.3823, + "step": 17007 + }, + { + "epoch": 2.13, + "grad_norm": 19.84786033630371, + "learning_rate": 5.769987030916621e-06, + "loss": 0.6723, + "step": 17008 + }, + { + "epoch": 2.13, + "grad_norm": 10.7432861328125, + "learning_rate": 5.769150315859935e-06, + "loss": 0.399, + "step": 17009 + }, + { + "epoch": 2.13, + "grad_norm": 27.663814544677734, + "learning_rate": 5.768313600803247e-06, + "loss": 1.1698, + "step": 17010 + }, + { + "epoch": 2.13, + "grad_norm": 17.132997512817383, + "learning_rate": 5.767476885746559e-06, + "loss": 0.4798, + "step": 17011 + }, + { + "epoch": 2.13, + "grad_norm": 3.635779857635498, + "learning_rate": 5.7666401706898726e-06, + "loss": 0.28, + "step": 17012 + }, + { + "epoch": 2.14, + "grad_norm": 9.170980453491211, + "learning_rate": 5.7658034556331845e-06, + "loss": 0.5628, + "step": 17013 + }, + { + "epoch": 2.14, + "grad_norm": 11.8899564743042, + "learning_rate": 5.764966740576497e-06, + "loss": 0.4594, + "step": 17014 + }, + { + "epoch": 2.14, + "grad_norm": 10.861407279968262, + "learning_rate": 5.764130025519809e-06, + "loss": 2.4474, + "step": 17015 + }, + { + "epoch": 2.14, + "grad_norm": 28.021129608154297, + "learning_rate": 5.763293310463123e-06, + "loss": 0.9954, + "step": 17016 + }, + { + "epoch": 2.14, + "grad_norm": 10.897455215454102, + "learning_rate": 5.762456595406435e-06, + "loss": 0.5923, + "step": 17017 + }, + { + "epoch": 2.14, + "grad_norm": 17.00180435180664, + "learning_rate": 5.761619880349747e-06, + "loss": 1.063, + "step": 17018 + }, + { + "epoch": 2.14, + "grad_norm": 15.839303016662598, + "learning_rate": 5.7607831652930605e-06, + "loss": 0.5052, + "step": 17019 + }, + { + "epoch": 2.14, + "grad_norm": 426.7574462890625, + "learning_rate": 5.7599464502363724e-06, + "loss": 3.619, + "step": 17020 + }, + { + "epoch": 2.14, + "grad_norm": 7.838033199310303, + "learning_rate": 5.759109735179685e-06, + "loss": 0.7132, + "step": 17021 + }, + { + "epoch": 2.14, + "grad_norm": 5.230981826782227, + "learning_rate": 5.758273020122997e-06, + "loss": 0.2577, + "step": 17022 + }, + { + "epoch": 2.14, + "grad_norm": 43.44022750854492, + "learning_rate": 5.757436305066311e-06, + "loss": 1.3863, + "step": 17023 + }, + { + "epoch": 2.14, + "grad_norm": 3.766050100326538, + "learning_rate": 5.756599590009623e-06, + "loss": 1.1447, + "step": 17024 + }, + { + "epoch": 2.14, + "grad_norm": 13.0640230178833, + "learning_rate": 5.755762874952935e-06, + "loss": 1.0206, + "step": 17025 + }, + { + "epoch": 2.14, + "grad_norm": 21.132444381713867, + "learning_rate": 5.754926159896248e-06, + "loss": 0.5415, + "step": 17026 + }, + { + "epoch": 2.14, + "grad_norm": 87.92571258544922, + "learning_rate": 5.75408944483956e-06, + "loss": 2.4722, + "step": 17027 + }, + { + "epoch": 2.14, + "grad_norm": 14.061578750610352, + "learning_rate": 5.753252729782873e-06, + "loss": 1.9392, + "step": 17028 + }, + { + "epoch": 2.14, + "grad_norm": 12.163267135620117, + "learning_rate": 5.752416014726185e-06, + "loss": 0.4628, + "step": 17029 + }, + { + "epoch": 2.14, + "grad_norm": 15.391785621643066, + "learning_rate": 5.751579299669499e-06, + "loss": 0.8887, + "step": 17030 + }, + { + "epoch": 2.14, + "grad_norm": 15.223180770874023, + "learning_rate": 5.750742584612811e-06, + "loss": 0.4689, + "step": 17031 + }, + { + "epoch": 2.14, + "grad_norm": 9.978029251098633, + "learning_rate": 5.749905869556123e-06, + "loss": 0.4122, + "step": 17032 + }, + { + "epoch": 2.14, + "grad_norm": 13.698570251464844, + "learning_rate": 5.749069154499436e-06, + "loss": 0.5871, + "step": 17033 + }, + { + "epoch": 2.14, + "grad_norm": 188.66851806640625, + "learning_rate": 5.748232439442748e-06, + "loss": 1.9903, + "step": 17034 + }, + { + "epoch": 2.14, + "grad_norm": 20.050073623657227, + "learning_rate": 5.747395724386061e-06, + "loss": 0.6087, + "step": 17035 + }, + { + "epoch": 2.14, + "grad_norm": 23.30031394958496, + "learning_rate": 5.746559009329373e-06, + "loss": 2.1471, + "step": 17036 + }, + { + "epoch": 2.14, + "grad_norm": 77.1574935913086, + "learning_rate": 5.745722294272685e-06, + "loss": 0.9452, + "step": 17037 + }, + { + "epoch": 2.14, + "grad_norm": 17.38646697998047, + "learning_rate": 5.744885579215999e-06, + "loss": 1.301, + "step": 17038 + }, + { + "epoch": 2.14, + "grad_norm": 20.22818946838379, + "learning_rate": 5.744048864159311e-06, + "loss": 1.7813, + "step": 17039 + }, + { + "epoch": 2.14, + "grad_norm": 9.979143142700195, + "learning_rate": 5.743212149102624e-06, + "loss": 1.2639, + "step": 17040 + }, + { + "epoch": 2.14, + "grad_norm": 11.882335662841797, + "learning_rate": 5.742375434045936e-06, + "loss": 0.9601, + "step": 17041 + }, + { + "epoch": 2.14, + "grad_norm": 2.124966859817505, + "learning_rate": 5.741538718989249e-06, + "loss": 0.0498, + "step": 17042 + }, + { + "epoch": 2.14, + "grad_norm": 7.987300395965576, + "learning_rate": 5.740702003932561e-06, + "loss": 0.4838, + "step": 17043 + }, + { + "epoch": 2.14, + "grad_norm": 8.915894508361816, + "learning_rate": 5.739865288875873e-06, + "loss": 0.2809, + "step": 17044 + }, + { + "epoch": 2.14, + "grad_norm": 239.8693389892578, + "learning_rate": 5.7390285738191865e-06, + "loss": 1.3925, + "step": 17045 + }, + { + "epoch": 2.14, + "grad_norm": 13.353658676147461, + "learning_rate": 5.7381918587624985e-06, + "loss": 1.0435, + "step": 17046 + }, + { + "epoch": 2.14, + "grad_norm": 12.716243743896484, + "learning_rate": 5.737355143705811e-06, + "loss": 0.68, + "step": 17047 + }, + { + "epoch": 2.14, + "grad_norm": 21.51469612121582, + "learning_rate": 5.736518428649124e-06, + "loss": 1.1867, + "step": 17048 + }, + { + "epoch": 2.14, + "grad_norm": 56.70457077026367, + "learning_rate": 5.735681713592437e-06, + "loss": 2.0627, + "step": 17049 + }, + { + "epoch": 2.14, + "grad_norm": 23.16199493408203, + "learning_rate": 5.734844998535749e-06, + "loss": 0.5439, + "step": 17050 + }, + { + "epoch": 2.14, + "grad_norm": 71.07728576660156, + "learning_rate": 5.734008283479061e-06, + "loss": 2.885, + "step": 17051 + }, + { + "epoch": 2.14, + "grad_norm": 110.95845794677734, + "learning_rate": 5.7331715684223745e-06, + "loss": 1.0208, + "step": 17052 + }, + { + "epoch": 2.14, + "grad_norm": 5.101598739624023, + "learning_rate": 5.732334853365686e-06, + "loss": 0.2971, + "step": 17053 + }, + { + "epoch": 2.14, + "grad_norm": 14.872244834899902, + "learning_rate": 5.731498138308999e-06, + "loss": 0.692, + "step": 17054 + }, + { + "epoch": 2.14, + "grad_norm": 76.74128723144531, + "learning_rate": 5.730661423252312e-06, + "loss": 1.8122, + "step": 17055 + }, + { + "epoch": 2.14, + "grad_norm": 12.32666301727295, + "learning_rate": 5.729824708195625e-06, + "loss": 1.6365, + "step": 17056 + }, + { + "epoch": 2.14, + "grad_norm": 24.103534698486328, + "learning_rate": 5.728987993138937e-06, + "loss": 0.9405, + "step": 17057 + }, + { + "epoch": 2.14, + "grad_norm": 5.0472235679626465, + "learning_rate": 5.728151278082249e-06, + "loss": 0.1581, + "step": 17058 + }, + { + "epoch": 2.14, + "grad_norm": 13.476837158203125, + "learning_rate": 5.727314563025562e-06, + "loss": 1.5368, + "step": 17059 + }, + { + "epoch": 2.14, + "grad_norm": 13.281248092651367, + "learning_rate": 5.726477847968874e-06, + "loss": 0.9484, + "step": 17060 + }, + { + "epoch": 2.14, + "grad_norm": 8.271032333374023, + "learning_rate": 5.725641132912187e-06, + "loss": 0.3287, + "step": 17061 + }, + { + "epoch": 2.14, + "grad_norm": 42.25758743286133, + "learning_rate": 5.7248044178555e-06, + "loss": 2.0096, + "step": 17062 + }, + { + "epoch": 2.14, + "grad_norm": 11.208483695983887, + "learning_rate": 5.723967702798813e-06, + "loss": 0.7167, + "step": 17063 + }, + { + "epoch": 2.14, + "grad_norm": 2.0486040115356445, + "learning_rate": 5.723130987742125e-06, + "loss": 0.0548, + "step": 17064 + }, + { + "epoch": 2.14, + "grad_norm": 6.347630500793457, + "learning_rate": 5.722294272685437e-06, + "loss": 1.5378, + "step": 17065 + }, + { + "epoch": 2.14, + "grad_norm": 27.25133514404297, + "learning_rate": 5.72145755762875e-06, + "loss": 1.6141, + "step": 17066 + }, + { + "epoch": 2.14, + "grad_norm": 22.502073287963867, + "learning_rate": 5.720620842572062e-06, + "loss": 1.139, + "step": 17067 + }, + { + "epoch": 2.14, + "grad_norm": 16.46257209777832, + "learning_rate": 5.719784127515375e-06, + "loss": 0.9123, + "step": 17068 + }, + { + "epoch": 2.14, + "grad_norm": 23.177297592163086, + "learning_rate": 5.718947412458688e-06, + "loss": 0.9117, + "step": 17069 + }, + { + "epoch": 2.14, + "grad_norm": 19.314239501953125, + "learning_rate": 5.718110697402001e-06, + "loss": 1.452, + "step": 17070 + }, + { + "epoch": 2.14, + "grad_norm": 6.970067024230957, + "learning_rate": 5.717273982345313e-06, + "loss": 0.3654, + "step": 17071 + }, + { + "epoch": 2.14, + "grad_norm": 35.67385482788086, + "learning_rate": 5.7164372672886246e-06, + "loss": 0.9929, + "step": 17072 + }, + { + "epoch": 2.14, + "grad_norm": 18.922805786132812, + "learning_rate": 5.715600552231938e-06, + "loss": 0.4126, + "step": 17073 + }, + { + "epoch": 2.14, + "grad_norm": 33.534156799316406, + "learning_rate": 5.71476383717525e-06, + "loss": 0.8119, + "step": 17074 + }, + { + "epoch": 2.14, + "grad_norm": 54.796119689941406, + "learning_rate": 5.713927122118563e-06, + "loss": 1.3977, + "step": 17075 + }, + { + "epoch": 2.14, + "grad_norm": 13.54931354522705, + "learning_rate": 5.713090407061875e-06, + "loss": 0.8326, + "step": 17076 + }, + { + "epoch": 2.14, + "grad_norm": 10.145544052124023, + "learning_rate": 5.712253692005189e-06, + "loss": 0.9113, + "step": 17077 + }, + { + "epoch": 2.14, + "grad_norm": 29.392499923706055, + "learning_rate": 5.7114169769485005e-06, + "loss": 1.2521, + "step": 17078 + }, + { + "epoch": 2.14, + "grad_norm": 6.5416717529296875, + "learning_rate": 5.7105802618918125e-06, + "loss": 0.8656, + "step": 17079 + }, + { + "epoch": 2.14, + "grad_norm": 9.528194427490234, + "learning_rate": 5.709743546835126e-06, + "loss": 0.581, + "step": 17080 + }, + { + "epoch": 2.14, + "grad_norm": 18.567523956298828, + "learning_rate": 5.708906831778438e-06, + "loss": 0.8522, + "step": 17081 + }, + { + "epoch": 2.14, + "grad_norm": 46.99943923950195, + "learning_rate": 5.708070116721751e-06, + "loss": 0.7091, + "step": 17082 + }, + { + "epoch": 2.14, + "grad_norm": 12.262152671813965, + "learning_rate": 5.707233401665063e-06, + "loss": 0.6784, + "step": 17083 + }, + { + "epoch": 2.14, + "grad_norm": 7.790750503540039, + "learning_rate": 5.7063966866083765e-06, + "loss": 1.1704, + "step": 17084 + }, + { + "epoch": 2.14, + "grad_norm": 28.657535552978516, + "learning_rate": 5.7055599715516885e-06, + "loss": 1.029, + "step": 17085 + }, + { + "epoch": 2.14, + "grad_norm": 15.705316543579102, + "learning_rate": 5.704723256495e-06, + "loss": 0.5097, + "step": 17086 + }, + { + "epoch": 2.14, + "grad_norm": 2.930361032485962, + "learning_rate": 5.703886541438314e-06, + "loss": 0.2663, + "step": 17087 + }, + { + "epoch": 2.14, + "grad_norm": 7.7618608474731445, + "learning_rate": 5.703049826381626e-06, + "loss": 0.8066, + "step": 17088 + }, + { + "epoch": 2.14, + "grad_norm": 12.091531753540039, + "learning_rate": 5.702213111324939e-06, + "loss": 0.8412, + "step": 17089 + }, + { + "epoch": 2.14, + "grad_norm": 24.859027862548828, + "learning_rate": 5.701376396268251e-06, + "loss": 1.9814, + "step": 17090 + }, + { + "epoch": 2.14, + "grad_norm": 15.993396759033203, + "learning_rate": 5.700539681211564e-06, + "loss": 0.7045, + "step": 17091 + }, + { + "epoch": 2.15, + "grad_norm": 42.554256439208984, + "learning_rate": 5.699702966154876e-06, + "loss": 1.4175, + "step": 17092 + }, + { + "epoch": 2.15, + "grad_norm": 16.24115753173828, + "learning_rate": 5.698866251098188e-06, + "loss": 0.5687, + "step": 17093 + }, + { + "epoch": 2.15, + "grad_norm": 3.3872950077056885, + "learning_rate": 5.698029536041502e-06, + "loss": 0.1129, + "step": 17094 + }, + { + "epoch": 2.15, + "grad_norm": 10.936417579650879, + "learning_rate": 5.697192820984814e-06, + "loss": 0.5392, + "step": 17095 + }, + { + "epoch": 2.15, + "grad_norm": 30.877866744995117, + "learning_rate": 5.696356105928127e-06, + "loss": 1.763, + "step": 17096 + }, + { + "epoch": 2.15, + "grad_norm": 7.140477657318115, + "learning_rate": 5.695519390871439e-06, + "loss": 0.8474, + "step": 17097 + }, + { + "epoch": 2.15, + "grad_norm": 28.2568359375, + "learning_rate": 5.694682675814752e-06, + "loss": 1.3709, + "step": 17098 + }, + { + "epoch": 2.15, + "grad_norm": 14.172971725463867, + "learning_rate": 5.693845960758064e-06, + "loss": 1.1605, + "step": 17099 + }, + { + "epoch": 2.15, + "grad_norm": 10.639191627502441, + "learning_rate": 5.693009245701376e-06, + "loss": 1.0971, + "step": 17100 + }, + { + "epoch": 2.15, + "grad_norm": 6.369710445404053, + "learning_rate": 5.69217253064469e-06, + "loss": 0.2289, + "step": 17101 + }, + { + "epoch": 2.15, + "grad_norm": 11.544483184814453, + "learning_rate": 5.691335815588002e-06, + "loss": 1.5321, + "step": 17102 + }, + { + "epoch": 2.15, + "grad_norm": 11.986268043518066, + "learning_rate": 5.690499100531315e-06, + "loss": 0.9125, + "step": 17103 + }, + { + "epoch": 2.15, + "grad_norm": 21.226198196411133, + "learning_rate": 5.689662385474627e-06, + "loss": 1.0037, + "step": 17104 + }, + { + "epoch": 2.15, + "grad_norm": 14.601465225219727, + "learning_rate": 5.68882567041794e-06, + "loss": 0.6547, + "step": 17105 + }, + { + "epoch": 2.15, + "grad_norm": 6.2679009437561035, + "learning_rate": 5.687988955361252e-06, + "loss": 0.3051, + "step": 17106 + }, + { + "epoch": 2.15, + "grad_norm": 101.8865737915039, + "learning_rate": 5.687152240304564e-06, + "loss": 1.8805, + "step": 17107 + }, + { + "epoch": 2.15, + "grad_norm": 14.7133150100708, + "learning_rate": 5.686315525247878e-06, + "loss": 0.8835, + "step": 17108 + }, + { + "epoch": 2.15, + "grad_norm": 10.83020305633545, + "learning_rate": 5.68547881019119e-06, + "loss": 0.7251, + "step": 17109 + }, + { + "epoch": 2.15, + "grad_norm": 287.1812438964844, + "learning_rate": 5.6846420951345026e-06, + "loss": 2.1167, + "step": 17110 + }, + { + "epoch": 2.15, + "grad_norm": 6.349530220031738, + "learning_rate": 5.6838053800778145e-06, + "loss": 0.516, + "step": 17111 + }, + { + "epoch": 2.15, + "grad_norm": 13.912372589111328, + "learning_rate": 5.682968665021128e-06, + "loss": 0.9926, + "step": 17112 + }, + { + "epoch": 2.15, + "grad_norm": 17.72711944580078, + "learning_rate": 5.68213194996444e-06, + "loss": 1.1207, + "step": 17113 + }, + { + "epoch": 2.15, + "grad_norm": 22.058942794799805, + "learning_rate": 5.681295234907752e-06, + "loss": 1.7815, + "step": 17114 + }, + { + "epoch": 2.15, + "grad_norm": 4.502975940704346, + "learning_rate": 5.680458519851066e-06, + "loss": 0.1729, + "step": 17115 + }, + { + "epoch": 2.15, + "grad_norm": 5.920248985290527, + "learning_rate": 5.679621804794378e-06, + "loss": 0.261, + "step": 17116 + }, + { + "epoch": 2.15, + "grad_norm": 24.765287399291992, + "learning_rate": 5.6787850897376905e-06, + "loss": 1.0169, + "step": 17117 + }, + { + "epoch": 2.15, + "grad_norm": 24.200904846191406, + "learning_rate": 5.6779483746810024e-06, + "loss": 1.4074, + "step": 17118 + }, + { + "epoch": 2.15, + "grad_norm": 19.04697608947754, + "learning_rate": 5.677111659624316e-06, + "loss": 0.7482, + "step": 17119 + }, + { + "epoch": 2.15, + "grad_norm": 24.989364624023438, + "learning_rate": 5.676274944567628e-06, + "loss": 1.2341, + "step": 17120 + }, + { + "epoch": 2.15, + "grad_norm": 38.26375961303711, + "learning_rate": 5.67543822951094e-06, + "loss": 1.3172, + "step": 17121 + }, + { + "epoch": 2.15, + "grad_norm": 22.313762664794922, + "learning_rate": 5.674601514454253e-06, + "loss": 0.889, + "step": 17122 + }, + { + "epoch": 2.15, + "grad_norm": 16.008052825927734, + "learning_rate": 5.673764799397566e-06, + "loss": 1.0246, + "step": 17123 + }, + { + "epoch": 2.15, + "grad_norm": 19.361263275146484, + "learning_rate": 5.672928084340878e-06, + "loss": 0.898, + "step": 17124 + }, + { + "epoch": 2.15, + "grad_norm": 22.887283325195312, + "learning_rate": 5.67209136928419e-06, + "loss": 1.3125, + "step": 17125 + }, + { + "epoch": 2.15, + "grad_norm": 76.33627319335938, + "learning_rate": 5.671254654227504e-06, + "loss": 2.9896, + "step": 17126 + }, + { + "epoch": 2.15, + "grad_norm": 8.603480339050293, + "learning_rate": 5.670417939170816e-06, + "loss": 0.8129, + "step": 17127 + }, + { + "epoch": 2.15, + "grad_norm": 1.5203114748001099, + "learning_rate": 5.669581224114128e-06, + "loss": 0.0294, + "step": 17128 + }, + { + "epoch": 2.15, + "grad_norm": 11.878643035888672, + "learning_rate": 5.668744509057441e-06, + "loss": 2.1819, + "step": 17129 + }, + { + "epoch": 2.15, + "grad_norm": 8.10049057006836, + "learning_rate": 5.6679077940007535e-06, + "loss": 1.4186, + "step": 17130 + }, + { + "epoch": 2.15, + "grad_norm": 8.806018829345703, + "learning_rate": 5.667071078944066e-06, + "loss": 0.2814, + "step": 17131 + }, + { + "epoch": 2.15, + "grad_norm": 14.122865676879883, + "learning_rate": 5.666234363887378e-06, + "loss": 2.386, + "step": 17132 + }, + { + "epoch": 2.15, + "grad_norm": 28.183515548706055, + "learning_rate": 5.665397648830692e-06, + "loss": 1.6092, + "step": 17133 + }, + { + "epoch": 2.15, + "grad_norm": 26.284934997558594, + "learning_rate": 5.664560933774004e-06, + "loss": 0.4936, + "step": 17134 + }, + { + "epoch": 2.15, + "grad_norm": 11.293306350708008, + "learning_rate": 5.663724218717316e-06, + "loss": 0.8182, + "step": 17135 + }, + { + "epoch": 2.15, + "grad_norm": 7.348964214324951, + "learning_rate": 5.662887503660629e-06, + "loss": 0.8782, + "step": 17136 + }, + { + "epoch": 2.15, + "grad_norm": 25.33659553527832, + "learning_rate": 5.6620507886039414e-06, + "loss": 0.9993, + "step": 17137 + }, + { + "epoch": 2.15, + "grad_norm": 27.249738693237305, + "learning_rate": 5.661214073547254e-06, + "loss": 0.1993, + "step": 17138 + }, + { + "epoch": 2.15, + "grad_norm": 3.9141499996185303, + "learning_rate": 5.660377358490566e-06, + "loss": 1.3535, + "step": 17139 + }, + { + "epoch": 2.15, + "grad_norm": 20.09246826171875, + "learning_rate": 5.65954064343388e-06, + "loss": 0.9172, + "step": 17140 + }, + { + "epoch": 2.15, + "grad_norm": 23.448715209960938, + "learning_rate": 5.658703928377192e-06, + "loss": 0.6893, + "step": 17141 + }, + { + "epoch": 2.15, + "grad_norm": 11.247332572937012, + "learning_rate": 5.657867213320504e-06, + "loss": 0.3365, + "step": 17142 + }, + { + "epoch": 2.15, + "grad_norm": 21.369384765625, + "learning_rate": 5.6570304982638166e-06, + "loss": 1.2574, + "step": 17143 + }, + { + "epoch": 2.15, + "grad_norm": 21.29892349243164, + "learning_rate": 5.656193783207129e-06, + "loss": 0.6715, + "step": 17144 + }, + { + "epoch": 2.15, + "grad_norm": 13.750541687011719, + "learning_rate": 5.655357068150442e-06, + "loss": 1.0391, + "step": 17145 + }, + { + "epoch": 2.15, + "grad_norm": 10.424272537231445, + "learning_rate": 5.654520353093754e-06, + "loss": 1.0453, + "step": 17146 + }, + { + "epoch": 2.15, + "grad_norm": 27.254045486450195, + "learning_rate": 5.653683638037068e-06, + "loss": 3.0949, + "step": 17147 + }, + { + "epoch": 2.15, + "grad_norm": 14.978679656982422, + "learning_rate": 5.65284692298038e-06, + "loss": 0.7781, + "step": 17148 + }, + { + "epoch": 2.15, + "grad_norm": 11.079421997070312, + "learning_rate": 5.652010207923692e-06, + "loss": 0.5054, + "step": 17149 + }, + { + "epoch": 2.15, + "grad_norm": 106.8862075805664, + "learning_rate": 5.6511734928670045e-06, + "loss": 2.5155, + "step": 17150 + }, + { + "epoch": 2.15, + "grad_norm": 27.096290588378906, + "learning_rate": 5.650336777810317e-06, + "loss": 0.6781, + "step": 17151 + }, + { + "epoch": 2.15, + "grad_norm": 16.702634811401367, + "learning_rate": 5.64950006275363e-06, + "loss": 0.9907, + "step": 17152 + }, + { + "epoch": 2.15, + "grad_norm": 15.890658378601074, + "learning_rate": 5.648663347696942e-06, + "loss": 0.7768, + "step": 17153 + }, + { + "epoch": 2.15, + "grad_norm": 14.421810150146484, + "learning_rate": 5.647826632640256e-06, + "loss": 0.846, + "step": 17154 + }, + { + "epoch": 2.15, + "grad_norm": 8.938249588012695, + "learning_rate": 5.646989917583568e-06, + "loss": 0.4093, + "step": 17155 + }, + { + "epoch": 2.15, + "grad_norm": 84.1568374633789, + "learning_rate": 5.64615320252688e-06, + "loss": 1.7657, + "step": 17156 + }, + { + "epoch": 2.15, + "grad_norm": 6.703531742095947, + "learning_rate": 5.645316487470192e-06, + "loss": 0.3169, + "step": 17157 + }, + { + "epoch": 2.15, + "grad_norm": 24.118864059448242, + "learning_rate": 5.644479772413504e-06, + "loss": 0.5048, + "step": 17158 + }, + { + "epoch": 2.15, + "grad_norm": 18.195344924926758, + "learning_rate": 5.643643057356818e-06, + "loss": 1.7614, + "step": 17159 + }, + { + "epoch": 2.15, + "grad_norm": 40.21952438354492, + "learning_rate": 5.64280634230013e-06, + "loss": 0.9294, + "step": 17160 + }, + { + "epoch": 2.15, + "grad_norm": 16.129159927368164, + "learning_rate": 5.641969627243444e-06, + "loss": 0.6936, + "step": 17161 + }, + { + "epoch": 2.15, + "grad_norm": 14.008243560791016, + "learning_rate": 5.6411329121867556e-06, + "loss": 1.0347, + "step": 17162 + }, + { + "epoch": 2.15, + "grad_norm": 10.446682929992676, + "learning_rate": 5.6402961971300675e-06, + "loss": 0.3626, + "step": 17163 + }, + { + "epoch": 2.15, + "grad_norm": 36.29135513305664, + "learning_rate": 5.63945948207338e-06, + "loss": 1.0444, + "step": 17164 + }, + { + "epoch": 2.15, + "grad_norm": 15.276895523071289, + "learning_rate": 5.638622767016692e-06, + "loss": 1.23, + "step": 17165 + }, + { + "epoch": 2.15, + "grad_norm": 17.33664321899414, + "learning_rate": 5.637786051960006e-06, + "loss": 0.7524, + "step": 17166 + }, + { + "epoch": 2.15, + "grad_norm": 12.592365264892578, + "learning_rate": 5.636949336903318e-06, + "loss": 0.2947, + "step": 17167 + }, + { + "epoch": 2.15, + "grad_norm": 21.139570236206055, + "learning_rate": 5.6361126218466315e-06, + "loss": 1.3471, + "step": 17168 + }, + { + "epoch": 2.15, + "grad_norm": 14.71281623840332, + "learning_rate": 5.6352759067899435e-06, + "loss": 1.2693, + "step": 17169 + }, + { + "epoch": 2.15, + "grad_norm": 4.269510269165039, + "learning_rate": 5.6344391917332554e-06, + "loss": 0.3714, + "step": 17170 + }, + { + "epoch": 2.15, + "grad_norm": 18.213468551635742, + "learning_rate": 5.633602476676568e-06, + "loss": 0.9179, + "step": 17171 + }, + { + "epoch": 2.16, + "grad_norm": 87.11209869384766, + "learning_rate": 5.63276576161988e-06, + "loss": 3.7237, + "step": 17172 + }, + { + "epoch": 2.16, + "grad_norm": 11.030548095703125, + "learning_rate": 5.631929046563194e-06, + "loss": 1.569, + "step": 17173 + }, + { + "epoch": 2.16, + "grad_norm": 17.495464324951172, + "learning_rate": 5.631092331506506e-06, + "loss": 0.5882, + "step": 17174 + }, + { + "epoch": 2.16, + "grad_norm": 25.288028717041016, + "learning_rate": 5.630255616449819e-06, + "loss": 0.7862, + "step": 17175 + }, + { + "epoch": 2.16, + "grad_norm": 17.80689239501953, + "learning_rate": 5.629418901393131e-06, + "loss": 1.5696, + "step": 17176 + }, + { + "epoch": 2.16, + "grad_norm": 12.747581481933594, + "learning_rate": 5.628582186336443e-06, + "loss": 1.9478, + "step": 17177 + }, + { + "epoch": 2.16, + "grad_norm": 18.356338500976562, + "learning_rate": 5.627745471279756e-06, + "loss": 2.3311, + "step": 17178 + }, + { + "epoch": 2.16, + "grad_norm": 13.22213077545166, + "learning_rate": 5.626908756223068e-06, + "loss": 1.0209, + "step": 17179 + }, + { + "epoch": 2.16, + "grad_norm": 27.774328231811523, + "learning_rate": 5.626072041166382e-06, + "loss": 0.961, + "step": 17180 + }, + { + "epoch": 2.16, + "grad_norm": 41.390079498291016, + "learning_rate": 5.625235326109694e-06, + "loss": 1.0548, + "step": 17181 + }, + { + "epoch": 2.16, + "grad_norm": 10.147668838500977, + "learning_rate": 5.6243986110530065e-06, + "loss": 0.8387, + "step": 17182 + }, + { + "epoch": 2.16, + "grad_norm": 6.634181022644043, + "learning_rate": 5.623561895996319e-06, + "loss": 0.5521, + "step": 17183 + }, + { + "epoch": 2.16, + "grad_norm": 13.955337524414062, + "learning_rate": 5.622725180939631e-06, + "loss": 1.1473, + "step": 17184 + }, + { + "epoch": 2.16, + "grad_norm": 7.0592360496521, + "learning_rate": 5.621888465882944e-06, + "loss": 0.7489, + "step": 17185 + }, + { + "epoch": 2.16, + "grad_norm": 16.544206619262695, + "learning_rate": 5.621051750826256e-06, + "loss": 2.2619, + "step": 17186 + }, + { + "epoch": 2.16, + "grad_norm": 24.937488555908203, + "learning_rate": 5.62021503576957e-06, + "loss": 0.9906, + "step": 17187 + }, + { + "epoch": 2.16, + "grad_norm": 51.04780960083008, + "learning_rate": 5.619378320712882e-06, + "loss": 1.937, + "step": 17188 + }, + { + "epoch": 2.16, + "grad_norm": 13.549792289733887, + "learning_rate": 5.6185416056561944e-06, + "loss": 1.9272, + "step": 17189 + }, + { + "epoch": 2.16, + "grad_norm": 13.317730903625488, + "learning_rate": 5.617704890599507e-06, + "loss": 0.5433, + "step": 17190 + }, + { + "epoch": 2.16, + "grad_norm": 5.867985248565674, + "learning_rate": 5.616868175542819e-06, + "loss": 0.8244, + "step": 17191 + }, + { + "epoch": 2.16, + "grad_norm": 17.57651710510254, + "learning_rate": 5.616031460486132e-06, + "loss": 2.1001, + "step": 17192 + }, + { + "epoch": 2.16, + "grad_norm": 7.705465793609619, + "learning_rate": 5.615194745429444e-06, + "loss": 0.5644, + "step": 17193 + }, + { + "epoch": 2.16, + "grad_norm": 142.6583251953125, + "learning_rate": 5.614358030372758e-06, + "loss": 1.5906, + "step": 17194 + }, + { + "epoch": 2.16, + "grad_norm": 13.02302074432373, + "learning_rate": 5.6135213153160695e-06, + "loss": 0.8627, + "step": 17195 + }, + { + "epoch": 2.16, + "grad_norm": 12.016786575317383, + "learning_rate": 5.612684600259382e-06, + "loss": 0.4129, + "step": 17196 + }, + { + "epoch": 2.16, + "grad_norm": 168.90357971191406, + "learning_rate": 5.611847885202695e-06, + "loss": 1.415, + "step": 17197 + }, + { + "epoch": 2.16, + "grad_norm": 13.086783409118652, + "learning_rate": 5.611011170146007e-06, + "loss": 0.6233, + "step": 17198 + }, + { + "epoch": 2.16, + "grad_norm": 11.101323127746582, + "learning_rate": 5.61017445508932e-06, + "loss": 0.536, + "step": 17199 + }, + { + "epoch": 2.16, + "grad_norm": 12.286422729492188, + "learning_rate": 5.609337740032632e-06, + "loss": 0.1976, + "step": 17200 + }, + { + "epoch": 2.16, + "eval_loss": 0.0822504535317421, + "eval_runtime": 94.4976, + "eval_samples_per_second": 37.482, + "eval_steps_per_second": 37.482, + "step": 17200 + }, + { + "epoch": 2.16, + "grad_norm": 3.7932496070861816, + "learning_rate": 5.6085010249759455e-06, + "loss": 0.0928, + "step": 17201 + }, + { + "epoch": 2.16, + "grad_norm": 0.5041267275810242, + "learning_rate": 5.6076643099192575e-06, + "loss": 0.0213, + "step": 17202 + }, + { + "epoch": 2.16, + "grad_norm": 16.08133888244629, + "learning_rate": 5.60682759486257e-06, + "loss": 1.0277, + "step": 17203 + }, + { + "epoch": 2.16, + "grad_norm": 21.73778533935547, + "learning_rate": 5.605990879805882e-06, + "loss": 0.7484, + "step": 17204 + }, + { + "epoch": 2.16, + "grad_norm": 3.9961888790130615, + "learning_rate": 5.605154164749195e-06, + "loss": 1.2175, + "step": 17205 + }, + { + "epoch": 2.16, + "grad_norm": 5.718421936035156, + "learning_rate": 5.604317449692508e-06, + "loss": 0.3244, + "step": 17206 + }, + { + "epoch": 2.16, + "grad_norm": 12.62595272064209, + "learning_rate": 5.60348073463582e-06, + "loss": 0.6958, + "step": 17207 + }, + { + "epoch": 2.16, + "grad_norm": 10.217425346374512, + "learning_rate": 5.6026440195791334e-06, + "loss": 1.3796, + "step": 17208 + }, + { + "epoch": 2.16, + "grad_norm": 13.226073265075684, + "learning_rate": 5.601807304522445e-06, + "loss": 0.8957, + "step": 17209 + }, + { + "epoch": 2.16, + "grad_norm": 30.363792419433594, + "learning_rate": 5.600970589465758e-06, + "loss": 2.2451, + "step": 17210 + }, + { + "epoch": 2.16, + "grad_norm": 37.012794494628906, + "learning_rate": 5.60013387440907e-06, + "loss": 1.0801, + "step": 17211 + }, + { + "epoch": 2.16, + "grad_norm": 60.06344223022461, + "learning_rate": 5.599297159352383e-06, + "loss": 1.6978, + "step": 17212 + }, + { + "epoch": 2.16, + "grad_norm": 14.471539497375488, + "learning_rate": 5.598460444295696e-06, + "loss": 1.596, + "step": 17213 + }, + { + "epoch": 2.16, + "grad_norm": 7.093859672546387, + "learning_rate": 5.597623729239008e-06, + "loss": 0.2557, + "step": 17214 + }, + { + "epoch": 2.16, + "grad_norm": 8.392356872558594, + "learning_rate": 5.596787014182321e-06, + "loss": 0.5757, + "step": 17215 + }, + { + "epoch": 2.16, + "grad_norm": 28.59241485595703, + "learning_rate": 5.595950299125633e-06, + "loss": 1.7856, + "step": 17216 + }, + { + "epoch": 2.16, + "grad_norm": 10.944947242736816, + "learning_rate": 5.595113584068945e-06, + "loss": 0.5933, + "step": 17217 + }, + { + "epoch": 2.16, + "grad_norm": 15.047503471374512, + "learning_rate": 5.594276869012258e-06, + "loss": 1.4373, + "step": 17218 + }, + { + "epoch": 2.16, + "grad_norm": 3.249769687652588, + "learning_rate": 5.593440153955571e-06, + "loss": 0.2011, + "step": 17219 + }, + { + "epoch": 2.16, + "grad_norm": 5.669090747833252, + "learning_rate": 5.592603438898884e-06, + "loss": 0.322, + "step": 17220 + }, + { + "epoch": 2.16, + "grad_norm": 24.267330169677734, + "learning_rate": 5.591766723842196e-06, + "loss": 0.2359, + "step": 17221 + }, + { + "epoch": 2.16, + "grad_norm": 10.642965316772461, + "learning_rate": 5.590930008785509e-06, + "loss": 0.4666, + "step": 17222 + }, + { + "epoch": 2.16, + "grad_norm": 9.365653991699219, + "learning_rate": 5.590093293728821e-06, + "loss": 0.564, + "step": 17223 + }, + { + "epoch": 2.16, + "grad_norm": 7.013499736785889, + "learning_rate": 5.589256578672133e-06, + "loss": 0.869, + "step": 17224 + }, + { + "epoch": 2.16, + "grad_norm": 14.563632011413574, + "learning_rate": 5.588419863615446e-06, + "loss": 0.8464, + "step": 17225 + }, + { + "epoch": 2.16, + "grad_norm": 9.125322341918945, + "learning_rate": 5.587583148558759e-06, + "loss": 0.4213, + "step": 17226 + }, + { + "epoch": 2.16, + "grad_norm": 3.3125064373016357, + "learning_rate": 5.586746433502072e-06, + "loss": 0.207, + "step": 17227 + }, + { + "epoch": 2.16, + "grad_norm": 27.964567184448242, + "learning_rate": 5.5859097184453835e-06, + "loss": 1.5827, + "step": 17228 + }, + { + "epoch": 2.16, + "grad_norm": 19.723154067993164, + "learning_rate": 5.585073003388697e-06, + "loss": 0.8522, + "step": 17229 + }, + { + "epoch": 2.16, + "grad_norm": 26.846416473388672, + "learning_rate": 5.584236288332009e-06, + "loss": 0.9918, + "step": 17230 + }, + { + "epoch": 2.16, + "grad_norm": 11.809608459472656, + "learning_rate": 5.583399573275321e-06, + "loss": 0.2451, + "step": 17231 + }, + { + "epoch": 2.16, + "grad_norm": 11.435138702392578, + "learning_rate": 5.582562858218634e-06, + "loss": 0.4537, + "step": 17232 + }, + { + "epoch": 2.16, + "grad_norm": 31.543373107910156, + "learning_rate": 5.581726143161947e-06, + "loss": 0.5097, + "step": 17233 + }, + { + "epoch": 2.16, + "grad_norm": 28.25628662109375, + "learning_rate": 5.5808894281052595e-06, + "loss": 1.1043, + "step": 17234 + }, + { + "epoch": 2.16, + "grad_norm": 14.437735557556152, + "learning_rate": 5.5800527130485715e-06, + "loss": 0.6009, + "step": 17235 + }, + { + "epoch": 2.16, + "grad_norm": 14.932991027832031, + "learning_rate": 5.579215997991885e-06, + "loss": 0.5173, + "step": 17236 + }, + { + "epoch": 2.16, + "grad_norm": 11.054858207702637, + "learning_rate": 5.578379282935197e-06, + "loss": 1.342, + "step": 17237 + }, + { + "epoch": 2.16, + "grad_norm": 103.68341064453125, + "learning_rate": 5.577542567878509e-06, + "loss": 2.8162, + "step": 17238 + }, + { + "epoch": 2.16, + "grad_norm": 15.472814559936523, + "learning_rate": 5.576705852821822e-06, + "loss": 0.7346, + "step": 17239 + }, + { + "epoch": 2.16, + "grad_norm": 13.45470905303955, + "learning_rate": 5.575869137765134e-06, + "loss": 0.7036, + "step": 17240 + }, + { + "epoch": 2.16, + "grad_norm": 13.154472351074219, + "learning_rate": 5.575032422708447e-06, + "loss": 0.6328, + "step": 17241 + }, + { + "epoch": 2.16, + "grad_norm": 8.994661331176758, + "learning_rate": 5.574195707651759e-06, + "loss": 1.4348, + "step": 17242 + }, + { + "epoch": 2.16, + "grad_norm": 15.41472053527832, + "learning_rate": 5.573358992595073e-06, + "loss": 1.0158, + "step": 17243 + }, + { + "epoch": 2.16, + "grad_norm": 9.832098007202148, + "learning_rate": 5.572522277538385e-06, + "loss": 1.2464, + "step": 17244 + }, + { + "epoch": 2.16, + "grad_norm": 5.16098165512085, + "learning_rate": 5.571685562481697e-06, + "loss": 0.83, + "step": 17245 + }, + { + "epoch": 2.16, + "grad_norm": 18.821264266967773, + "learning_rate": 5.57084884742501e-06, + "loss": 0.6399, + "step": 17246 + }, + { + "epoch": 2.16, + "grad_norm": 27.851112365722656, + "learning_rate": 5.570012132368322e-06, + "loss": 1.2835, + "step": 17247 + }, + { + "epoch": 2.16, + "grad_norm": 6.089595794677734, + "learning_rate": 5.569175417311635e-06, + "loss": 0.1637, + "step": 17248 + }, + { + "epoch": 2.16, + "grad_norm": 16.148115158081055, + "learning_rate": 5.568338702254947e-06, + "loss": 1.0256, + "step": 17249 + }, + { + "epoch": 2.16, + "grad_norm": 12.849231719970703, + "learning_rate": 5.567501987198261e-06, + "loss": 0.2856, + "step": 17250 + }, + { + "epoch": 2.16, + "grad_norm": 12.162538528442383, + "learning_rate": 5.566665272141573e-06, + "loss": 1.8148, + "step": 17251 + }, + { + "epoch": 2.17, + "grad_norm": 22.895511627197266, + "learning_rate": 5.565828557084885e-06, + "loss": 1.0507, + "step": 17252 + }, + { + "epoch": 2.17, + "grad_norm": 96.24250030517578, + "learning_rate": 5.564991842028198e-06, + "loss": 1.7526, + "step": 17253 + }, + { + "epoch": 2.17, + "grad_norm": 18.24812126159668, + "learning_rate": 5.56415512697151e-06, + "loss": 1.153, + "step": 17254 + }, + { + "epoch": 2.17, + "grad_norm": 9.374454498291016, + "learning_rate": 5.563318411914823e-06, + "loss": 0.4764, + "step": 17255 + }, + { + "epoch": 2.17, + "grad_norm": 17.744022369384766, + "learning_rate": 5.562481696858135e-06, + "loss": 1.8648, + "step": 17256 + }, + { + "epoch": 2.17, + "grad_norm": 17.501155853271484, + "learning_rate": 5.561644981801448e-06, + "loss": 1.7762, + "step": 17257 + }, + { + "epoch": 2.17, + "grad_norm": 36.74434280395508, + "learning_rate": 5.560808266744761e-06, + "loss": 1.3477, + "step": 17258 + }, + { + "epoch": 2.17, + "grad_norm": 53.19886779785156, + "learning_rate": 5.559971551688073e-06, + "loss": 1.1854, + "step": 17259 + }, + { + "epoch": 2.17, + "grad_norm": 19.726200103759766, + "learning_rate": 5.5591348366313856e-06, + "loss": 1.6473, + "step": 17260 + }, + { + "epoch": 2.17, + "grad_norm": 10.460951805114746, + "learning_rate": 5.5582981215746975e-06, + "loss": 0.827, + "step": 17261 + }, + { + "epoch": 2.17, + "grad_norm": 8.402612686157227, + "learning_rate": 5.557461406518011e-06, + "loss": 0.9577, + "step": 17262 + }, + { + "epoch": 2.17, + "grad_norm": 51.408695220947266, + "learning_rate": 5.556624691461323e-06, + "loss": 2.2755, + "step": 17263 + }, + { + "epoch": 2.17, + "grad_norm": 11.783794403076172, + "learning_rate": 5.555787976404636e-06, + "loss": 1.1531, + "step": 17264 + }, + { + "epoch": 2.17, + "grad_norm": 19.813913345336914, + "learning_rate": 5.554951261347949e-06, + "loss": 1.2154, + "step": 17265 + }, + { + "epoch": 2.17, + "grad_norm": 9.960100173950195, + "learning_rate": 5.554114546291261e-06, + "loss": 0.9847, + "step": 17266 + }, + { + "epoch": 2.17, + "grad_norm": 1.53121817111969, + "learning_rate": 5.5532778312345735e-06, + "loss": 0.1649, + "step": 17267 + }, + { + "epoch": 2.17, + "grad_norm": 16.12785530090332, + "learning_rate": 5.5524411161778854e-06, + "loss": 0.6851, + "step": 17268 + }, + { + "epoch": 2.17, + "grad_norm": 6.951560020446777, + "learning_rate": 5.551604401121199e-06, + "loss": 0.2635, + "step": 17269 + }, + { + "epoch": 2.17, + "grad_norm": 48.68133544921875, + "learning_rate": 5.550767686064511e-06, + "loss": 0.7961, + "step": 17270 + }, + { + "epoch": 2.17, + "grad_norm": 13.477615356445312, + "learning_rate": 5.549930971007824e-06, + "loss": 0.4971, + "step": 17271 + }, + { + "epoch": 2.17, + "grad_norm": 24.526586532592773, + "learning_rate": 5.549094255951137e-06, + "loss": 1.4679, + "step": 17272 + }, + { + "epoch": 2.17, + "grad_norm": 29.198915481567383, + "learning_rate": 5.548257540894449e-06, + "loss": 1.934, + "step": 17273 + }, + { + "epoch": 2.17, + "grad_norm": 20.456193923950195, + "learning_rate": 5.547420825837761e-06, + "loss": 1.0854, + "step": 17274 + }, + { + "epoch": 2.17, + "grad_norm": 12.702293395996094, + "learning_rate": 5.546584110781073e-06, + "loss": 1.004, + "step": 17275 + }, + { + "epoch": 2.17, + "grad_norm": 13.598925590515137, + "learning_rate": 5.545747395724387e-06, + "loss": 0.5497, + "step": 17276 + }, + { + "epoch": 2.17, + "grad_norm": 14.103580474853516, + "learning_rate": 5.544910680667699e-06, + "loss": 1.1607, + "step": 17277 + }, + { + "epoch": 2.17, + "grad_norm": 16.528247833251953, + "learning_rate": 5.544073965611012e-06, + "loss": 0.8895, + "step": 17278 + }, + { + "epoch": 2.17, + "grad_norm": 14.104270935058594, + "learning_rate": 5.5432372505543246e-06, + "loss": 0.6044, + "step": 17279 + }, + { + "epoch": 2.17, + "grad_norm": 17.39735984802246, + "learning_rate": 5.5424005354976365e-06, + "loss": 0.4925, + "step": 17280 + }, + { + "epoch": 2.17, + "grad_norm": 40.398956298828125, + "learning_rate": 5.541563820440949e-06, + "loss": 1.3486, + "step": 17281 + }, + { + "epoch": 2.17, + "grad_norm": 10.429101943969727, + "learning_rate": 5.540727105384261e-06, + "loss": 1.1617, + "step": 17282 + }, + { + "epoch": 2.17, + "grad_norm": 50.06691360473633, + "learning_rate": 5.539890390327575e-06, + "loss": 4.6265, + "step": 17283 + }, + { + "epoch": 2.17, + "grad_norm": 8.183061599731445, + "learning_rate": 5.539053675270887e-06, + "loss": 0.5522, + "step": 17284 + }, + { + "epoch": 2.17, + "grad_norm": 25.18187141418457, + "learning_rate": 5.5382169602142e-06, + "loss": 1.5083, + "step": 17285 + }, + { + "epoch": 2.17, + "grad_norm": 15.119938850402832, + "learning_rate": 5.537380245157512e-06, + "loss": 1.7149, + "step": 17286 + }, + { + "epoch": 2.17, + "grad_norm": 17.999061584472656, + "learning_rate": 5.5365435301008244e-06, + "loss": 1.1504, + "step": 17287 + }, + { + "epoch": 2.17, + "grad_norm": 12.407395362854004, + "learning_rate": 5.535706815044137e-06, + "loss": 0.9227, + "step": 17288 + }, + { + "epoch": 2.17, + "grad_norm": 5.739668369293213, + "learning_rate": 5.534870099987449e-06, + "loss": 1.2882, + "step": 17289 + }, + { + "epoch": 2.17, + "grad_norm": 14.490335464477539, + "learning_rate": 5.534033384930763e-06, + "loss": 0.5932, + "step": 17290 + }, + { + "epoch": 2.17, + "grad_norm": 13.677287101745605, + "learning_rate": 5.533196669874075e-06, + "loss": 0.4711, + "step": 17291 + }, + { + "epoch": 2.17, + "grad_norm": 229.60195922851562, + "learning_rate": 5.532359954817388e-06, + "loss": 2.2706, + "step": 17292 + }, + { + "epoch": 2.17, + "grad_norm": 64.56965637207031, + "learning_rate": 5.5315232397606996e-06, + "loss": 1.1746, + "step": 17293 + }, + { + "epoch": 2.17, + "grad_norm": 11.953279495239258, + "learning_rate": 5.530686524704012e-06, + "loss": 0.78, + "step": 17294 + }, + { + "epoch": 2.17, + "grad_norm": 14.518604278564453, + "learning_rate": 5.529849809647325e-06, + "loss": 0.371, + "step": 17295 + }, + { + "epoch": 2.17, + "grad_norm": 12.54118537902832, + "learning_rate": 5.529013094590637e-06, + "loss": 1.3565, + "step": 17296 + }, + { + "epoch": 2.17, + "grad_norm": 6.692623615264893, + "learning_rate": 5.528176379533951e-06, + "loss": 0.305, + "step": 17297 + }, + { + "epoch": 2.17, + "grad_norm": 9.086627960205078, + "learning_rate": 5.527339664477263e-06, + "loss": 0.0801, + "step": 17298 + }, + { + "epoch": 2.17, + "grad_norm": 23.496551513671875, + "learning_rate": 5.5265029494205755e-06, + "loss": 0.6855, + "step": 17299 + }, + { + "epoch": 2.17, + "grad_norm": 20.957077026367188, + "learning_rate": 5.5256662343638875e-06, + "loss": 0.8958, + "step": 17300 + }, + { + "epoch": 2.17, + "grad_norm": 42.834800720214844, + "learning_rate": 5.5248295193072e-06, + "loss": 2.9654, + "step": 17301 + }, + { + "epoch": 2.17, + "grad_norm": 19.34640884399414, + "learning_rate": 5.523992804250513e-06, + "loss": 0.8642, + "step": 17302 + }, + { + "epoch": 2.17, + "grad_norm": 8.827037811279297, + "learning_rate": 5.523156089193825e-06, + "loss": 0.4299, + "step": 17303 + }, + { + "epoch": 2.17, + "grad_norm": 10.81065559387207, + "learning_rate": 5.522319374137139e-06, + "loss": 0.3272, + "step": 17304 + }, + { + "epoch": 2.17, + "grad_norm": 11.338500022888184, + "learning_rate": 5.521482659080451e-06, + "loss": 0.2851, + "step": 17305 + }, + { + "epoch": 2.17, + "grad_norm": 10.40788745880127, + "learning_rate": 5.5206459440237634e-06, + "loss": 0.6665, + "step": 17306 + }, + { + "epoch": 2.17, + "grad_norm": 128.7415008544922, + "learning_rate": 5.519809228967075e-06, + "loss": 0.7273, + "step": 17307 + }, + { + "epoch": 2.17, + "grad_norm": 5.43458890914917, + "learning_rate": 5.518972513910388e-06, + "loss": 0.0861, + "step": 17308 + }, + { + "epoch": 2.17, + "grad_norm": 17.122840881347656, + "learning_rate": 5.518135798853701e-06, + "loss": 1.4625, + "step": 17309 + }, + { + "epoch": 2.17, + "grad_norm": 12.97182846069336, + "learning_rate": 5.517299083797013e-06, + "loss": 0.6766, + "step": 17310 + }, + { + "epoch": 2.17, + "grad_norm": 9.539743423461914, + "learning_rate": 5.516462368740327e-06, + "loss": 1.0235, + "step": 17311 + }, + { + "epoch": 2.17, + "grad_norm": 13.798757553100586, + "learning_rate": 5.5156256536836386e-06, + "loss": 0.4069, + "step": 17312 + }, + { + "epoch": 2.17, + "grad_norm": 9.476639747619629, + "learning_rate": 5.514788938626951e-06, + "loss": 1.3685, + "step": 17313 + }, + { + "epoch": 2.17, + "grad_norm": 4.449814796447754, + "learning_rate": 5.513952223570263e-06, + "loss": 0.1509, + "step": 17314 + }, + { + "epoch": 2.17, + "grad_norm": 11.458866119384766, + "learning_rate": 5.513115508513575e-06, + "loss": 1.1127, + "step": 17315 + }, + { + "epoch": 2.17, + "grad_norm": 86.14989471435547, + "learning_rate": 5.512278793456889e-06, + "loss": 0.8236, + "step": 17316 + }, + { + "epoch": 2.17, + "grad_norm": 74.04230499267578, + "learning_rate": 5.511442078400201e-06, + "loss": 1.3761, + "step": 17317 + }, + { + "epoch": 2.17, + "grad_norm": 15.484674453735352, + "learning_rate": 5.5106053633435145e-06, + "loss": 0.9134, + "step": 17318 + }, + { + "epoch": 2.17, + "grad_norm": 6.356900215148926, + "learning_rate": 5.5097686482868265e-06, + "loss": 0.3291, + "step": 17319 + }, + { + "epoch": 2.17, + "grad_norm": 6.78397274017334, + "learning_rate": 5.508931933230139e-06, + "loss": 0.1679, + "step": 17320 + }, + { + "epoch": 2.17, + "grad_norm": 33.1566047668457, + "learning_rate": 5.508095218173451e-06, + "loss": 0.9174, + "step": 17321 + }, + { + "epoch": 2.17, + "grad_norm": 11.877245903015137, + "learning_rate": 5.507258503116763e-06, + "loss": 0.6691, + "step": 17322 + }, + { + "epoch": 2.17, + "grad_norm": 321.0672607421875, + "learning_rate": 5.506421788060077e-06, + "loss": 2.0765, + "step": 17323 + }, + { + "epoch": 2.17, + "grad_norm": 7.010623455047607, + "learning_rate": 5.505585073003389e-06, + "loss": 0.6944, + "step": 17324 + }, + { + "epoch": 2.17, + "grad_norm": 31.49948501586914, + "learning_rate": 5.5047483579467024e-06, + "loss": 0.5722, + "step": 17325 + }, + { + "epoch": 2.17, + "grad_norm": 9.569120407104492, + "learning_rate": 5.503911642890014e-06, + "loss": 0.3115, + "step": 17326 + }, + { + "epoch": 2.17, + "grad_norm": 17.101133346557617, + "learning_rate": 5.503074927833327e-06, + "loss": 0.7327, + "step": 17327 + }, + { + "epoch": 2.17, + "grad_norm": 34.0203971862793, + "learning_rate": 5.502238212776639e-06, + "loss": 2.2211, + "step": 17328 + }, + { + "epoch": 2.17, + "grad_norm": 61.76817321777344, + "learning_rate": 5.501401497719951e-06, + "loss": 1.3252, + "step": 17329 + }, + { + "epoch": 2.17, + "grad_norm": 3.967968702316284, + "learning_rate": 5.500564782663265e-06, + "loss": 0.3913, + "step": 17330 + }, + { + "epoch": 2.18, + "grad_norm": 8.685111045837402, + "learning_rate": 5.499728067606577e-06, + "loss": 0.3842, + "step": 17331 + }, + { + "epoch": 2.18, + "grad_norm": 9.978775978088379, + "learning_rate": 5.4988913525498895e-06, + "loss": 1.1715, + "step": 17332 + }, + { + "epoch": 2.18, + "grad_norm": 8.293410301208496, + "learning_rate": 5.498054637493202e-06, + "loss": 0.1429, + "step": 17333 + }, + { + "epoch": 2.18, + "grad_norm": 27.3328800201416, + "learning_rate": 5.497217922436515e-06, + "loss": 0.9815, + "step": 17334 + }, + { + "epoch": 2.18, + "grad_norm": 13.956252098083496, + "learning_rate": 5.496381207379827e-06, + "loss": 0.8331, + "step": 17335 + }, + { + "epoch": 2.18, + "grad_norm": 52.58840560913086, + "learning_rate": 5.495544492323139e-06, + "loss": 1.2963, + "step": 17336 + }, + { + "epoch": 2.18, + "grad_norm": 11.63193416595459, + "learning_rate": 5.494707777266453e-06, + "loss": 0.4902, + "step": 17337 + }, + { + "epoch": 2.18, + "grad_norm": 12.473057746887207, + "learning_rate": 5.493871062209765e-06, + "loss": 1.2166, + "step": 17338 + }, + { + "epoch": 2.18, + "grad_norm": 14.674174308776855, + "learning_rate": 5.4930343471530774e-06, + "loss": 1.8472, + "step": 17339 + }, + { + "epoch": 2.18, + "grad_norm": 5.343213081359863, + "learning_rate": 5.49219763209639e-06, + "loss": 0.3098, + "step": 17340 + }, + { + "epoch": 2.18, + "grad_norm": 16.50578498840332, + "learning_rate": 5.491360917039703e-06, + "loss": 0.3704, + "step": 17341 + }, + { + "epoch": 2.18, + "grad_norm": 11.50082015991211, + "learning_rate": 5.490524201983015e-06, + "loss": 0.9004, + "step": 17342 + }, + { + "epoch": 2.18, + "grad_norm": 9.233156204223633, + "learning_rate": 5.489687486926327e-06, + "loss": 0.4855, + "step": 17343 + }, + { + "epoch": 2.18, + "grad_norm": 17.008365631103516, + "learning_rate": 5.488850771869641e-06, + "loss": 0.5787, + "step": 17344 + }, + { + "epoch": 2.18, + "grad_norm": 16.309478759765625, + "learning_rate": 5.4880140568129525e-06, + "loss": 0.9927, + "step": 17345 + }, + { + "epoch": 2.18, + "grad_norm": 9.710187911987305, + "learning_rate": 5.487177341756265e-06, + "loss": 1.9782, + "step": 17346 + }, + { + "epoch": 2.18, + "grad_norm": 6.9360270500183105, + "learning_rate": 5.486340626699578e-06, + "loss": 0.4631, + "step": 17347 + }, + { + "epoch": 2.18, + "grad_norm": 16.3384952545166, + "learning_rate": 5.485503911642891e-06, + "loss": 1.7151, + "step": 17348 + }, + { + "epoch": 2.18, + "grad_norm": 8.947735786437988, + "learning_rate": 5.484667196586203e-06, + "loss": 1.7752, + "step": 17349 + }, + { + "epoch": 2.18, + "grad_norm": 6.5993499755859375, + "learning_rate": 5.483830481529515e-06, + "loss": 0.6455, + "step": 17350 + }, + { + "epoch": 2.18, + "grad_norm": 3.6965012550354004, + "learning_rate": 5.4829937664728285e-06, + "loss": 0.8186, + "step": 17351 + }, + { + "epoch": 2.18, + "grad_norm": 8.805519104003906, + "learning_rate": 5.4821570514161405e-06, + "loss": 0.3716, + "step": 17352 + }, + { + "epoch": 2.18, + "grad_norm": 11.217713356018066, + "learning_rate": 5.481320336359453e-06, + "loss": 1.1276, + "step": 17353 + }, + { + "epoch": 2.18, + "grad_norm": 37.654449462890625, + "learning_rate": 5.480483621302766e-06, + "loss": 1.6106, + "step": 17354 + }, + { + "epoch": 2.18, + "grad_norm": 55.929996490478516, + "learning_rate": 5.479646906246079e-06, + "loss": 2.1045, + "step": 17355 + }, + { + "epoch": 2.18, + "grad_norm": 4.489141464233398, + "learning_rate": 5.478810191189391e-06, + "loss": 0.2208, + "step": 17356 + }, + { + "epoch": 2.18, + "grad_norm": 12.1909761428833, + "learning_rate": 5.477973476132703e-06, + "loss": 1.5717, + "step": 17357 + }, + { + "epoch": 2.18, + "grad_norm": 50.50084686279297, + "learning_rate": 5.4771367610760164e-06, + "loss": 2.6516, + "step": 17358 + }, + { + "epoch": 2.18, + "grad_norm": 13.93696117401123, + "learning_rate": 5.476300046019328e-06, + "loss": 1.3454, + "step": 17359 + }, + { + "epoch": 2.18, + "grad_norm": 115.9477767944336, + "learning_rate": 5.475463330962641e-06, + "loss": 1.1183, + "step": 17360 + }, + { + "epoch": 2.18, + "grad_norm": 13.79046630859375, + "learning_rate": 5.474626615905954e-06, + "loss": 0.9186, + "step": 17361 + }, + { + "epoch": 2.18, + "grad_norm": 9.239429473876953, + "learning_rate": 5.473789900849267e-06, + "loss": 1.5267, + "step": 17362 + }, + { + "epoch": 2.18, + "grad_norm": 17.92961311340332, + "learning_rate": 5.472953185792579e-06, + "loss": 0.2789, + "step": 17363 + }, + { + "epoch": 2.18, + "grad_norm": 23.084224700927734, + "learning_rate": 5.472116470735891e-06, + "loss": 2.3164, + "step": 17364 + }, + { + "epoch": 2.18, + "grad_norm": 12.765448570251465, + "learning_rate": 5.471279755679204e-06, + "loss": 0.9762, + "step": 17365 + }, + { + "epoch": 2.18, + "grad_norm": 23.09999656677246, + "learning_rate": 5.470443040622516e-06, + "loss": 1.0109, + "step": 17366 + }, + { + "epoch": 2.18, + "grad_norm": 51.35440444946289, + "learning_rate": 5.469606325565829e-06, + "loss": 1.6294, + "step": 17367 + }, + { + "epoch": 2.18, + "grad_norm": 15.2879056930542, + "learning_rate": 5.468769610509141e-06, + "loss": 1.4256, + "step": 17368 + }, + { + "epoch": 2.18, + "grad_norm": 43.24333190917969, + "learning_rate": 5.467932895452455e-06, + "loss": 1.2839, + "step": 17369 + }, + { + "epoch": 2.18, + "grad_norm": 5.293132781982422, + "learning_rate": 5.467096180395767e-06, + "loss": 0.2765, + "step": 17370 + }, + { + "epoch": 2.18, + "grad_norm": 14.49522590637207, + "learning_rate": 5.466259465339079e-06, + "loss": 1.0694, + "step": 17371 + }, + { + "epoch": 2.18, + "grad_norm": 11.335780143737793, + "learning_rate": 5.465422750282392e-06, + "loss": 1.1652, + "step": 17372 + }, + { + "epoch": 2.18, + "grad_norm": 3.1716794967651367, + "learning_rate": 5.464586035225704e-06, + "loss": 0.0202, + "step": 17373 + }, + { + "epoch": 2.18, + "grad_norm": 4.579696178436279, + "learning_rate": 5.463749320169017e-06, + "loss": 0.9418, + "step": 17374 + }, + { + "epoch": 2.18, + "grad_norm": 21.71388816833496, + "learning_rate": 5.462912605112329e-06, + "loss": 0.6225, + "step": 17375 + }, + { + "epoch": 2.18, + "grad_norm": 13.785713195800781, + "learning_rate": 5.462075890055643e-06, + "loss": 0.7838, + "step": 17376 + }, + { + "epoch": 2.18, + "grad_norm": 10.466874122619629, + "learning_rate": 5.461239174998955e-06, + "loss": 0.9145, + "step": 17377 + }, + { + "epoch": 2.18, + "grad_norm": 21.933841705322266, + "learning_rate": 5.4604024599422665e-06, + "loss": 0.9588, + "step": 17378 + }, + { + "epoch": 2.18, + "grad_norm": 17.930667877197266, + "learning_rate": 5.45956574488558e-06, + "loss": 1.2568, + "step": 17379 + }, + { + "epoch": 2.18, + "grad_norm": 15.766815185546875, + "learning_rate": 5.458729029828892e-06, + "loss": 0.9218, + "step": 17380 + }, + { + "epoch": 2.18, + "grad_norm": 20.991121292114258, + "learning_rate": 5.457892314772205e-06, + "loss": 1.9213, + "step": 17381 + }, + { + "epoch": 2.18, + "grad_norm": 71.60786437988281, + "learning_rate": 5.457055599715517e-06, + "loss": 1.5991, + "step": 17382 + }, + { + "epoch": 2.18, + "grad_norm": 5.674159526824951, + "learning_rate": 5.4562188846588305e-06, + "loss": 0.277, + "step": 17383 + }, + { + "epoch": 2.18, + "grad_norm": 4.543520450592041, + "learning_rate": 5.4553821696021425e-06, + "loss": 0.0572, + "step": 17384 + }, + { + "epoch": 2.18, + "grad_norm": 10.390192031860352, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.2012, + "step": 17385 + }, + { + "epoch": 2.18, + "grad_norm": 20.592607498168945, + "learning_rate": 5.453708739488768e-06, + "loss": 1.5867, + "step": 17386 + }, + { + "epoch": 2.18, + "grad_norm": 9.117387771606445, + "learning_rate": 5.45287202443208e-06, + "loss": 0.5777, + "step": 17387 + }, + { + "epoch": 2.18, + "grad_norm": 14.631564140319824, + "learning_rate": 5.452035309375393e-06, + "loss": 1.2054, + "step": 17388 + }, + { + "epoch": 2.18, + "grad_norm": 9.758777618408203, + "learning_rate": 5.451198594318705e-06, + "loss": 0.2893, + "step": 17389 + }, + { + "epoch": 2.18, + "grad_norm": 17.93175506591797, + "learning_rate": 5.4503618792620185e-06, + "loss": 0.7353, + "step": 17390 + }, + { + "epoch": 2.18, + "grad_norm": 18.551061630249023, + "learning_rate": 5.44952516420533e-06, + "loss": 0.6877, + "step": 17391 + }, + { + "epoch": 2.18, + "grad_norm": 24.413114547729492, + "learning_rate": 5.448688449148642e-06, + "loss": 0.6687, + "step": 17392 + }, + { + "epoch": 2.18, + "grad_norm": 3.8335249423980713, + "learning_rate": 5.447851734091956e-06, + "loss": 0.2867, + "step": 17393 + }, + { + "epoch": 2.18, + "grad_norm": 7.5022807121276855, + "learning_rate": 5.447015019035268e-06, + "loss": 0.4303, + "step": 17394 + }, + { + "epoch": 2.18, + "grad_norm": 10.752012252807617, + "learning_rate": 5.446178303978581e-06, + "loss": 1.0519, + "step": 17395 + }, + { + "epoch": 2.18, + "grad_norm": 29.8737850189209, + "learning_rate": 5.445341588921893e-06, + "loss": 1.8048, + "step": 17396 + }, + { + "epoch": 2.18, + "grad_norm": 31.07291603088379, + "learning_rate": 5.444504873865205e-06, + "loss": 1.9485, + "step": 17397 + }, + { + "epoch": 2.18, + "grad_norm": 122.77910614013672, + "learning_rate": 5.443668158808518e-06, + "loss": 1.9164, + "step": 17398 + }, + { + "epoch": 2.18, + "grad_norm": 17.61583709716797, + "learning_rate": 5.44283144375183e-06, + "loss": 1.5101, + "step": 17399 + }, + { + "epoch": 2.18, + "grad_norm": 19.916318893432617, + "learning_rate": 5.441994728695144e-06, + "loss": 1.4863, + "step": 17400 + }, + { + "epoch": 2.18, + "grad_norm": 14.747823715209961, + "learning_rate": 5.441158013638456e-06, + "loss": 0.4247, + "step": 17401 + }, + { + "epoch": 2.18, + "grad_norm": 30.967308044433594, + "learning_rate": 5.440321298581769e-06, + "loss": 3.5009, + "step": 17402 + }, + { + "epoch": 2.18, + "grad_norm": 6.669802188873291, + "learning_rate": 5.439484583525081e-06, + "loss": 1.0505, + "step": 17403 + }, + { + "epoch": 2.18, + "grad_norm": 14.642730712890625, + "learning_rate": 5.438647868468393e-06, + "loss": 0.7301, + "step": 17404 + }, + { + "epoch": 2.18, + "grad_norm": 10.343039512634277, + "learning_rate": 5.437811153411706e-06, + "loss": 0.3258, + "step": 17405 + }, + { + "epoch": 2.18, + "grad_norm": 21.614656448364258, + "learning_rate": 5.436974438355018e-06, + "loss": 1.0137, + "step": 17406 + }, + { + "epoch": 2.18, + "grad_norm": 9.189238548278809, + "learning_rate": 5.436137723298332e-06, + "loss": 2.0193, + "step": 17407 + }, + { + "epoch": 2.18, + "grad_norm": 6.540914535522461, + "learning_rate": 5.435301008241644e-06, + "loss": 0.9241, + "step": 17408 + }, + { + "epoch": 2.18, + "grad_norm": 15.18000316619873, + "learning_rate": 5.434464293184957e-06, + "loss": 0.7895, + "step": 17409 + }, + { + "epoch": 2.18, + "grad_norm": 26.84891128540039, + "learning_rate": 5.4336275781282686e-06, + "loss": 1.0056, + "step": 17410 + }, + { + "epoch": 2.19, + "grad_norm": 6.153435707092285, + "learning_rate": 5.4327908630715805e-06, + "loss": 1.6114, + "step": 17411 + }, + { + "epoch": 2.19, + "grad_norm": 11.218341827392578, + "learning_rate": 5.431954148014894e-06, + "loss": 1.0399, + "step": 17412 + }, + { + "epoch": 2.19, + "grad_norm": 13.591835975646973, + "learning_rate": 5.431117432958206e-06, + "loss": 0.7951, + "step": 17413 + }, + { + "epoch": 2.19, + "grad_norm": 15.405440330505371, + "learning_rate": 5.430280717901519e-06, + "loss": 0.9816, + "step": 17414 + }, + { + "epoch": 2.19, + "grad_norm": 20.23955726623535, + "learning_rate": 5.429444002844832e-06, + "loss": 0.9346, + "step": 17415 + }, + { + "epoch": 2.19, + "grad_norm": 9.765039443969727, + "learning_rate": 5.4286072877881445e-06, + "loss": 1.1874, + "step": 17416 + }, + { + "epoch": 2.19, + "grad_norm": 39.12038803100586, + "learning_rate": 5.4277705727314565e-06, + "loss": 1.787, + "step": 17417 + }, + { + "epoch": 2.19, + "grad_norm": 7.175165176391602, + "learning_rate": 5.4269338576747684e-06, + "loss": 0.451, + "step": 17418 + }, + { + "epoch": 2.19, + "grad_norm": 12.465749740600586, + "learning_rate": 5.426097142618082e-06, + "loss": 0.9885, + "step": 17419 + }, + { + "epoch": 2.19, + "grad_norm": 6.003023624420166, + "learning_rate": 5.425260427561394e-06, + "loss": 1.206, + "step": 17420 + }, + { + "epoch": 2.19, + "grad_norm": 29.761905670166016, + "learning_rate": 5.424423712504707e-06, + "loss": 1.4167, + "step": 17421 + }, + { + "epoch": 2.19, + "grad_norm": 13.826894760131836, + "learning_rate": 5.42358699744802e-06, + "loss": 0.4789, + "step": 17422 + }, + { + "epoch": 2.19, + "grad_norm": 4.056317329406738, + "learning_rate": 5.4227502823913325e-06, + "loss": 0.2607, + "step": 17423 + }, + { + "epoch": 2.19, + "grad_norm": 76.80038452148438, + "learning_rate": 5.421913567334644e-06, + "loss": 2.3984, + "step": 17424 + }, + { + "epoch": 2.19, + "grad_norm": 6.621973037719727, + "learning_rate": 5.421076852277956e-06, + "loss": 0.7779, + "step": 17425 + }, + { + "epoch": 2.19, + "grad_norm": 12.780646324157715, + "learning_rate": 5.42024013722127e-06, + "loss": 0.6478, + "step": 17426 + }, + { + "epoch": 2.19, + "grad_norm": 25.02349281311035, + "learning_rate": 5.419403422164582e-06, + "loss": 1.3122, + "step": 17427 + }, + { + "epoch": 2.19, + "grad_norm": 21.12056541442871, + "learning_rate": 5.418566707107895e-06, + "loss": 0.5195, + "step": 17428 + }, + { + "epoch": 2.19, + "grad_norm": 14.527588844299316, + "learning_rate": 5.4177299920512076e-06, + "loss": 0.8056, + "step": 17429 + }, + { + "epoch": 2.19, + "grad_norm": 28.591569900512695, + "learning_rate": 5.41689327699452e-06, + "loss": 1.4283, + "step": 17430 + }, + { + "epoch": 2.19, + "grad_norm": 11.90042495727539, + "learning_rate": 5.416056561937832e-06, + "loss": 0.9807, + "step": 17431 + }, + { + "epoch": 2.19, + "grad_norm": 26.2853946685791, + "learning_rate": 5.415219846881144e-06, + "loss": 0.7286, + "step": 17432 + }, + { + "epoch": 2.19, + "grad_norm": 8.017561912536621, + "learning_rate": 5.414383131824458e-06, + "loss": 0.5506, + "step": 17433 + }, + { + "epoch": 2.19, + "grad_norm": 13.73519515991211, + "learning_rate": 5.41354641676777e-06, + "loss": 0.7781, + "step": 17434 + }, + { + "epoch": 2.19, + "grad_norm": 13.892803192138672, + "learning_rate": 5.412709701711083e-06, + "loss": 0.9187, + "step": 17435 + }, + { + "epoch": 2.19, + "grad_norm": 10.17711353302002, + "learning_rate": 5.4118729866543955e-06, + "loss": 1.3602, + "step": 17436 + }, + { + "epoch": 2.19, + "grad_norm": 22.67192268371582, + "learning_rate": 5.411036271597708e-06, + "loss": 1.5603, + "step": 17437 + }, + { + "epoch": 2.19, + "grad_norm": 23.222631454467773, + "learning_rate": 5.41019955654102e-06, + "loss": 0.5541, + "step": 17438 + }, + { + "epoch": 2.19, + "grad_norm": 17.52680015563965, + "learning_rate": 5.409362841484332e-06, + "loss": 0.9239, + "step": 17439 + }, + { + "epoch": 2.19, + "grad_norm": 9.070002555847168, + "learning_rate": 5.408526126427646e-06, + "loss": 0.6678, + "step": 17440 + }, + { + "epoch": 2.19, + "grad_norm": 13.469550132751465, + "learning_rate": 5.407689411370958e-06, + "loss": 0.7127, + "step": 17441 + }, + { + "epoch": 2.19, + "grad_norm": 26.821613311767578, + "learning_rate": 5.406852696314271e-06, + "loss": 0.5464, + "step": 17442 + }, + { + "epoch": 2.19, + "grad_norm": 20.423538208007812, + "learning_rate": 5.4060159812575826e-06, + "loss": 0.87, + "step": 17443 + }, + { + "epoch": 2.19, + "grad_norm": 4.490708827972412, + "learning_rate": 5.405179266200896e-06, + "loss": 0.4314, + "step": 17444 + }, + { + "epoch": 2.19, + "grad_norm": 26.50490379333496, + "learning_rate": 5.404342551144208e-06, + "loss": 2.3217, + "step": 17445 + }, + { + "epoch": 2.19, + "grad_norm": 33.80739974975586, + "learning_rate": 5.40350583608752e-06, + "loss": 3.0031, + "step": 17446 + }, + { + "epoch": 2.19, + "grad_norm": 24.16927146911621, + "learning_rate": 5.402669121030834e-06, + "loss": 0.7531, + "step": 17447 + }, + { + "epoch": 2.19, + "grad_norm": 16.82111930847168, + "learning_rate": 5.401832405974146e-06, + "loss": 1.4441, + "step": 17448 + }, + { + "epoch": 2.19, + "grad_norm": 7.376926898956299, + "learning_rate": 5.4009956909174585e-06, + "loss": 1.3377, + "step": 17449 + }, + { + "epoch": 2.19, + "grad_norm": 32.68447494506836, + "learning_rate": 5.4001589758607705e-06, + "loss": 1.5825, + "step": 17450 + }, + { + "epoch": 2.19, + "grad_norm": 10.641036987304688, + "learning_rate": 5.399322260804084e-06, + "loss": 0.5053, + "step": 17451 + }, + { + "epoch": 2.19, + "grad_norm": 9.782011032104492, + "learning_rate": 5.398485545747396e-06, + "loss": 0.1486, + "step": 17452 + }, + { + "epoch": 2.19, + "grad_norm": 24.293102264404297, + "learning_rate": 5.397648830690708e-06, + "loss": 0.4749, + "step": 17453 + }, + { + "epoch": 2.19, + "grad_norm": 33.50642776489258, + "learning_rate": 5.396812115634022e-06, + "loss": 0.8247, + "step": 17454 + }, + { + "epoch": 2.19, + "grad_norm": 29.112720489501953, + "learning_rate": 5.395975400577334e-06, + "loss": 0.6069, + "step": 17455 + }, + { + "epoch": 2.19, + "grad_norm": 32.35807800292969, + "learning_rate": 5.3951386855206464e-06, + "loss": 1.8802, + "step": 17456 + }, + { + "epoch": 2.19, + "grad_norm": 6.860402584075928, + "learning_rate": 5.394301970463958e-06, + "loss": 0.2353, + "step": 17457 + }, + { + "epoch": 2.19, + "grad_norm": 11.936412811279297, + "learning_rate": 5.393465255407272e-06, + "loss": 0.1448, + "step": 17458 + }, + { + "epoch": 2.19, + "grad_norm": 12.567363739013672, + "learning_rate": 5.392628540350584e-06, + "loss": 0.9206, + "step": 17459 + }, + { + "epoch": 2.19, + "grad_norm": 7.77496337890625, + "learning_rate": 5.391791825293896e-06, + "loss": 0.8399, + "step": 17460 + }, + { + "epoch": 2.19, + "grad_norm": 19.330772399902344, + "learning_rate": 5.39095511023721e-06, + "loss": 0.9187, + "step": 17461 + }, + { + "epoch": 2.19, + "grad_norm": 6.970808982849121, + "learning_rate": 5.3901183951805216e-06, + "loss": 0.8056, + "step": 17462 + }, + { + "epoch": 2.19, + "grad_norm": 14.46152400970459, + "learning_rate": 5.389281680123834e-06, + "loss": 0.4848, + "step": 17463 + }, + { + "epoch": 2.19, + "grad_norm": 10.946375846862793, + "learning_rate": 5.388444965067146e-06, + "loss": 1.0824, + "step": 17464 + }, + { + "epoch": 2.19, + "grad_norm": 10.016998291015625, + "learning_rate": 5.38760825001046e-06, + "loss": 0.3515, + "step": 17465 + }, + { + "epoch": 2.19, + "grad_norm": 7.684912204742432, + "learning_rate": 5.386771534953772e-06, + "loss": 0.689, + "step": 17466 + }, + { + "epoch": 2.19, + "grad_norm": 13.379766464233398, + "learning_rate": 5.385934819897084e-06, + "loss": 1.5413, + "step": 17467 + }, + { + "epoch": 2.19, + "grad_norm": 7.916872024536133, + "learning_rate": 5.3850981048403975e-06, + "loss": 1.6151, + "step": 17468 + }, + { + "epoch": 2.19, + "grad_norm": 6.842104911804199, + "learning_rate": 5.3842613897837095e-06, + "loss": 0.2652, + "step": 17469 + }, + { + "epoch": 2.19, + "grad_norm": 14.729266166687012, + "learning_rate": 5.383424674727022e-06, + "loss": 1.7612, + "step": 17470 + }, + { + "epoch": 2.19, + "grad_norm": 14.622649192810059, + "learning_rate": 5.382587959670334e-06, + "loss": 1.2312, + "step": 17471 + }, + { + "epoch": 2.19, + "grad_norm": 14.882375717163086, + "learning_rate": 5.381751244613648e-06, + "loss": 1.3312, + "step": 17472 + }, + { + "epoch": 2.19, + "grad_norm": 11.30505084991455, + "learning_rate": 5.38091452955696e-06, + "loss": 0.4109, + "step": 17473 + }, + { + "epoch": 2.19, + "grad_norm": 14.341401100158691, + "learning_rate": 5.380077814500272e-06, + "loss": 0.5819, + "step": 17474 + }, + { + "epoch": 2.19, + "grad_norm": 71.64712524414062, + "learning_rate": 5.3792410994435854e-06, + "loss": 1.3767, + "step": 17475 + }, + { + "epoch": 2.19, + "grad_norm": 11.888653755187988, + "learning_rate": 5.378404384386897e-06, + "loss": 0.8234, + "step": 17476 + }, + { + "epoch": 2.19, + "grad_norm": 12.765649795532227, + "learning_rate": 5.37756766933021e-06, + "loss": 0.7687, + "step": 17477 + }, + { + "epoch": 2.19, + "grad_norm": 18.29014778137207, + "learning_rate": 5.376730954273522e-06, + "loss": 1.1736, + "step": 17478 + }, + { + "epoch": 2.19, + "grad_norm": 11.075541496276855, + "learning_rate": 5.375894239216836e-06, + "loss": 0.4156, + "step": 17479 + }, + { + "epoch": 2.19, + "grad_norm": 11.490434646606445, + "learning_rate": 5.375057524160148e-06, + "loss": 0.8648, + "step": 17480 + }, + { + "epoch": 2.19, + "grad_norm": 24.799287796020508, + "learning_rate": 5.37422080910346e-06, + "loss": 0.7148, + "step": 17481 + }, + { + "epoch": 2.19, + "grad_norm": 122.64232635498047, + "learning_rate": 5.373384094046773e-06, + "loss": 1.5835, + "step": 17482 + }, + { + "epoch": 2.19, + "grad_norm": 27.34222412109375, + "learning_rate": 5.372547378990085e-06, + "loss": 1.2789, + "step": 17483 + }, + { + "epoch": 2.19, + "grad_norm": 23.286502838134766, + "learning_rate": 5.371710663933398e-06, + "loss": 2.4069, + "step": 17484 + }, + { + "epoch": 2.19, + "grad_norm": 7.177731037139893, + "learning_rate": 5.37087394887671e-06, + "loss": 0.8277, + "step": 17485 + }, + { + "epoch": 2.19, + "grad_norm": 19.814857482910156, + "learning_rate": 5.370037233820024e-06, + "loss": 0.2308, + "step": 17486 + }, + { + "epoch": 2.19, + "grad_norm": 34.60707473754883, + "learning_rate": 5.369200518763336e-06, + "loss": 3.0668, + "step": 17487 + }, + { + "epoch": 2.19, + "grad_norm": 11.37707805633545, + "learning_rate": 5.368363803706648e-06, + "loss": 0.7361, + "step": 17488 + }, + { + "epoch": 2.19, + "grad_norm": 3.2040646076202393, + "learning_rate": 5.367527088649961e-06, + "loss": 0.2547, + "step": 17489 + }, + { + "epoch": 2.19, + "grad_norm": 11.337788581848145, + "learning_rate": 5.366690373593273e-06, + "loss": 1.5259, + "step": 17490 + }, + { + "epoch": 2.2, + "grad_norm": 15.41157341003418, + "learning_rate": 5.365853658536586e-06, + "loss": 1.224, + "step": 17491 + }, + { + "epoch": 2.2, + "grad_norm": 39.75604248046875, + "learning_rate": 5.365016943479898e-06, + "loss": 0.8128, + "step": 17492 + }, + { + "epoch": 2.2, + "grad_norm": 6.115261554718018, + "learning_rate": 5.364180228423212e-06, + "loss": 0.162, + "step": 17493 + }, + { + "epoch": 2.2, + "grad_norm": 12.996819496154785, + "learning_rate": 5.363343513366524e-06, + "loss": 0.4829, + "step": 17494 + }, + { + "epoch": 2.2, + "grad_norm": 8.252242088317871, + "learning_rate": 5.3625067983098355e-06, + "loss": 0.4253, + "step": 17495 + }, + { + "epoch": 2.2, + "grad_norm": 7.422755718231201, + "learning_rate": 5.361670083253148e-06, + "loss": 0.3621, + "step": 17496 + }, + { + "epoch": 2.2, + "grad_norm": 10.536725044250488, + "learning_rate": 5.360833368196461e-06, + "loss": 1.104, + "step": 17497 + }, + { + "epoch": 2.2, + "grad_norm": 65.46198272705078, + "learning_rate": 5.359996653139774e-06, + "loss": 1.338, + "step": 17498 + }, + { + "epoch": 2.2, + "grad_norm": 1.54756498336792, + "learning_rate": 5.359159938083086e-06, + "loss": 0.0467, + "step": 17499 + }, + { + "epoch": 2.2, + "grad_norm": 6.500692367553711, + "learning_rate": 5.3583232230263996e-06, + "loss": 0.3971, + "step": 17500 + }, + { + "epoch": 2.2, + "grad_norm": 34.247474670410156, + "learning_rate": 5.3574865079697115e-06, + "loss": 1.277, + "step": 17501 + }, + { + "epoch": 2.2, + "grad_norm": 18.990402221679688, + "learning_rate": 5.3566497929130235e-06, + "loss": 2.9791, + "step": 17502 + }, + { + "epoch": 2.2, + "grad_norm": 14.820756912231445, + "learning_rate": 5.355813077856336e-06, + "loss": 2.2124, + "step": 17503 + }, + { + "epoch": 2.2, + "grad_norm": 22.829708099365234, + "learning_rate": 5.354976362799649e-06, + "loss": 1.2131, + "step": 17504 + }, + { + "epoch": 2.2, + "grad_norm": 19.45446014404297, + "learning_rate": 5.354139647742962e-06, + "loss": 1.1271, + "step": 17505 + }, + { + "epoch": 2.2, + "grad_norm": 9.499173164367676, + "learning_rate": 5.353302932686274e-06, + "loss": 1.1174, + "step": 17506 + }, + { + "epoch": 2.2, + "grad_norm": 15.749574661254883, + "learning_rate": 5.3524662176295875e-06, + "loss": 2.0691, + "step": 17507 + }, + { + "epoch": 2.2, + "grad_norm": 10.936659812927246, + "learning_rate": 5.3516295025728994e-06, + "loss": 0.8516, + "step": 17508 + }, + { + "epoch": 2.2, + "grad_norm": 3.4799952507019043, + "learning_rate": 5.350792787516211e-06, + "loss": 0.0853, + "step": 17509 + }, + { + "epoch": 2.2, + "grad_norm": 17.480205535888672, + "learning_rate": 5.349956072459524e-06, + "loss": 1.037, + "step": 17510 + }, + { + "epoch": 2.2, + "grad_norm": 23.715269088745117, + "learning_rate": 5.349119357402837e-06, + "loss": 2.084, + "step": 17511 + }, + { + "epoch": 2.2, + "grad_norm": 39.498802185058594, + "learning_rate": 5.34828264234615e-06, + "loss": 1.6885, + "step": 17512 + }, + { + "epoch": 2.2, + "grad_norm": 8.803202629089355, + "learning_rate": 5.347445927289462e-06, + "loss": 0.3146, + "step": 17513 + }, + { + "epoch": 2.2, + "grad_norm": 8.69205379486084, + "learning_rate": 5.346609212232775e-06, + "loss": 0.3413, + "step": 17514 + }, + { + "epoch": 2.2, + "grad_norm": 6.984821319580078, + "learning_rate": 5.345772497176087e-06, + "loss": 0.4525, + "step": 17515 + }, + { + "epoch": 2.2, + "grad_norm": 66.03707122802734, + "learning_rate": 5.344935782119399e-06, + "loss": 0.8255, + "step": 17516 + }, + { + "epoch": 2.2, + "grad_norm": 14.11879825592041, + "learning_rate": 5.344099067062712e-06, + "loss": 0.8665, + "step": 17517 + }, + { + "epoch": 2.2, + "grad_norm": 33.14910888671875, + "learning_rate": 5.343262352006025e-06, + "loss": 2.288, + "step": 17518 + }, + { + "epoch": 2.2, + "grad_norm": 14.559138298034668, + "learning_rate": 5.342425636949338e-06, + "loss": 3.0581, + "step": 17519 + }, + { + "epoch": 2.2, + "grad_norm": 15.356159210205078, + "learning_rate": 5.34158892189265e-06, + "loss": 1.2524, + "step": 17520 + }, + { + "epoch": 2.2, + "grad_norm": 37.47621154785156, + "learning_rate": 5.340752206835963e-06, + "loss": 0.5242, + "step": 17521 + }, + { + "epoch": 2.2, + "grad_norm": 87.72489929199219, + "learning_rate": 5.339915491779275e-06, + "loss": 1.4852, + "step": 17522 + }, + { + "epoch": 2.2, + "grad_norm": 36.96324920654297, + "learning_rate": 5.339078776722587e-06, + "loss": 0.9385, + "step": 17523 + }, + { + "epoch": 2.2, + "grad_norm": 7.783929347991943, + "learning_rate": 5.3382420616659e-06, + "loss": 0.7675, + "step": 17524 + }, + { + "epoch": 2.2, + "grad_norm": 15.189329147338867, + "learning_rate": 5.337405346609212e-06, + "loss": 1.3922, + "step": 17525 + }, + { + "epoch": 2.2, + "grad_norm": 9.003801345825195, + "learning_rate": 5.336568631552526e-06, + "loss": 1.2127, + "step": 17526 + }, + { + "epoch": 2.2, + "grad_norm": 13.577678680419922, + "learning_rate": 5.335731916495838e-06, + "loss": 1.5522, + "step": 17527 + }, + { + "epoch": 2.2, + "grad_norm": 45.36927032470703, + "learning_rate": 5.334895201439151e-06, + "loss": 2.2272, + "step": 17528 + }, + { + "epoch": 2.2, + "grad_norm": 18.816207885742188, + "learning_rate": 5.334058486382463e-06, + "loss": 0.8083, + "step": 17529 + }, + { + "epoch": 2.2, + "grad_norm": 9.518589973449707, + "learning_rate": 5.333221771325775e-06, + "loss": 0.3954, + "step": 17530 + }, + { + "epoch": 2.2, + "grad_norm": 143.4003143310547, + "learning_rate": 5.332385056269088e-06, + "loss": 2.8706, + "step": 17531 + }, + { + "epoch": 2.2, + "grad_norm": 7.963112831115723, + "learning_rate": 5.3315483412124e-06, + "loss": 1.2719, + "step": 17532 + }, + { + "epoch": 2.2, + "grad_norm": 20.16355323791504, + "learning_rate": 5.3307116261557135e-06, + "loss": 2.0285, + "step": 17533 + }, + { + "epoch": 2.2, + "grad_norm": 10.042006492614746, + "learning_rate": 5.3298749110990255e-06, + "loss": 1.1726, + "step": 17534 + }, + { + "epoch": 2.2, + "grad_norm": 3.8463993072509766, + "learning_rate": 5.329038196042339e-06, + "loss": 0.4411, + "step": 17535 + }, + { + "epoch": 2.2, + "grad_norm": 11.920524597167969, + "learning_rate": 5.328201480985651e-06, + "loss": 0.5937, + "step": 17536 + }, + { + "epoch": 2.2, + "grad_norm": 27.405885696411133, + "learning_rate": 5.327364765928963e-06, + "loss": 0.2532, + "step": 17537 + }, + { + "epoch": 2.2, + "grad_norm": 10.97453498840332, + "learning_rate": 5.326528050872276e-06, + "loss": 0.5277, + "step": 17538 + }, + { + "epoch": 2.2, + "grad_norm": 35.17599105834961, + "learning_rate": 5.325691335815588e-06, + "loss": 0.7843, + "step": 17539 + }, + { + "epoch": 2.2, + "grad_norm": 13.560148239135742, + "learning_rate": 5.3248546207589015e-06, + "loss": 0.7429, + "step": 17540 + }, + { + "epoch": 2.2, + "grad_norm": 23.572628021240234, + "learning_rate": 5.324017905702213e-06, + "loss": 1.2264, + "step": 17541 + }, + { + "epoch": 2.2, + "grad_norm": 12.191099166870117, + "learning_rate": 5.323181190645526e-06, + "loss": 0.8501, + "step": 17542 + }, + { + "epoch": 2.2, + "grad_norm": 13.795705795288086, + "learning_rate": 5.322344475588839e-06, + "loss": 1.266, + "step": 17543 + }, + { + "epoch": 2.2, + "grad_norm": 11.90030288696289, + "learning_rate": 5.321507760532151e-06, + "loss": 0.7567, + "step": 17544 + }, + { + "epoch": 2.2, + "grad_norm": 34.31806182861328, + "learning_rate": 5.320671045475464e-06, + "loss": 1.7403, + "step": 17545 + }, + { + "epoch": 2.2, + "grad_norm": 11.005486488342285, + "learning_rate": 5.319834330418776e-06, + "loss": 0.6046, + "step": 17546 + }, + { + "epoch": 2.2, + "grad_norm": 11.976456642150879, + "learning_rate": 5.318997615362089e-06, + "loss": 0.4688, + "step": 17547 + }, + { + "epoch": 2.2, + "grad_norm": 3.7449660301208496, + "learning_rate": 5.318160900305401e-06, + "loss": 0.1778, + "step": 17548 + }, + { + "epoch": 2.2, + "grad_norm": 10.648322105407715, + "learning_rate": 5.317324185248714e-06, + "loss": 1.4392, + "step": 17549 + }, + { + "epoch": 2.2, + "grad_norm": 9.711552619934082, + "learning_rate": 5.316487470192027e-06, + "loss": 0.5422, + "step": 17550 + }, + { + "epoch": 2.2, + "grad_norm": 11.51738166809082, + "learning_rate": 5.315650755135339e-06, + "loss": 1.7181, + "step": 17551 + }, + { + "epoch": 2.2, + "grad_norm": 31.752824783325195, + "learning_rate": 5.314814040078652e-06, + "loss": 1.9601, + "step": 17552 + }, + { + "epoch": 2.2, + "grad_norm": 20.286109924316406, + "learning_rate": 5.313977325021964e-06, + "loss": 0.5984, + "step": 17553 + }, + { + "epoch": 2.2, + "grad_norm": 2.763519763946533, + "learning_rate": 5.313140609965277e-06, + "loss": 0.147, + "step": 17554 + }, + { + "epoch": 2.2, + "grad_norm": 153.0485076904297, + "learning_rate": 5.312303894908589e-06, + "loss": 1.3208, + "step": 17555 + }, + { + "epoch": 2.2, + "grad_norm": 6.014166831970215, + "learning_rate": 5.311467179851902e-06, + "loss": 0.7034, + "step": 17556 + }, + { + "epoch": 2.2, + "grad_norm": 13.632169723510742, + "learning_rate": 5.310630464795215e-06, + "loss": 2.0591, + "step": 17557 + }, + { + "epoch": 2.2, + "grad_norm": 40.078006744384766, + "learning_rate": 5.309793749738527e-06, + "loss": 0.7908, + "step": 17558 + }, + { + "epoch": 2.2, + "grad_norm": 51.911190032958984, + "learning_rate": 5.30895703468184e-06, + "loss": 1.8859, + "step": 17559 + }, + { + "epoch": 2.2, + "grad_norm": 12.481959342956543, + "learning_rate": 5.3081203196251516e-06, + "loss": 1.0484, + "step": 17560 + }, + { + "epoch": 2.2, + "grad_norm": 14.264859199523926, + "learning_rate": 5.307283604568465e-06, + "loss": 1.6794, + "step": 17561 + }, + { + "epoch": 2.2, + "grad_norm": 11.537360191345215, + "learning_rate": 5.306446889511777e-06, + "loss": 1.0252, + "step": 17562 + }, + { + "epoch": 2.2, + "grad_norm": 16.290449142456055, + "learning_rate": 5.30561017445509e-06, + "loss": 1.8328, + "step": 17563 + }, + { + "epoch": 2.2, + "grad_norm": 33.80059814453125, + "learning_rate": 5.304773459398403e-06, + "loss": 2.6643, + "step": 17564 + }, + { + "epoch": 2.2, + "grad_norm": 13.9462251663208, + "learning_rate": 5.303936744341715e-06, + "loss": 2.2787, + "step": 17565 + }, + { + "epoch": 2.2, + "grad_norm": 14.095157623291016, + "learning_rate": 5.3031000292850275e-06, + "loss": 0.6708, + "step": 17566 + }, + { + "epoch": 2.2, + "grad_norm": 6.371540546417236, + "learning_rate": 5.3022633142283395e-06, + "loss": 0.2943, + "step": 17567 + }, + { + "epoch": 2.2, + "grad_norm": 23.037803649902344, + "learning_rate": 5.301426599171653e-06, + "loss": 1.0293, + "step": 17568 + }, + { + "epoch": 2.2, + "grad_norm": 13.298918724060059, + "learning_rate": 5.300589884114965e-06, + "loss": 0.9281, + "step": 17569 + }, + { + "epoch": 2.21, + "grad_norm": 9.514727592468262, + "learning_rate": 5.299753169058278e-06, + "loss": 0.5065, + "step": 17570 + }, + { + "epoch": 2.21, + "grad_norm": 10.597125053405762, + "learning_rate": 5.298916454001591e-06, + "loss": 0.7589, + "step": 17571 + }, + { + "epoch": 2.21, + "grad_norm": 13.349027633666992, + "learning_rate": 5.298079738944903e-06, + "loss": 2.7605, + "step": 17572 + }, + { + "epoch": 2.21, + "grad_norm": 4.783844470977783, + "learning_rate": 5.2972430238882155e-06, + "loss": 0.5558, + "step": 17573 + }, + { + "epoch": 2.21, + "grad_norm": 21.072872161865234, + "learning_rate": 5.296406308831527e-06, + "loss": 0.6579, + "step": 17574 + }, + { + "epoch": 2.21, + "grad_norm": 3.221395254135132, + "learning_rate": 5.295569593774841e-06, + "loss": 0.2594, + "step": 17575 + }, + { + "epoch": 2.21, + "grad_norm": 12.586833000183105, + "learning_rate": 5.294732878718153e-06, + "loss": 1.1632, + "step": 17576 + }, + { + "epoch": 2.21, + "grad_norm": 16.660640716552734, + "learning_rate": 5.293896163661465e-06, + "loss": 0.4993, + "step": 17577 + }, + { + "epoch": 2.21, + "grad_norm": 7.333919525146484, + "learning_rate": 5.293059448604778e-06, + "loss": 0.4903, + "step": 17578 + }, + { + "epoch": 2.21, + "grad_norm": 31.236549377441406, + "learning_rate": 5.2922227335480906e-06, + "loss": 0.9915, + "step": 17579 + }, + { + "epoch": 2.21, + "grad_norm": 14.862787246704102, + "learning_rate": 5.291386018491403e-06, + "loss": 0.4041, + "step": 17580 + }, + { + "epoch": 2.21, + "grad_norm": 13.543603897094727, + "learning_rate": 5.290549303434715e-06, + "loss": 0.9671, + "step": 17581 + }, + { + "epoch": 2.21, + "grad_norm": 23.977313995361328, + "learning_rate": 5.289712588378029e-06, + "loss": 1.0142, + "step": 17582 + }, + { + "epoch": 2.21, + "grad_norm": 24.36107635498047, + "learning_rate": 5.288875873321341e-06, + "loss": 1.347, + "step": 17583 + }, + { + "epoch": 2.21, + "grad_norm": 5.181324005126953, + "learning_rate": 5.288039158264653e-06, + "loss": 0.8222, + "step": 17584 + }, + { + "epoch": 2.21, + "grad_norm": 11.00633716583252, + "learning_rate": 5.287202443207966e-06, + "loss": 0.3454, + "step": 17585 + }, + { + "epoch": 2.21, + "grad_norm": 97.79345703125, + "learning_rate": 5.2863657281512785e-06, + "loss": 1.1273, + "step": 17586 + }, + { + "epoch": 2.21, + "grad_norm": 5.67331600189209, + "learning_rate": 5.285529013094591e-06, + "loss": 0.4736, + "step": 17587 + }, + { + "epoch": 2.21, + "grad_norm": 14.97656536102295, + "learning_rate": 5.284692298037903e-06, + "loss": 0.8189, + "step": 17588 + }, + { + "epoch": 2.21, + "grad_norm": 13.51713752746582, + "learning_rate": 5.283855582981217e-06, + "loss": 1.57, + "step": 17589 + }, + { + "epoch": 2.21, + "grad_norm": 17.975173950195312, + "learning_rate": 5.283018867924529e-06, + "loss": 1.2526, + "step": 17590 + }, + { + "epoch": 2.21, + "grad_norm": 18.56690216064453, + "learning_rate": 5.282182152867841e-06, + "loss": 1.7809, + "step": 17591 + }, + { + "epoch": 2.21, + "grad_norm": 9.219497680664062, + "learning_rate": 5.281345437811154e-06, + "loss": 0.4182, + "step": 17592 + }, + { + "epoch": 2.21, + "grad_norm": 24.483352661132812, + "learning_rate": 5.280508722754466e-06, + "loss": 2.3645, + "step": 17593 + }, + { + "epoch": 2.21, + "grad_norm": 10.796198844909668, + "learning_rate": 5.279672007697779e-06, + "loss": 0.3318, + "step": 17594 + }, + { + "epoch": 2.21, + "grad_norm": 27.02687644958496, + "learning_rate": 5.278835292641091e-06, + "loss": 1.1065, + "step": 17595 + }, + { + "epoch": 2.21, + "grad_norm": 4.5153303146362305, + "learning_rate": 5.277998577584405e-06, + "loss": 0.1791, + "step": 17596 + }, + { + "epoch": 2.21, + "grad_norm": 12.836956024169922, + "learning_rate": 5.277161862527717e-06, + "loss": 0.3242, + "step": 17597 + }, + { + "epoch": 2.21, + "grad_norm": 57.1956787109375, + "learning_rate": 5.276325147471029e-06, + "loss": 0.6991, + "step": 17598 + }, + { + "epoch": 2.21, + "grad_norm": 18.503223419189453, + "learning_rate": 5.2754884324143415e-06, + "loss": 1.3607, + "step": 17599 + }, + { + "epoch": 2.21, + "grad_norm": 13.974534034729004, + "learning_rate": 5.274651717357654e-06, + "loss": 0.7077, + "step": 17600 + }, + { + "epoch": 2.21, + "eval_loss": 0.08298930525779724, + "eval_runtime": 95.116, + "eval_samples_per_second": 37.239, + "eval_steps_per_second": 37.239, + "step": 17600 + }, + { + "epoch": 2.21, + "grad_norm": 33.60075759887695, + "learning_rate": 5.273815002300967e-06, + "loss": 2.1778, + "step": 17601 + }, + { + "epoch": 2.21, + "grad_norm": 20.371700286865234, + "learning_rate": 5.272978287244279e-06, + "loss": 1.0122, + "step": 17602 + }, + { + "epoch": 2.21, + "grad_norm": 7.336877346038818, + "learning_rate": 5.272141572187593e-06, + "loss": 0.3176, + "step": 17603 + }, + { + "epoch": 2.21, + "grad_norm": 232.3487091064453, + "learning_rate": 5.271304857130905e-06, + "loss": 1.5995, + "step": 17604 + }, + { + "epoch": 2.21, + "grad_norm": 12.699992179870605, + "learning_rate": 5.270468142074217e-06, + "loss": 0.5626, + "step": 17605 + }, + { + "epoch": 2.21, + "grad_norm": 15.630974769592285, + "learning_rate": 5.2696314270175294e-06, + "loss": 1.003, + "step": 17606 + }, + { + "epoch": 2.21, + "grad_norm": 8.735376358032227, + "learning_rate": 5.268794711960841e-06, + "loss": 0.6007, + "step": 17607 + }, + { + "epoch": 2.21, + "grad_norm": 12.420937538146973, + "learning_rate": 5.267957996904155e-06, + "loss": 0.4516, + "step": 17608 + }, + { + "epoch": 2.21, + "grad_norm": 20.599655151367188, + "learning_rate": 5.267121281847467e-06, + "loss": 0.724, + "step": 17609 + }, + { + "epoch": 2.21, + "grad_norm": 3.0221431255340576, + "learning_rate": 5.266284566790781e-06, + "loss": 0.0848, + "step": 17610 + }, + { + "epoch": 2.21, + "grad_norm": 26.17032814025879, + "learning_rate": 5.265447851734093e-06, + "loss": 1.3508, + "step": 17611 + }, + { + "epoch": 2.21, + "grad_norm": 30.255577087402344, + "learning_rate": 5.2646111366774046e-06, + "loss": 1.1052, + "step": 17612 + }, + { + "epoch": 2.21, + "grad_norm": 18.89253044128418, + "learning_rate": 5.263774421620717e-06, + "loss": 0.9781, + "step": 17613 + }, + { + "epoch": 2.21, + "grad_norm": 11.845908164978027, + "learning_rate": 5.262937706564029e-06, + "loss": 0.4968, + "step": 17614 + }, + { + "epoch": 2.21, + "grad_norm": 142.40744018554688, + "learning_rate": 5.262100991507343e-06, + "loss": 2.5384, + "step": 17615 + }, + { + "epoch": 2.21, + "grad_norm": 11.632862091064453, + "learning_rate": 5.261264276450655e-06, + "loss": 0.7456, + "step": 17616 + }, + { + "epoch": 2.21, + "grad_norm": 49.94683074951172, + "learning_rate": 5.2604275613939686e-06, + "loss": 4.7063, + "step": 17617 + }, + { + "epoch": 2.21, + "grad_norm": 13.744133949279785, + "learning_rate": 5.2595908463372805e-06, + "loss": 0.8506, + "step": 17618 + }, + { + "epoch": 2.21, + "grad_norm": 6.055446147918701, + "learning_rate": 5.2587541312805925e-06, + "loss": 1.0562, + "step": 17619 + }, + { + "epoch": 2.21, + "grad_norm": 18.613239288330078, + "learning_rate": 5.257917416223905e-06, + "loss": 0.6748, + "step": 17620 + }, + { + "epoch": 2.21, + "grad_norm": 49.51359176635742, + "learning_rate": 5.257080701167217e-06, + "loss": 1.5565, + "step": 17621 + }, + { + "epoch": 2.21, + "grad_norm": 23.81951141357422, + "learning_rate": 5.256243986110531e-06, + "loss": 0.8403, + "step": 17622 + }, + { + "epoch": 2.21, + "grad_norm": 18.800947189331055, + "learning_rate": 5.255407271053843e-06, + "loss": 2.0553, + "step": 17623 + }, + { + "epoch": 2.21, + "grad_norm": 14.106256484985352, + "learning_rate": 5.254570555997156e-06, + "loss": 2.1392, + "step": 17624 + }, + { + "epoch": 2.21, + "grad_norm": 6.436473846435547, + "learning_rate": 5.2537338409404684e-06, + "loss": 0.068, + "step": 17625 + }, + { + "epoch": 2.21, + "grad_norm": 47.873111724853516, + "learning_rate": 5.25289712588378e-06, + "loss": 2.3184, + "step": 17626 + }, + { + "epoch": 2.21, + "grad_norm": 15.037662506103516, + "learning_rate": 5.252060410827093e-06, + "loss": 1.1386, + "step": 17627 + }, + { + "epoch": 2.21, + "grad_norm": 199.6589813232422, + "learning_rate": 5.251223695770405e-06, + "loss": 1.5881, + "step": 17628 + }, + { + "epoch": 2.21, + "grad_norm": 10.401008605957031, + "learning_rate": 5.250386980713719e-06, + "loss": 1.2039, + "step": 17629 + }, + { + "epoch": 2.21, + "grad_norm": 11.683107376098633, + "learning_rate": 5.249550265657031e-06, + "loss": 0.7879, + "step": 17630 + }, + { + "epoch": 2.21, + "grad_norm": 7.70859956741333, + "learning_rate": 5.2487135506003436e-06, + "loss": 0.6255, + "step": 17631 + }, + { + "epoch": 2.21, + "grad_norm": 24.480422973632812, + "learning_rate": 5.247876835543656e-06, + "loss": 1.4999, + "step": 17632 + }, + { + "epoch": 2.21, + "grad_norm": 20.174907684326172, + "learning_rate": 5.247040120486968e-06, + "loss": 1.5308, + "step": 17633 + }, + { + "epoch": 2.21, + "grad_norm": 26.942628860473633, + "learning_rate": 5.246203405430281e-06, + "loss": 1.1784, + "step": 17634 + }, + { + "epoch": 2.21, + "grad_norm": 13.76342487335205, + "learning_rate": 5.245366690373593e-06, + "loss": 0.4631, + "step": 17635 + }, + { + "epoch": 2.21, + "grad_norm": 38.779319763183594, + "learning_rate": 5.244529975316907e-06, + "loss": 0.6271, + "step": 17636 + }, + { + "epoch": 2.21, + "grad_norm": 6.188003063201904, + "learning_rate": 5.243693260260219e-06, + "loss": 0.9849, + "step": 17637 + }, + { + "epoch": 2.21, + "grad_norm": 18.986454010009766, + "learning_rate": 5.2428565452035315e-06, + "loss": 0.9632, + "step": 17638 + }, + { + "epoch": 2.21, + "grad_norm": 88.78659057617188, + "learning_rate": 5.242019830146844e-06, + "loss": 0.7307, + "step": 17639 + }, + { + "epoch": 2.21, + "grad_norm": 5.771511554718018, + "learning_rate": 5.241183115090156e-06, + "loss": 0.8864, + "step": 17640 + }, + { + "epoch": 2.21, + "grad_norm": 10.675668716430664, + "learning_rate": 5.240346400033469e-06, + "loss": 1.0148, + "step": 17641 + }, + { + "epoch": 2.21, + "grad_norm": 32.972877502441406, + "learning_rate": 5.239509684976781e-06, + "loss": 1.5309, + "step": 17642 + }, + { + "epoch": 2.21, + "grad_norm": 7.496189117431641, + "learning_rate": 5.238672969920095e-06, + "loss": 0.5997, + "step": 17643 + }, + { + "epoch": 2.21, + "grad_norm": 21.612586975097656, + "learning_rate": 5.237836254863407e-06, + "loss": 1.1054, + "step": 17644 + }, + { + "epoch": 2.21, + "grad_norm": 31.007505416870117, + "learning_rate": 5.236999539806719e-06, + "loss": 1.1644, + "step": 17645 + }, + { + "epoch": 2.21, + "grad_norm": 34.03743362426758, + "learning_rate": 5.236162824750032e-06, + "loss": 1.6796, + "step": 17646 + }, + { + "epoch": 2.21, + "grad_norm": 6.421183109283447, + "learning_rate": 5.235326109693344e-06, + "loss": 0.2818, + "step": 17647 + }, + { + "epoch": 2.21, + "grad_norm": 13.530191421508789, + "learning_rate": 5.234489394636657e-06, + "loss": 1.8073, + "step": 17648 + }, + { + "epoch": 2.21, + "grad_norm": 7.906088352203369, + "learning_rate": 5.233652679579969e-06, + "loss": 1.7872, + "step": 17649 + }, + { + "epoch": 2.22, + "grad_norm": 7.971576690673828, + "learning_rate": 5.2328159645232826e-06, + "loss": 0.1354, + "step": 17650 + }, + { + "epoch": 2.22, + "grad_norm": 5.973159313201904, + "learning_rate": 5.2319792494665945e-06, + "loss": 1.2619, + "step": 17651 + }, + { + "epoch": 2.22, + "grad_norm": 7.155309200286865, + "learning_rate": 5.231142534409907e-06, + "loss": 0.8706, + "step": 17652 + }, + { + "epoch": 2.22, + "grad_norm": 14.330678939819336, + "learning_rate": 5.230305819353219e-06, + "loss": 0.3922, + "step": 17653 + }, + { + "epoch": 2.22, + "grad_norm": 10.194661140441895, + "learning_rate": 5.229469104296532e-06, + "loss": 0.5062, + "step": 17654 + }, + { + "epoch": 2.22, + "grad_norm": 29.81911277770996, + "learning_rate": 5.228632389239845e-06, + "loss": 1.8285, + "step": 17655 + }, + { + "epoch": 2.22, + "grad_norm": 34.0993537902832, + "learning_rate": 5.227795674183157e-06, + "loss": 1.3139, + "step": 17656 + }, + { + "epoch": 2.22, + "grad_norm": 10.460409164428711, + "learning_rate": 5.2269589591264705e-06, + "loss": 1.3534, + "step": 17657 + }, + { + "epoch": 2.22, + "grad_norm": 28.992599487304688, + "learning_rate": 5.2261222440697824e-06, + "loss": 3.2598, + "step": 17658 + }, + { + "epoch": 2.22, + "grad_norm": 15.127599716186523, + "learning_rate": 5.225285529013095e-06, + "loss": 1.5536, + "step": 17659 + }, + { + "epoch": 2.22, + "grad_norm": 11.926405906677246, + "learning_rate": 5.224448813956407e-06, + "loss": 0.5888, + "step": 17660 + }, + { + "epoch": 2.22, + "grad_norm": 67.72642517089844, + "learning_rate": 5.22361209889972e-06, + "loss": 2.6624, + "step": 17661 + }, + { + "epoch": 2.22, + "grad_norm": 13.176055908203125, + "learning_rate": 5.222775383843033e-06, + "loss": 0.5765, + "step": 17662 + }, + { + "epoch": 2.22, + "grad_norm": 6.374279499053955, + "learning_rate": 5.221938668786345e-06, + "loss": 0.5205, + "step": 17663 + }, + { + "epoch": 2.22, + "grad_norm": 64.94546508789062, + "learning_rate": 5.221101953729658e-06, + "loss": 1.7056, + "step": 17664 + }, + { + "epoch": 2.22, + "grad_norm": 40.390602111816406, + "learning_rate": 5.22026523867297e-06, + "loss": 2.0192, + "step": 17665 + }, + { + "epoch": 2.22, + "grad_norm": 15.01516342163086, + "learning_rate": 5.219428523616283e-06, + "loss": 0.5252, + "step": 17666 + }, + { + "epoch": 2.22, + "grad_norm": 22.53778648376465, + "learning_rate": 5.218591808559595e-06, + "loss": 0.9188, + "step": 17667 + }, + { + "epoch": 2.22, + "grad_norm": 4.551015853881836, + "learning_rate": 5.217755093502908e-06, + "loss": 0.4476, + "step": 17668 + }, + { + "epoch": 2.22, + "grad_norm": 5.424520969390869, + "learning_rate": 5.216918378446221e-06, + "loss": 0.7938, + "step": 17669 + }, + { + "epoch": 2.22, + "grad_norm": 8.108823776245117, + "learning_rate": 5.216081663389533e-06, + "loss": 0.407, + "step": 17670 + }, + { + "epoch": 2.22, + "grad_norm": 7.024099349975586, + "learning_rate": 5.215244948332846e-06, + "loss": 1.005, + "step": 17671 + }, + { + "epoch": 2.22, + "grad_norm": 17.924938201904297, + "learning_rate": 5.214408233276158e-06, + "loss": 1.6325, + "step": 17672 + }, + { + "epoch": 2.22, + "grad_norm": 26.36313247680664, + "learning_rate": 5.213571518219471e-06, + "loss": 1.2318, + "step": 17673 + }, + { + "epoch": 2.22, + "grad_norm": 6.06789493560791, + "learning_rate": 5.212734803162783e-06, + "loss": 0.6525, + "step": 17674 + }, + { + "epoch": 2.22, + "grad_norm": 14.966464042663574, + "learning_rate": 5.211898088106096e-06, + "loss": 0.9419, + "step": 17675 + }, + { + "epoch": 2.22, + "grad_norm": 8.155410766601562, + "learning_rate": 5.211061373049409e-06, + "loss": 1.016, + "step": 17676 + }, + { + "epoch": 2.22, + "grad_norm": 49.047706604003906, + "learning_rate": 5.210224657992721e-06, + "loss": 1.491, + "step": 17677 + }, + { + "epoch": 2.22, + "grad_norm": 40.98280334472656, + "learning_rate": 5.209387942936034e-06, + "loss": 1.6621, + "step": 17678 + }, + { + "epoch": 2.22, + "grad_norm": 98.17125701904297, + "learning_rate": 5.208551227879346e-06, + "loss": 1.1052, + "step": 17679 + }, + { + "epoch": 2.22, + "grad_norm": 14.517447471618652, + "learning_rate": 5.207714512822659e-06, + "loss": 0.3564, + "step": 17680 + }, + { + "epoch": 2.22, + "grad_norm": 4.574953079223633, + "learning_rate": 5.206877797765971e-06, + "loss": 0.2258, + "step": 17681 + }, + { + "epoch": 2.22, + "grad_norm": 16.248891830444336, + "learning_rate": 5.206041082709284e-06, + "loss": 0.6977, + "step": 17682 + }, + { + "epoch": 2.22, + "grad_norm": 9.710600852966309, + "learning_rate": 5.2052043676525965e-06, + "loss": 0.8049, + "step": 17683 + }, + { + "epoch": 2.22, + "grad_norm": 18.749849319458008, + "learning_rate": 5.2043676525959085e-06, + "loss": 0.4875, + "step": 17684 + }, + { + "epoch": 2.22, + "grad_norm": 14.403364181518555, + "learning_rate": 5.203530937539222e-06, + "loss": 0.4326, + "step": 17685 + }, + { + "epoch": 2.22, + "grad_norm": 25.39562225341797, + "learning_rate": 5.202694222482534e-06, + "loss": 1.1199, + "step": 17686 + }, + { + "epoch": 2.22, + "grad_norm": 21.403207778930664, + "learning_rate": 5.201857507425847e-06, + "loss": 0.6541, + "step": 17687 + }, + { + "epoch": 2.22, + "grad_norm": 10.751093864440918, + "learning_rate": 5.201020792369159e-06, + "loss": 0.2954, + "step": 17688 + }, + { + "epoch": 2.22, + "grad_norm": 22.85277557373047, + "learning_rate": 5.200184077312471e-06, + "loss": 2.2193, + "step": 17689 + }, + { + "epoch": 2.22, + "grad_norm": 4.862344741821289, + "learning_rate": 5.1993473622557845e-06, + "loss": 1.3405, + "step": 17690 + }, + { + "epoch": 2.22, + "grad_norm": 4.9521684646606445, + "learning_rate": 5.198510647199096e-06, + "loss": 0.3263, + "step": 17691 + }, + { + "epoch": 2.22, + "grad_norm": 9.945138931274414, + "learning_rate": 5.19767393214241e-06, + "loss": 0.4844, + "step": 17692 + }, + { + "epoch": 2.22, + "grad_norm": 7.673474311828613, + "learning_rate": 5.196837217085722e-06, + "loss": 0.04, + "step": 17693 + }, + { + "epoch": 2.22, + "grad_norm": 9.783452033996582, + "learning_rate": 5.196000502029035e-06, + "loss": 0.9584, + "step": 17694 + }, + { + "epoch": 2.22, + "grad_norm": 23.981342315673828, + "learning_rate": 5.195163786972347e-06, + "loss": 1.8661, + "step": 17695 + }, + { + "epoch": 2.22, + "grad_norm": 18.916805267333984, + "learning_rate": 5.194327071915659e-06, + "loss": 0.9652, + "step": 17696 + }, + { + "epoch": 2.22, + "grad_norm": 76.17505645751953, + "learning_rate": 5.193490356858972e-06, + "loss": 1.6803, + "step": 17697 + }, + { + "epoch": 2.22, + "grad_norm": 20.964216232299805, + "learning_rate": 5.192653641802284e-06, + "loss": 0.6796, + "step": 17698 + }, + { + "epoch": 2.22, + "grad_norm": 11.345860481262207, + "learning_rate": 5.191816926745598e-06, + "loss": 1.497, + "step": 17699 + }, + { + "epoch": 2.22, + "grad_norm": 6.6955180168151855, + "learning_rate": 5.19098021168891e-06, + "loss": 0.6932, + "step": 17700 + }, + { + "epoch": 2.22, + "grad_norm": 14.110368728637695, + "learning_rate": 5.190143496632223e-06, + "loss": 0.8485, + "step": 17701 + }, + { + "epoch": 2.22, + "grad_norm": 12.59630298614502, + "learning_rate": 5.189306781575535e-06, + "loss": 1.4359, + "step": 17702 + }, + { + "epoch": 2.22, + "grad_norm": 7.114568710327148, + "learning_rate": 5.188470066518847e-06, + "loss": 0.2758, + "step": 17703 + }, + { + "epoch": 2.22, + "grad_norm": 11.192057609558105, + "learning_rate": 5.18763335146216e-06, + "loss": 1.1341, + "step": 17704 + }, + { + "epoch": 2.22, + "grad_norm": 8.494654655456543, + "learning_rate": 5.186796636405472e-06, + "loss": 1.4699, + "step": 17705 + }, + { + "epoch": 2.22, + "grad_norm": 11.067359924316406, + "learning_rate": 5.185959921348785e-06, + "loss": 0.2085, + "step": 17706 + }, + { + "epoch": 2.22, + "grad_norm": 49.78376388549805, + "learning_rate": 5.185123206292098e-06, + "loss": 1.5447, + "step": 17707 + }, + { + "epoch": 2.22, + "grad_norm": 13.176912307739258, + "learning_rate": 5.184286491235411e-06, + "loss": 0.234, + "step": 17708 + }, + { + "epoch": 2.22, + "grad_norm": 14.371654510498047, + "learning_rate": 5.183449776178723e-06, + "loss": 0.5464, + "step": 17709 + }, + { + "epoch": 2.22, + "grad_norm": 15.566927909851074, + "learning_rate": 5.1826130611220346e-06, + "loss": 1.1125, + "step": 17710 + }, + { + "epoch": 2.22, + "grad_norm": 7.577420234680176, + "learning_rate": 5.181776346065348e-06, + "loss": 1.8091, + "step": 17711 + }, + { + "epoch": 2.22, + "grad_norm": 3.659139633178711, + "learning_rate": 5.18093963100866e-06, + "loss": 0.5024, + "step": 17712 + }, + { + "epoch": 2.22, + "grad_norm": 12.561866760253906, + "learning_rate": 5.180102915951973e-06, + "loss": 0.2913, + "step": 17713 + }, + { + "epoch": 2.22, + "grad_norm": 39.97332763671875, + "learning_rate": 5.179266200895286e-06, + "loss": 1.2649, + "step": 17714 + }, + { + "epoch": 2.22, + "grad_norm": 106.55900573730469, + "learning_rate": 5.178429485838599e-06, + "loss": 0.8508, + "step": 17715 + }, + { + "epoch": 2.22, + "grad_norm": 32.46993637084961, + "learning_rate": 5.1775927707819105e-06, + "loss": 1.5124, + "step": 17716 + }, + { + "epoch": 2.22, + "grad_norm": 10.00876235961914, + "learning_rate": 5.1767560557252225e-06, + "loss": 0.9794, + "step": 17717 + }, + { + "epoch": 2.22, + "grad_norm": 8.991241455078125, + "learning_rate": 5.175919340668536e-06, + "loss": 1.698, + "step": 17718 + }, + { + "epoch": 2.22, + "grad_norm": 5.391348361968994, + "learning_rate": 5.175082625611848e-06, + "loss": 0.1494, + "step": 17719 + }, + { + "epoch": 2.22, + "grad_norm": 15.718477249145508, + "learning_rate": 5.174245910555161e-06, + "loss": 0.5882, + "step": 17720 + }, + { + "epoch": 2.22, + "grad_norm": 5.732717037200928, + "learning_rate": 5.173409195498474e-06, + "loss": 0.9163, + "step": 17721 + }, + { + "epoch": 2.22, + "grad_norm": 14.677023887634277, + "learning_rate": 5.1725724804417865e-06, + "loss": 0.4184, + "step": 17722 + }, + { + "epoch": 2.22, + "grad_norm": 44.430641174316406, + "learning_rate": 5.1717357653850985e-06, + "loss": 3.6664, + "step": 17723 + }, + { + "epoch": 2.22, + "grad_norm": 4.236386775970459, + "learning_rate": 5.17089905032841e-06, + "loss": 0.3657, + "step": 17724 + }, + { + "epoch": 2.22, + "grad_norm": 12.09508991241455, + "learning_rate": 5.170062335271724e-06, + "loss": 2.1726, + "step": 17725 + }, + { + "epoch": 2.22, + "grad_norm": 12.473302841186523, + "learning_rate": 5.169225620215036e-06, + "loss": 0.3477, + "step": 17726 + }, + { + "epoch": 2.22, + "grad_norm": 13.235149383544922, + "learning_rate": 5.168388905158349e-06, + "loss": 0.4917, + "step": 17727 + }, + { + "epoch": 2.22, + "grad_norm": 7.2687811851501465, + "learning_rate": 5.167552190101662e-06, + "loss": 2.1986, + "step": 17728 + }, + { + "epoch": 2.22, + "grad_norm": 46.85900115966797, + "learning_rate": 5.166715475044974e-06, + "loss": 2.0231, + "step": 17729 + }, + { + "epoch": 2.23, + "grad_norm": 13.293231010437012, + "learning_rate": 5.165878759988286e-06, + "loss": 0.201, + "step": 17730 + }, + { + "epoch": 2.23, + "grad_norm": 13.189783096313477, + "learning_rate": 5.165042044931598e-06, + "loss": 0.8895, + "step": 17731 + }, + { + "epoch": 2.23, + "grad_norm": 16.408214569091797, + "learning_rate": 5.164205329874912e-06, + "loss": 1.075, + "step": 17732 + }, + { + "epoch": 2.23, + "grad_norm": 17.93597984313965, + "learning_rate": 5.163368614818224e-06, + "loss": 1.2993, + "step": 17733 + }, + { + "epoch": 2.23, + "grad_norm": 17.551984786987305, + "learning_rate": 5.162531899761537e-06, + "loss": 0.2072, + "step": 17734 + }, + { + "epoch": 2.23, + "grad_norm": 15.895017623901367, + "learning_rate": 5.161695184704849e-06, + "loss": 0.9385, + "step": 17735 + }, + { + "epoch": 2.23, + "grad_norm": 42.44586181640625, + "learning_rate": 5.160858469648162e-06, + "loss": 0.6849, + "step": 17736 + }, + { + "epoch": 2.23, + "grad_norm": 14.050836563110352, + "learning_rate": 5.160021754591474e-06, + "loss": 0.5258, + "step": 17737 + }, + { + "epoch": 2.23, + "grad_norm": 21.7032413482666, + "learning_rate": 5.159185039534786e-06, + "loss": 2.3616, + "step": 17738 + }, + { + "epoch": 2.23, + "grad_norm": 21.918964385986328, + "learning_rate": 5.1583483244781e-06, + "loss": 0.6126, + "step": 17739 + }, + { + "epoch": 2.23, + "grad_norm": 13.94185733795166, + "learning_rate": 5.157511609421412e-06, + "loss": 0.7284, + "step": 17740 + }, + { + "epoch": 2.23, + "grad_norm": 9.73373794555664, + "learning_rate": 5.156674894364725e-06, + "loss": 0.623, + "step": 17741 + }, + { + "epoch": 2.23, + "grad_norm": 13.341050148010254, + "learning_rate": 5.155838179308037e-06, + "loss": 0.3461, + "step": 17742 + }, + { + "epoch": 2.23, + "grad_norm": 11.477652549743652, + "learning_rate": 5.15500146425135e-06, + "loss": 1.3524, + "step": 17743 + }, + { + "epoch": 2.23, + "grad_norm": 15.867574691772461, + "learning_rate": 5.154164749194662e-06, + "loss": 1.4677, + "step": 17744 + }, + { + "epoch": 2.23, + "grad_norm": 17.404987335205078, + "learning_rate": 5.153328034137974e-06, + "loss": 0.4331, + "step": 17745 + }, + { + "epoch": 2.23, + "grad_norm": 12.849403381347656, + "learning_rate": 5.152491319081288e-06, + "loss": 0.5099, + "step": 17746 + }, + { + "epoch": 2.23, + "grad_norm": 18.763734817504883, + "learning_rate": 5.1516546040246e-06, + "loss": 1.9163, + "step": 17747 + }, + { + "epoch": 2.23, + "grad_norm": 24.120838165283203, + "learning_rate": 5.1508178889679126e-06, + "loss": 1.3214, + "step": 17748 + }, + { + "epoch": 2.23, + "grad_norm": 41.783485412597656, + "learning_rate": 5.1499811739112245e-06, + "loss": 2.4696, + "step": 17749 + }, + { + "epoch": 2.23, + "grad_norm": 17.785659790039062, + "learning_rate": 5.149144458854537e-06, + "loss": 0.6855, + "step": 17750 + }, + { + "epoch": 2.23, + "grad_norm": 12.047317504882812, + "learning_rate": 5.14830774379785e-06, + "loss": 0.4267, + "step": 17751 + }, + { + "epoch": 2.23, + "grad_norm": 4.101709842681885, + "learning_rate": 5.147471028741162e-06, + "loss": 0.2058, + "step": 17752 + }, + { + "epoch": 2.23, + "grad_norm": 17.78318977355957, + "learning_rate": 5.146634313684476e-06, + "loss": 0.8566, + "step": 17753 + }, + { + "epoch": 2.23, + "grad_norm": 15.483389854431152, + "learning_rate": 5.145797598627788e-06, + "loss": 1.5509, + "step": 17754 + }, + { + "epoch": 2.23, + "grad_norm": 30.174823760986328, + "learning_rate": 5.1449608835711005e-06, + "loss": 1.3108, + "step": 17755 + }, + { + "epoch": 2.23, + "grad_norm": 4.915126323699951, + "learning_rate": 5.1441241685144124e-06, + "loss": 0.4768, + "step": 17756 + }, + { + "epoch": 2.23, + "grad_norm": 15.084676742553711, + "learning_rate": 5.143287453457725e-06, + "loss": 1.3511, + "step": 17757 + }, + { + "epoch": 2.23, + "grad_norm": 12.236443519592285, + "learning_rate": 5.142450738401038e-06, + "loss": 0.8682, + "step": 17758 + }, + { + "epoch": 2.23, + "grad_norm": 8.270609855651855, + "learning_rate": 5.14161402334435e-06, + "loss": 0.4339, + "step": 17759 + }, + { + "epoch": 2.23, + "grad_norm": 39.991859436035156, + "learning_rate": 5.140777308287664e-06, + "loss": 2.3828, + "step": 17760 + }, + { + "epoch": 2.23, + "grad_norm": 41.814022064208984, + "learning_rate": 5.139940593230976e-06, + "loss": 1.1213, + "step": 17761 + }, + { + "epoch": 2.23, + "grad_norm": 7.52501916885376, + "learning_rate": 5.139103878174288e-06, + "loss": 0.3014, + "step": 17762 + }, + { + "epoch": 2.23, + "grad_norm": 20.50660514831543, + "learning_rate": 5.1382671631176e-06, + "loss": 0.9175, + "step": 17763 + }, + { + "epoch": 2.23, + "grad_norm": 18.19256591796875, + "learning_rate": 5.137430448060912e-06, + "loss": 0.7378, + "step": 17764 + }, + { + "epoch": 2.23, + "grad_norm": 3.9844133853912354, + "learning_rate": 5.136593733004226e-06, + "loss": 0.0584, + "step": 17765 + }, + { + "epoch": 2.23, + "grad_norm": 14.300190925598145, + "learning_rate": 5.135757017947538e-06, + "loss": 0.7404, + "step": 17766 + }, + { + "epoch": 2.23, + "grad_norm": 38.13930130004883, + "learning_rate": 5.1349203028908516e-06, + "loss": 0.8283, + "step": 17767 + }, + { + "epoch": 2.23, + "grad_norm": 11.909294128417969, + "learning_rate": 5.1340835878341635e-06, + "loss": 0.8189, + "step": 17768 + }, + { + "epoch": 2.23, + "grad_norm": 11.108964920043945, + "learning_rate": 5.133246872777476e-06, + "loss": 0.8144, + "step": 17769 + }, + { + "epoch": 2.23, + "grad_norm": 8.476258277893066, + "learning_rate": 5.132410157720788e-06, + "loss": 0.5914, + "step": 17770 + }, + { + "epoch": 2.23, + "grad_norm": 6.253983497619629, + "learning_rate": 5.1315734426641e-06, + "loss": 0.4603, + "step": 17771 + }, + { + "epoch": 2.23, + "grad_norm": 33.24333572387695, + "learning_rate": 5.130736727607414e-06, + "loss": 0.9329, + "step": 17772 + }, + { + "epoch": 2.23, + "grad_norm": 16.08575439453125, + "learning_rate": 5.129900012550726e-06, + "loss": 0.5251, + "step": 17773 + }, + { + "epoch": 2.23, + "grad_norm": 115.9076156616211, + "learning_rate": 5.1290632974940395e-06, + "loss": 2.309, + "step": 17774 + }, + { + "epoch": 2.23, + "grad_norm": 19.51165008544922, + "learning_rate": 5.1282265824373514e-06, + "loss": 0.4296, + "step": 17775 + }, + { + "epoch": 2.23, + "grad_norm": 17.818742752075195, + "learning_rate": 5.127389867380664e-06, + "loss": 0.9087, + "step": 17776 + }, + { + "epoch": 2.23, + "grad_norm": 16.055883407592773, + "learning_rate": 5.126553152323976e-06, + "loss": 0.6511, + "step": 17777 + }, + { + "epoch": 2.23, + "grad_norm": 24.93146324157715, + "learning_rate": 5.125716437267288e-06, + "loss": 1.779, + "step": 17778 + }, + { + "epoch": 2.23, + "grad_norm": 29.532642364501953, + "learning_rate": 5.124879722210602e-06, + "loss": 0.9848, + "step": 17779 + }, + { + "epoch": 2.23, + "grad_norm": 10.471334457397461, + "learning_rate": 5.124043007153914e-06, + "loss": 0.8294, + "step": 17780 + }, + { + "epoch": 2.23, + "grad_norm": 22.57203483581543, + "learning_rate": 5.1232062920972266e-06, + "loss": 1.5461, + "step": 17781 + }, + { + "epoch": 2.23, + "grad_norm": 19.7887020111084, + "learning_rate": 5.122369577040539e-06, + "loss": 0.9483, + "step": 17782 + }, + { + "epoch": 2.23, + "grad_norm": 6.547120094299316, + "learning_rate": 5.121532861983852e-06, + "loss": 0.4218, + "step": 17783 + }, + { + "epoch": 2.23, + "grad_norm": 11.875927925109863, + "learning_rate": 5.120696146927164e-06, + "loss": 0.7022, + "step": 17784 + }, + { + "epoch": 2.23, + "grad_norm": 8.516631126403809, + "learning_rate": 5.119859431870476e-06, + "loss": 0.8346, + "step": 17785 + }, + { + "epoch": 2.23, + "grad_norm": 24.55098533630371, + "learning_rate": 5.11902271681379e-06, + "loss": 0.9406, + "step": 17786 + }, + { + "epoch": 2.23, + "grad_norm": 39.46234893798828, + "learning_rate": 5.118186001757102e-06, + "loss": 0.9281, + "step": 17787 + }, + { + "epoch": 2.23, + "grad_norm": 10.087285995483398, + "learning_rate": 5.1173492867004145e-06, + "loss": 0.6703, + "step": 17788 + }, + { + "epoch": 2.23, + "grad_norm": 15.53774356842041, + "learning_rate": 5.116512571643727e-06, + "loss": 1.8845, + "step": 17789 + }, + { + "epoch": 2.23, + "grad_norm": 18.932775497436523, + "learning_rate": 5.11567585658704e-06, + "loss": 1.1073, + "step": 17790 + }, + { + "epoch": 2.23, + "grad_norm": 12.49699592590332, + "learning_rate": 5.114839141530352e-06, + "loss": 0.9165, + "step": 17791 + }, + { + "epoch": 2.23, + "grad_norm": 19.66526222229004, + "learning_rate": 5.114002426473664e-06, + "loss": 1.3602, + "step": 17792 + }, + { + "epoch": 2.23, + "grad_norm": 12.239690780639648, + "learning_rate": 5.113165711416978e-06, + "loss": 1.5688, + "step": 17793 + }, + { + "epoch": 2.23, + "grad_norm": 24.5960750579834, + "learning_rate": 5.11232899636029e-06, + "loss": 1.767, + "step": 17794 + }, + { + "epoch": 2.23, + "grad_norm": 9.939472198486328, + "learning_rate": 5.111492281303602e-06, + "loss": 0.7966, + "step": 17795 + }, + { + "epoch": 2.23, + "grad_norm": 9.753925323486328, + "learning_rate": 5.110655566246915e-06, + "loss": 1.5595, + "step": 17796 + }, + { + "epoch": 2.23, + "grad_norm": 6.685403347015381, + "learning_rate": 5.109818851190228e-06, + "loss": 1.0454, + "step": 17797 + }, + { + "epoch": 2.23, + "grad_norm": 22.421031951904297, + "learning_rate": 5.10898213613354e-06, + "loss": 2.2144, + "step": 17798 + }, + { + "epoch": 2.23, + "grad_norm": 15.689521789550781, + "learning_rate": 5.108145421076852e-06, + "loss": 1.0526, + "step": 17799 + }, + { + "epoch": 2.23, + "grad_norm": 16.362619400024414, + "learning_rate": 5.1073087060201656e-06, + "loss": 0.7085, + "step": 17800 + }, + { + "epoch": 2.23, + "grad_norm": 20.69801139831543, + "learning_rate": 5.1064719909634775e-06, + "loss": 1.1504, + "step": 17801 + }, + { + "epoch": 2.23, + "grad_norm": 23.02520179748535, + "learning_rate": 5.10563527590679e-06, + "loss": 1.2343, + "step": 17802 + }, + { + "epoch": 2.23, + "grad_norm": 22.365503311157227, + "learning_rate": 5.104798560850103e-06, + "loss": 1.7095, + "step": 17803 + }, + { + "epoch": 2.23, + "grad_norm": 9.561039924621582, + "learning_rate": 5.103961845793416e-06, + "loss": 1.0846, + "step": 17804 + }, + { + "epoch": 2.23, + "grad_norm": 6.607800006866455, + "learning_rate": 5.103125130736728e-06, + "loss": 0.1923, + "step": 17805 + }, + { + "epoch": 2.23, + "grad_norm": 5.634875774383545, + "learning_rate": 5.10228841568004e-06, + "loss": 0.085, + "step": 17806 + }, + { + "epoch": 2.23, + "grad_norm": 23.58032989501953, + "learning_rate": 5.1014517006233535e-06, + "loss": 0.8119, + "step": 17807 + }, + { + "epoch": 2.23, + "grad_norm": 3.756401538848877, + "learning_rate": 5.1006149855666654e-06, + "loss": 0.4115, + "step": 17808 + }, + { + "epoch": 2.23, + "grad_norm": 22.091915130615234, + "learning_rate": 5.099778270509978e-06, + "loss": 0.7643, + "step": 17809 + }, + { + "epoch": 2.24, + "grad_norm": 26.409635543823242, + "learning_rate": 5.098941555453291e-06, + "loss": 0.7003, + "step": 17810 + }, + { + "epoch": 2.24, + "grad_norm": 12.126300811767578, + "learning_rate": 5.098104840396604e-06, + "loss": 0.2088, + "step": 17811 + }, + { + "epoch": 2.24, + "grad_norm": 10.77668571472168, + "learning_rate": 5.097268125339916e-06, + "loss": 1.3343, + "step": 17812 + }, + { + "epoch": 2.24, + "grad_norm": 21.273651123046875, + "learning_rate": 5.096431410283228e-06, + "loss": 0.7169, + "step": 17813 + }, + { + "epoch": 2.24, + "grad_norm": 6.164846420288086, + "learning_rate": 5.095594695226541e-06, + "loss": 1.7545, + "step": 17814 + }, + { + "epoch": 2.24, + "grad_norm": 33.088409423828125, + "learning_rate": 5.094757980169853e-06, + "loss": 1.7557, + "step": 17815 + }, + { + "epoch": 2.24, + "grad_norm": 12.935946464538574, + "learning_rate": 5.093921265113166e-06, + "loss": 0.72, + "step": 17816 + }, + { + "epoch": 2.24, + "grad_norm": 9.345338821411133, + "learning_rate": 5.093084550056478e-06, + "loss": 0.3246, + "step": 17817 + }, + { + "epoch": 2.24, + "grad_norm": 13.312705039978027, + "learning_rate": 5.092247834999792e-06, + "loss": 0.7129, + "step": 17818 + }, + { + "epoch": 2.24, + "grad_norm": 3.6184065341949463, + "learning_rate": 5.091411119943104e-06, + "loss": 0.2902, + "step": 17819 + }, + { + "epoch": 2.24, + "grad_norm": 21.613859176635742, + "learning_rate": 5.090574404886416e-06, + "loss": 0.3501, + "step": 17820 + }, + { + "epoch": 2.24, + "grad_norm": 7.444402694702148, + "learning_rate": 5.089737689829729e-06, + "loss": 0.1674, + "step": 17821 + }, + { + "epoch": 2.24, + "grad_norm": 102.95565795898438, + "learning_rate": 5.088900974773041e-06, + "loss": 0.5143, + "step": 17822 + }, + { + "epoch": 2.24, + "grad_norm": 3.1632742881774902, + "learning_rate": 5.088064259716354e-06, + "loss": 0.0502, + "step": 17823 + }, + { + "epoch": 2.24, + "grad_norm": 54.36214065551758, + "learning_rate": 5.087227544659666e-06, + "loss": 2.7835, + "step": 17824 + }, + { + "epoch": 2.24, + "grad_norm": 10.562241554260254, + "learning_rate": 5.08639082960298e-06, + "loss": 0.9218, + "step": 17825 + }, + { + "epoch": 2.24, + "grad_norm": 81.18700408935547, + "learning_rate": 5.085554114546292e-06, + "loss": 0.4029, + "step": 17826 + }, + { + "epoch": 2.24, + "grad_norm": 14.44067096710205, + "learning_rate": 5.084717399489604e-06, + "loss": 0.9139, + "step": 17827 + }, + { + "epoch": 2.24, + "grad_norm": 18.229602813720703, + "learning_rate": 5.083880684432917e-06, + "loss": 0.9942, + "step": 17828 + }, + { + "epoch": 2.24, + "grad_norm": 15.732873916625977, + "learning_rate": 5.083043969376229e-06, + "loss": 4.3611, + "step": 17829 + }, + { + "epoch": 2.24, + "grad_norm": 8.699427604675293, + "learning_rate": 5.082207254319542e-06, + "loss": 0.2181, + "step": 17830 + }, + { + "epoch": 2.24, + "grad_norm": 9.435542106628418, + "learning_rate": 5.081370539262854e-06, + "loss": 0.6497, + "step": 17831 + }, + { + "epoch": 2.24, + "grad_norm": 4.771695137023926, + "learning_rate": 5.080533824206168e-06, + "loss": 0.2273, + "step": 17832 + }, + { + "epoch": 2.24, + "grad_norm": 10.254509925842285, + "learning_rate": 5.0796971091494795e-06, + "loss": 1.5389, + "step": 17833 + }, + { + "epoch": 2.24, + "grad_norm": 25.043521881103516, + "learning_rate": 5.0788603940927915e-06, + "loss": 1.1398, + "step": 17834 + }, + { + "epoch": 2.24, + "grad_norm": 10.375333786010742, + "learning_rate": 5.078023679036105e-06, + "loss": 0.6474, + "step": 17835 + }, + { + "epoch": 2.24, + "grad_norm": 16.78009605407715, + "learning_rate": 5.077186963979417e-06, + "loss": 0.5516, + "step": 17836 + }, + { + "epoch": 2.24, + "grad_norm": 82.75853729248047, + "learning_rate": 5.07635024892273e-06, + "loss": 1.4797, + "step": 17837 + }, + { + "epoch": 2.24, + "grad_norm": 7.424945831298828, + "learning_rate": 5.075513533866042e-06, + "loss": 0.9605, + "step": 17838 + }, + { + "epoch": 2.24, + "grad_norm": 19.120899200439453, + "learning_rate": 5.0746768188093555e-06, + "loss": 1.8178, + "step": 17839 + }, + { + "epoch": 2.24, + "grad_norm": 26.920454025268555, + "learning_rate": 5.0738401037526675e-06, + "loss": 1.1617, + "step": 17840 + }, + { + "epoch": 2.24, + "grad_norm": 13.276774406433105, + "learning_rate": 5.073003388695979e-06, + "loss": 0.5867, + "step": 17841 + }, + { + "epoch": 2.24, + "grad_norm": 12.754081726074219, + "learning_rate": 5.072166673639293e-06, + "loss": 0.6544, + "step": 17842 + }, + { + "epoch": 2.24, + "grad_norm": 6.249405384063721, + "learning_rate": 5.071329958582605e-06, + "loss": 0.3336, + "step": 17843 + }, + { + "epoch": 2.24, + "grad_norm": 15.704106330871582, + "learning_rate": 5.070493243525918e-06, + "loss": 0.538, + "step": 17844 + }, + { + "epoch": 2.24, + "grad_norm": 11.90325927734375, + "learning_rate": 5.06965652846923e-06, + "loss": 1.2049, + "step": 17845 + }, + { + "epoch": 2.24, + "grad_norm": 29.243885040283203, + "learning_rate": 5.0688198134125434e-06, + "loss": 1.0439, + "step": 17846 + }, + { + "epoch": 2.24, + "grad_norm": 7.157782077789307, + "learning_rate": 5.067983098355855e-06, + "loss": 0.6835, + "step": 17847 + }, + { + "epoch": 2.24, + "grad_norm": 13.187911033630371, + "learning_rate": 5.067146383299167e-06, + "loss": 2.1351, + "step": 17848 + }, + { + "epoch": 2.24, + "grad_norm": 4.006730079650879, + "learning_rate": 5.066309668242481e-06, + "loss": 0.4027, + "step": 17849 + }, + { + "epoch": 2.24, + "grad_norm": 109.41995239257812, + "learning_rate": 5.065472953185793e-06, + "loss": 1.1113, + "step": 17850 + }, + { + "epoch": 2.24, + "grad_norm": 9.678338050842285, + "learning_rate": 5.064636238129106e-06, + "loss": 0.3227, + "step": 17851 + }, + { + "epoch": 2.24, + "grad_norm": 8.335624694824219, + "learning_rate": 5.063799523072418e-06, + "loss": 1.5265, + "step": 17852 + }, + { + "epoch": 2.24, + "grad_norm": 18.686553955078125, + "learning_rate": 5.062962808015731e-06, + "loss": 0.4259, + "step": 17853 + }, + { + "epoch": 2.24, + "grad_norm": 60.7696533203125, + "learning_rate": 5.062126092959043e-06, + "loss": 2.018, + "step": 17854 + }, + { + "epoch": 2.24, + "grad_norm": 10.57986831665039, + "learning_rate": 5.061289377902355e-06, + "loss": 1.7964, + "step": 17855 + }, + { + "epoch": 2.24, + "grad_norm": 7.243568420410156, + "learning_rate": 5.060452662845669e-06, + "loss": 0.5711, + "step": 17856 + }, + { + "epoch": 2.24, + "grad_norm": 28.93546485900879, + "learning_rate": 5.059615947788981e-06, + "loss": 1.0337, + "step": 17857 + }, + { + "epoch": 2.24, + "grad_norm": 7.0018815994262695, + "learning_rate": 5.058779232732294e-06, + "loss": 0.2486, + "step": 17858 + }, + { + "epoch": 2.24, + "grad_norm": 3.1246564388275146, + "learning_rate": 5.057942517675606e-06, + "loss": 0.2019, + "step": 17859 + }, + { + "epoch": 2.24, + "grad_norm": 7.868934154510498, + "learning_rate": 5.057105802618919e-06, + "loss": 0.3886, + "step": 17860 + }, + { + "epoch": 2.24, + "grad_norm": 177.0337677001953, + "learning_rate": 5.056269087562231e-06, + "loss": 1.9114, + "step": 17861 + }, + { + "epoch": 2.24, + "grad_norm": 28.44269561767578, + "learning_rate": 5.055432372505543e-06, + "loss": 0.7514, + "step": 17862 + }, + { + "epoch": 2.24, + "grad_norm": 23.441205978393555, + "learning_rate": 5.054595657448856e-06, + "loss": 1.4996, + "step": 17863 + }, + { + "epoch": 2.24, + "grad_norm": 19.890857696533203, + "learning_rate": 5.053758942392169e-06, + "loss": 2.2758, + "step": 17864 + }, + { + "epoch": 2.24, + "grad_norm": 29.43320655822754, + "learning_rate": 5.052922227335482e-06, + "loss": 2.0113, + "step": 17865 + }, + { + "epoch": 2.24, + "grad_norm": 43.13685989379883, + "learning_rate": 5.0520855122787935e-06, + "loss": 2.4186, + "step": 17866 + }, + { + "epoch": 2.24, + "grad_norm": 14.734983444213867, + "learning_rate": 5.051248797222107e-06, + "loss": 0.7916, + "step": 17867 + }, + { + "epoch": 2.24, + "grad_norm": 9.410062789916992, + "learning_rate": 5.050412082165419e-06, + "loss": 0.6375, + "step": 17868 + }, + { + "epoch": 2.24, + "grad_norm": 50.64573669433594, + "learning_rate": 5.049575367108731e-06, + "loss": 1.6407, + "step": 17869 + }, + { + "epoch": 2.24, + "grad_norm": 20.77255630493164, + "learning_rate": 5.048738652052044e-06, + "loss": 1.1511, + "step": 17870 + }, + { + "epoch": 2.24, + "grad_norm": 54.63801956176758, + "learning_rate": 5.047901936995357e-06, + "loss": 2.2643, + "step": 17871 + }, + { + "epoch": 2.24, + "grad_norm": 8.90231704711914, + "learning_rate": 5.0470652219386695e-06, + "loss": 0.6662, + "step": 17872 + }, + { + "epoch": 2.24, + "grad_norm": 9.863375663757324, + "learning_rate": 5.0462285068819815e-06, + "loss": 2.183, + "step": 17873 + }, + { + "epoch": 2.24, + "grad_norm": 9.324417114257812, + "learning_rate": 5.045391791825295e-06, + "loss": 1.1831, + "step": 17874 + }, + { + "epoch": 2.24, + "grad_norm": 41.506832122802734, + "learning_rate": 5.044555076768607e-06, + "loss": 1.467, + "step": 17875 + }, + { + "epoch": 2.24, + "grad_norm": 5.597385406494141, + "learning_rate": 5.043718361711919e-06, + "loss": 0.6364, + "step": 17876 + }, + { + "epoch": 2.24, + "grad_norm": 11.700260162353516, + "learning_rate": 5.042881646655232e-06, + "loss": 0.211, + "step": 17877 + }, + { + "epoch": 2.24, + "grad_norm": 9.609248161315918, + "learning_rate": 5.042044931598545e-06, + "loss": 0.5709, + "step": 17878 + }, + { + "epoch": 2.24, + "grad_norm": 44.30210876464844, + "learning_rate": 5.041208216541857e-06, + "loss": 2.3704, + "step": 17879 + }, + { + "epoch": 2.24, + "grad_norm": 39.628448486328125, + "learning_rate": 5.040371501485169e-06, + "loss": 2.082, + "step": 17880 + }, + { + "epoch": 2.24, + "grad_norm": 6.445079326629639, + "learning_rate": 5.039534786428483e-06, + "loss": 0.2678, + "step": 17881 + }, + { + "epoch": 2.24, + "grad_norm": 5.163549900054932, + "learning_rate": 5.038698071371795e-06, + "loss": 0.198, + "step": 17882 + }, + { + "epoch": 2.24, + "grad_norm": 7.012538433074951, + "learning_rate": 5.037861356315107e-06, + "loss": 0.2496, + "step": 17883 + }, + { + "epoch": 2.24, + "grad_norm": 2.5808498859405518, + "learning_rate": 5.03702464125842e-06, + "loss": 0.2042, + "step": 17884 + }, + { + "epoch": 2.24, + "grad_norm": 18.245594024658203, + "learning_rate": 5.0361879262017325e-06, + "loss": 0.9452, + "step": 17885 + }, + { + "epoch": 2.24, + "grad_norm": 4.119741916656494, + "learning_rate": 5.035351211145045e-06, + "loss": 0.9425, + "step": 17886 + }, + { + "epoch": 2.24, + "grad_norm": 50.56310272216797, + "learning_rate": 5.034514496088357e-06, + "loss": 3.0142, + "step": 17887 + }, + { + "epoch": 2.24, + "grad_norm": 173.828125, + "learning_rate": 5.033677781031671e-06, + "loss": 1.614, + "step": 17888 + }, + { + "epoch": 2.25, + "grad_norm": 38.569576263427734, + "learning_rate": 5.032841065974983e-06, + "loss": 1.5961, + "step": 17889 + }, + { + "epoch": 2.25, + "grad_norm": 25.682310104370117, + "learning_rate": 5.032004350918295e-06, + "loss": 2.4878, + "step": 17890 + }, + { + "epoch": 2.25, + "grad_norm": 6.593906879425049, + "learning_rate": 5.031167635861608e-06, + "loss": 0.8402, + "step": 17891 + }, + { + "epoch": 2.25, + "grad_norm": 12.642677307128906, + "learning_rate": 5.03033092080492e-06, + "loss": 0.8541, + "step": 17892 + }, + { + "epoch": 2.25, + "grad_norm": 7.8724365234375, + "learning_rate": 5.029494205748233e-06, + "loss": 0.5848, + "step": 17893 + }, + { + "epoch": 2.25, + "grad_norm": 21.33145523071289, + "learning_rate": 5.028657490691545e-06, + "loss": 1.9971, + "step": 17894 + }, + { + "epoch": 2.25, + "grad_norm": 38.787960052490234, + "learning_rate": 5.027820775634859e-06, + "loss": 1.665, + "step": 17895 + }, + { + "epoch": 2.25, + "grad_norm": 11.145357131958008, + "learning_rate": 5.026984060578171e-06, + "loss": 0.8777, + "step": 17896 + }, + { + "epoch": 2.25, + "grad_norm": 14.004895210266113, + "learning_rate": 5.026147345521483e-06, + "loss": 0.5583, + "step": 17897 + }, + { + "epoch": 2.25, + "grad_norm": 68.97599029541016, + "learning_rate": 5.0253106304647956e-06, + "loss": 0.5841, + "step": 17898 + }, + { + "epoch": 2.25, + "grad_norm": 7.229062557220459, + "learning_rate": 5.0244739154081075e-06, + "loss": 0.452, + "step": 17899 + }, + { + "epoch": 2.25, + "grad_norm": 22.491193771362305, + "learning_rate": 5.023637200351421e-06, + "loss": 1.6259, + "step": 17900 + }, + { + "epoch": 2.25, + "grad_norm": 94.6976318359375, + "learning_rate": 5.022800485294733e-06, + "loss": 2.652, + "step": 17901 + }, + { + "epoch": 2.25, + "grad_norm": 13.50717830657959, + "learning_rate": 5.021963770238047e-06, + "loss": 1.7198, + "step": 17902 + }, + { + "epoch": 2.25, + "grad_norm": 3.883352279663086, + "learning_rate": 5.021127055181359e-06, + "loss": 0.2975, + "step": 17903 + }, + { + "epoch": 2.25, + "grad_norm": 6.9274001121521, + "learning_rate": 5.020290340124671e-06, + "loss": 0.4971, + "step": 17904 + }, + { + "epoch": 2.25, + "grad_norm": 23.261295318603516, + "learning_rate": 5.0194536250679835e-06, + "loss": 1.2112, + "step": 17905 + }, + { + "epoch": 2.25, + "grad_norm": 10.829798698425293, + "learning_rate": 5.0186169100112954e-06, + "loss": 1.3173, + "step": 17906 + }, + { + "epoch": 2.25, + "grad_norm": 2.5662546157836914, + "learning_rate": 5.017780194954609e-06, + "loss": 0.1248, + "step": 17907 + }, + { + "epoch": 2.25, + "grad_norm": 24.686922073364258, + "learning_rate": 5.016943479897921e-06, + "loss": 2.8531, + "step": 17908 + }, + { + "epoch": 2.25, + "grad_norm": 19.223573684692383, + "learning_rate": 5.016106764841234e-06, + "loss": 1.7334, + "step": 17909 + }, + { + "epoch": 2.25, + "grad_norm": 12.839527130126953, + "learning_rate": 5.015270049784547e-06, + "loss": 0.7207, + "step": 17910 + }, + { + "epoch": 2.25, + "grad_norm": 4.787776947021484, + "learning_rate": 5.014433334727859e-06, + "loss": 0.3833, + "step": 17911 + }, + { + "epoch": 2.25, + "grad_norm": 3.333447217941284, + "learning_rate": 5.013596619671171e-06, + "loss": 0.1528, + "step": 17912 + }, + { + "epoch": 2.25, + "grad_norm": 19.80486297607422, + "learning_rate": 5.012759904614483e-06, + "loss": 0.7569, + "step": 17913 + }, + { + "epoch": 2.25, + "grad_norm": 10.29593276977539, + "learning_rate": 5.011923189557797e-06, + "loss": 0.3161, + "step": 17914 + }, + { + "epoch": 2.25, + "grad_norm": 9.504191398620605, + "learning_rate": 5.011086474501109e-06, + "loss": 0.5831, + "step": 17915 + }, + { + "epoch": 2.25, + "grad_norm": 16.78968620300293, + "learning_rate": 5.010249759444422e-06, + "loss": 0.9673, + "step": 17916 + }, + { + "epoch": 2.25, + "grad_norm": 9.07922649383545, + "learning_rate": 5.0094130443877346e-06, + "loss": 0.533, + "step": 17917 + }, + { + "epoch": 2.25, + "grad_norm": 8.133056640625, + "learning_rate": 5.0085763293310465e-06, + "loss": 0.5087, + "step": 17918 + }, + { + "epoch": 2.25, + "grad_norm": 60.7887077331543, + "learning_rate": 5.007739614274359e-06, + "loss": 1.2224, + "step": 17919 + }, + { + "epoch": 2.25, + "grad_norm": 16.089384078979492, + "learning_rate": 5.006902899217671e-06, + "loss": 2.1645, + "step": 17920 + }, + { + "epoch": 2.25, + "grad_norm": 10.402376174926758, + "learning_rate": 5.006066184160985e-06, + "loss": 0.9864, + "step": 17921 + }, + { + "epoch": 2.25, + "grad_norm": 3.572499990463257, + "learning_rate": 5.005229469104297e-06, + "loss": 0.348, + "step": 17922 + }, + { + "epoch": 2.25, + "grad_norm": 11.185772895812988, + "learning_rate": 5.00439275404761e-06, + "loss": 1.6682, + "step": 17923 + }, + { + "epoch": 2.25, + "grad_norm": 13.980674743652344, + "learning_rate": 5.0035560389909225e-06, + "loss": 0.6962, + "step": 17924 + }, + { + "epoch": 2.25, + "grad_norm": 19.253629684448242, + "learning_rate": 5.0027193239342344e-06, + "loss": 1.5223, + "step": 17925 + }, + { + "epoch": 2.25, + "grad_norm": 9.644561767578125, + "learning_rate": 5.001882608877547e-06, + "loss": 0.3699, + "step": 17926 + }, + { + "epoch": 2.25, + "grad_norm": 19.637935638427734, + "learning_rate": 5.001045893820859e-06, + "loss": 0.4528, + "step": 17927 + }, + { + "epoch": 2.25, + "grad_norm": 47.7040901184082, + "learning_rate": 5.000209178764173e-06, + "loss": 1.1264, + "step": 17928 + }, + { + "epoch": 2.25, + "grad_norm": 6.8891191482543945, + "learning_rate": 4.999372463707485e-06, + "loss": 0.5625, + "step": 17929 + }, + { + "epoch": 2.25, + "grad_norm": 26.81926155090332, + "learning_rate": 4.998535748650798e-06, + "loss": 2.1966, + "step": 17930 + }, + { + "epoch": 2.25, + "grad_norm": 29.25107765197754, + "learning_rate": 4.99769903359411e-06, + "loss": 3.0911, + "step": 17931 + }, + { + "epoch": 2.25, + "grad_norm": 4.49907922744751, + "learning_rate": 4.996862318537422e-06, + "loss": 1.1366, + "step": 17932 + }, + { + "epoch": 2.25, + "grad_norm": 7.054262161254883, + "learning_rate": 4.996025603480735e-06, + "loss": 0.2225, + "step": 17933 + }, + { + "epoch": 2.25, + "grad_norm": 36.04946517944336, + "learning_rate": 4.995188888424047e-06, + "loss": 1.2687, + "step": 17934 + }, + { + "epoch": 2.25, + "grad_norm": 11.313431739807129, + "learning_rate": 4.99435217336736e-06, + "loss": 0.4309, + "step": 17935 + }, + { + "epoch": 2.25, + "grad_norm": 5.431252956390381, + "learning_rate": 4.993515458310673e-06, + "loss": 0.3986, + "step": 17936 + }, + { + "epoch": 2.25, + "grad_norm": 8.059343338012695, + "learning_rate": 4.9926787432539855e-06, + "loss": 0.7262, + "step": 17937 + }, + { + "epoch": 2.25, + "grad_norm": 25.366069793701172, + "learning_rate": 4.991842028197298e-06, + "loss": 0.6772, + "step": 17938 + }, + { + "epoch": 2.25, + "grad_norm": 4.172663688659668, + "learning_rate": 4.99100531314061e-06, + "loss": 0.1793, + "step": 17939 + }, + { + "epoch": 2.25, + "grad_norm": 39.05305862426758, + "learning_rate": 4.990168598083923e-06, + "loss": 0.9276, + "step": 17940 + }, + { + "epoch": 2.25, + "grad_norm": 6.8393964767456055, + "learning_rate": 4.989331883027235e-06, + "loss": 0.678, + "step": 17941 + }, + { + "epoch": 2.25, + "grad_norm": 11.347168922424316, + "learning_rate": 4.988495167970548e-06, + "loss": 0.9141, + "step": 17942 + }, + { + "epoch": 2.25, + "grad_norm": 20.5839900970459, + "learning_rate": 4.987658452913861e-06, + "loss": 1.2812, + "step": 17943 + }, + { + "epoch": 2.25, + "grad_norm": 3.2079124450683594, + "learning_rate": 4.9868217378571734e-06, + "loss": 0.2727, + "step": 17944 + }, + { + "epoch": 2.25, + "grad_norm": 25.391883850097656, + "learning_rate": 4.985985022800485e-06, + "loss": 0.7583, + "step": 17945 + }, + { + "epoch": 2.25, + "grad_norm": 6.606420040130615, + "learning_rate": 4.985148307743798e-06, + "loss": 0.329, + "step": 17946 + }, + { + "epoch": 2.25, + "grad_norm": 16.52248764038086, + "learning_rate": 4.984311592687111e-06, + "loss": 0.7837, + "step": 17947 + }, + { + "epoch": 2.25, + "grad_norm": 7.127424240112305, + "learning_rate": 4.983474877630423e-06, + "loss": 0.43, + "step": 17948 + }, + { + "epoch": 2.25, + "grad_norm": 8.710345268249512, + "learning_rate": 4.982638162573736e-06, + "loss": 1.8954, + "step": 17949 + }, + { + "epoch": 2.25, + "grad_norm": 11.40097427368164, + "learning_rate": 4.9818014475170486e-06, + "loss": 0.3691, + "step": 17950 + }, + { + "epoch": 2.25, + "grad_norm": 96.28443145751953, + "learning_rate": 4.980964732460361e-06, + "loss": 1.2264, + "step": 17951 + }, + { + "epoch": 2.25, + "grad_norm": 57.5449333190918, + "learning_rate": 4.980128017403673e-06, + "loss": 1.812, + "step": 17952 + }, + { + "epoch": 2.25, + "grad_norm": 24.60512351989746, + "learning_rate": 4.979291302346986e-06, + "loss": 2.532, + "step": 17953 + }, + { + "epoch": 2.25, + "grad_norm": 18.563039779663086, + "learning_rate": 4.978454587290299e-06, + "loss": 1.1142, + "step": 17954 + }, + { + "epoch": 2.25, + "grad_norm": 27.25714683532715, + "learning_rate": 4.977617872233611e-06, + "loss": 0.6761, + "step": 17955 + }, + { + "epoch": 2.25, + "grad_norm": 18.81361961364746, + "learning_rate": 4.976781157176924e-06, + "loss": 0.9452, + "step": 17956 + }, + { + "epoch": 2.25, + "grad_norm": 3.5752861499786377, + "learning_rate": 4.9759444421202365e-06, + "loss": 0.4309, + "step": 17957 + }, + { + "epoch": 2.25, + "grad_norm": 10.082378387451172, + "learning_rate": 4.975107727063549e-06, + "loss": 0.7022, + "step": 17958 + }, + { + "epoch": 2.25, + "grad_norm": 22.070344924926758, + "learning_rate": 4.974271012006861e-06, + "loss": 1.0242, + "step": 17959 + }, + { + "epoch": 2.25, + "grad_norm": 28.86927604675293, + "learning_rate": 4.973434296950174e-06, + "loss": 2.25, + "step": 17960 + }, + { + "epoch": 2.25, + "grad_norm": 13.019451141357422, + "learning_rate": 4.972597581893487e-06, + "loss": 0.7717, + "step": 17961 + }, + { + "epoch": 2.25, + "grad_norm": 10.226906776428223, + "learning_rate": 4.971760866836799e-06, + "loss": 1.1278, + "step": 17962 + }, + { + "epoch": 2.25, + "grad_norm": 8.096986770629883, + "learning_rate": 4.970924151780112e-06, + "loss": 0.7276, + "step": 17963 + }, + { + "epoch": 2.25, + "grad_norm": 2.092785596847534, + "learning_rate": 4.970087436723424e-06, + "loss": 0.0536, + "step": 17964 + }, + { + "epoch": 2.25, + "grad_norm": 10.456246376037598, + "learning_rate": 4.969250721666737e-06, + "loss": 0.5507, + "step": 17965 + }, + { + "epoch": 2.25, + "grad_norm": 17.064895629882812, + "learning_rate": 4.968414006610049e-06, + "loss": 1.1995, + "step": 17966 + }, + { + "epoch": 2.25, + "grad_norm": 113.00723266601562, + "learning_rate": 4.967577291553362e-06, + "loss": 1.3712, + "step": 17967 + }, + { + "epoch": 2.25, + "grad_norm": 10.568375587463379, + "learning_rate": 4.966740576496675e-06, + "loss": 0.4062, + "step": 17968 + }, + { + "epoch": 2.26, + "grad_norm": 12.843826293945312, + "learning_rate": 4.965903861439987e-06, + "loss": 1.7527, + "step": 17969 + }, + { + "epoch": 2.26, + "grad_norm": 18.707082748413086, + "learning_rate": 4.9650671463832995e-06, + "loss": 0.6466, + "step": 17970 + }, + { + "epoch": 2.26, + "grad_norm": 8.290353775024414, + "learning_rate": 4.964230431326612e-06, + "loss": 0.3877, + "step": 17971 + }, + { + "epoch": 2.26, + "grad_norm": 7.575691223144531, + "learning_rate": 4.963393716269925e-06, + "loss": 0.4287, + "step": 17972 + }, + { + "epoch": 2.26, + "grad_norm": 5.154853820800781, + "learning_rate": 4.962557001213237e-06, + "loss": 0.2588, + "step": 17973 + }, + { + "epoch": 2.26, + "grad_norm": 23.596227645874023, + "learning_rate": 4.96172028615655e-06, + "loss": 1.1944, + "step": 17974 + }, + { + "epoch": 2.26, + "grad_norm": 54.577293395996094, + "learning_rate": 4.960883571099862e-06, + "loss": 2.0033, + "step": 17975 + }, + { + "epoch": 2.26, + "grad_norm": 17.267406463623047, + "learning_rate": 4.960046856043175e-06, + "loss": 1.2169, + "step": 17976 + }, + { + "epoch": 2.26, + "grad_norm": 8.010673522949219, + "learning_rate": 4.9592101409864874e-06, + "loss": 0.35, + "step": 17977 + }, + { + "epoch": 2.26, + "grad_norm": 22.85257911682129, + "learning_rate": 4.9583734259298e-06, + "loss": 1.5214, + "step": 17978 + }, + { + "epoch": 2.26, + "grad_norm": 14.447754859924316, + "learning_rate": 4.957536710873113e-06, + "loss": 0.3605, + "step": 17979 + }, + { + "epoch": 2.26, + "grad_norm": 18.63277244567871, + "learning_rate": 4.956699995816425e-06, + "loss": 0.969, + "step": 17980 + }, + { + "epoch": 2.26, + "grad_norm": 7.11646032333374, + "learning_rate": 4.955863280759738e-06, + "loss": 0.2212, + "step": 17981 + }, + { + "epoch": 2.26, + "grad_norm": 167.06617736816406, + "learning_rate": 4.95502656570305e-06, + "loss": 0.998, + "step": 17982 + }, + { + "epoch": 2.26, + "grad_norm": 17.073394775390625, + "learning_rate": 4.9541898506463625e-06, + "loss": 2.1347, + "step": 17983 + }, + { + "epoch": 2.26, + "grad_norm": 31.762836456298828, + "learning_rate": 4.953353135589675e-06, + "loss": 1.343, + "step": 17984 + }, + { + "epoch": 2.26, + "grad_norm": 7.8832173347473145, + "learning_rate": 4.952516420532988e-06, + "loss": 0.7458, + "step": 17985 + }, + { + "epoch": 2.26, + "grad_norm": 3.58601713180542, + "learning_rate": 4.9516797054763e-06, + "loss": 0.5316, + "step": 17986 + }, + { + "epoch": 2.26, + "grad_norm": 29.055683135986328, + "learning_rate": 4.950842990419613e-06, + "loss": 0.6991, + "step": 17987 + }, + { + "epoch": 2.26, + "grad_norm": 4.409537315368652, + "learning_rate": 4.950006275362926e-06, + "loss": 0.2481, + "step": 17988 + }, + { + "epoch": 2.26, + "grad_norm": 10.621453285217285, + "learning_rate": 4.949169560306238e-06, + "loss": 1.452, + "step": 17989 + }, + { + "epoch": 2.26, + "grad_norm": 174.6796112060547, + "learning_rate": 4.9483328452495505e-06, + "loss": 1.3904, + "step": 17990 + }, + { + "epoch": 2.26, + "grad_norm": 14.097359657287598, + "learning_rate": 4.947496130192863e-06, + "loss": 1.0402, + "step": 17991 + }, + { + "epoch": 2.26, + "grad_norm": 18.349971771240234, + "learning_rate": 4.946659415136176e-06, + "loss": 1.0192, + "step": 17992 + }, + { + "epoch": 2.26, + "grad_norm": 12.767474174499512, + "learning_rate": 4.945822700079488e-06, + "loss": 0.7705, + "step": 17993 + }, + { + "epoch": 2.26, + "grad_norm": 14.867411613464355, + "learning_rate": 4.944985985022801e-06, + "loss": 1.523, + "step": 17994 + }, + { + "epoch": 2.26, + "grad_norm": 97.9448471069336, + "learning_rate": 4.944149269966114e-06, + "loss": 2.2483, + "step": 17995 + }, + { + "epoch": 2.26, + "grad_norm": 8.423328399658203, + "learning_rate": 4.943312554909426e-06, + "loss": 0.2358, + "step": 17996 + }, + { + "epoch": 2.26, + "grad_norm": 13.076930046081543, + "learning_rate": 4.942475839852738e-06, + "loss": 0.1316, + "step": 17997 + }, + { + "epoch": 2.26, + "grad_norm": 13.969695091247559, + "learning_rate": 4.941639124796051e-06, + "loss": 1.0452, + "step": 17998 + }, + { + "epoch": 2.26, + "grad_norm": 4.603994846343994, + "learning_rate": 4.940802409739364e-06, + "loss": 0.2369, + "step": 17999 + }, + { + "epoch": 2.26, + "grad_norm": 25.964096069335938, + "learning_rate": 4.939965694682676e-06, + "loss": 1.8896, + "step": 18000 + }, + { + "epoch": 2.26, + "eval_loss": 0.07683395594358444, + "eval_runtime": 94.3131, + "eval_samples_per_second": 37.556, + "eval_steps_per_second": 37.556, + "step": 18000 + }, + { + "epoch": 2.26, + "grad_norm": 11.208454132080078, + "learning_rate": 4.939128979625989e-06, + "loss": 0.6867, + "step": 18001 + }, + { + "epoch": 2.26, + "grad_norm": 17.421865463256836, + "learning_rate": 4.9382922645693015e-06, + "loss": 1.2619, + "step": 18002 + }, + { + "epoch": 2.26, + "grad_norm": 10.385871887207031, + "learning_rate": 4.9374555495126135e-06, + "loss": 0.9416, + "step": 18003 + }, + { + "epoch": 2.26, + "grad_norm": 12.817423820495605, + "learning_rate": 4.936618834455926e-06, + "loss": 2.2117, + "step": 18004 + }, + { + "epoch": 2.26, + "grad_norm": 35.08500671386719, + "learning_rate": 4.935782119399239e-06, + "loss": 2.569, + "step": 18005 + }, + { + "epoch": 2.26, + "grad_norm": 22.860774993896484, + "learning_rate": 4.934945404342552e-06, + "loss": 0.8472, + "step": 18006 + }, + { + "epoch": 2.26, + "grad_norm": 7.265113830566406, + "learning_rate": 4.934108689285864e-06, + "loss": 2.1218, + "step": 18007 + }, + { + "epoch": 2.26, + "grad_norm": 11.792815208435059, + "learning_rate": 4.933271974229177e-06, + "loss": 1.3434, + "step": 18008 + }, + { + "epoch": 2.26, + "grad_norm": 5.4329118728637695, + "learning_rate": 4.9324352591724895e-06, + "loss": 0.4935, + "step": 18009 + }, + { + "epoch": 2.26, + "grad_norm": 26.49026107788086, + "learning_rate": 4.931598544115801e-06, + "loss": 1.1913, + "step": 18010 + }, + { + "epoch": 2.26, + "grad_norm": 20.838491439819336, + "learning_rate": 4.930761829059114e-06, + "loss": 1.0779, + "step": 18011 + }, + { + "epoch": 2.26, + "grad_norm": 5.645506858825684, + "learning_rate": 4.929925114002427e-06, + "loss": 0.3584, + "step": 18012 + }, + { + "epoch": 2.26, + "grad_norm": 3.891331195831299, + "learning_rate": 4.92908839894574e-06, + "loss": 0.1227, + "step": 18013 + }, + { + "epoch": 2.26, + "grad_norm": 1.6347407102584839, + "learning_rate": 4.928251683889052e-06, + "loss": 0.0455, + "step": 18014 + }, + { + "epoch": 2.26, + "grad_norm": 16.91248321533203, + "learning_rate": 4.927414968832365e-06, + "loss": 0.8666, + "step": 18015 + }, + { + "epoch": 2.26, + "grad_norm": 33.073455810546875, + "learning_rate": 4.926578253775677e-06, + "loss": 1.8501, + "step": 18016 + }, + { + "epoch": 2.26, + "grad_norm": 14.698752403259277, + "learning_rate": 4.925741538718989e-06, + "loss": 0.5226, + "step": 18017 + }, + { + "epoch": 2.26, + "grad_norm": 41.15288162231445, + "learning_rate": 4.924904823662302e-06, + "loss": 0.4485, + "step": 18018 + }, + { + "epoch": 2.26, + "grad_norm": 33.41960525512695, + "learning_rate": 4.924068108605615e-06, + "loss": 0.9904, + "step": 18019 + }, + { + "epoch": 2.26, + "grad_norm": 28.272079467773438, + "learning_rate": 4.923231393548928e-06, + "loss": 0.3973, + "step": 18020 + }, + { + "epoch": 2.26, + "grad_norm": 12.279377937316895, + "learning_rate": 4.92239467849224e-06, + "loss": 0.4734, + "step": 18021 + }, + { + "epoch": 2.26, + "grad_norm": 13.807548522949219, + "learning_rate": 4.9215579634355525e-06, + "loss": 0.1609, + "step": 18022 + }, + { + "epoch": 2.26, + "grad_norm": 5.790491580963135, + "learning_rate": 4.920721248378865e-06, + "loss": 0.2628, + "step": 18023 + }, + { + "epoch": 2.26, + "grad_norm": 30.30073356628418, + "learning_rate": 4.919884533322177e-06, + "loss": 1.6022, + "step": 18024 + }, + { + "epoch": 2.26, + "grad_norm": 21.043136596679688, + "learning_rate": 4.91904781826549e-06, + "loss": 2.1917, + "step": 18025 + }, + { + "epoch": 2.26, + "grad_norm": 15.728610038757324, + "learning_rate": 4.918211103208803e-06, + "loss": 0.8905, + "step": 18026 + }, + { + "epoch": 2.26, + "grad_norm": 9.309231758117676, + "learning_rate": 4.917374388152115e-06, + "loss": 0.5748, + "step": 18027 + }, + { + "epoch": 2.26, + "grad_norm": 18.681480407714844, + "learning_rate": 4.916537673095428e-06, + "loss": 1.3546, + "step": 18028 + }, + { + "epoch": 2.26, + "grad_norm": 9.547511100769043, + "learning_rate": 4.91570095803874e-06, + "loss": 0.7077, + "step": 18029 + }, + { + "epoch": 2.26, + "grad_norm": 16.273372650146484, + "learning_rate": 4.914864242982053e-06, + "loss": 1.1192, + "step": 18030 + }, + { + "epoch": 2.26, + "grad_norm": 17.370267868041992, + "learning_rate": 4.914027527925365e-06, + "loss": 0.7352, + "step": 18031 + }, + { + "epoch": 2.26, + "grad_norm": 7.943282127380371, + "learning_rate": 4.913190812868678e-06, + "loss": 1.6347, + "step": 18032 + }, + { + "epoch": 2.26, + "grad_norm": 11.803362846374512, + "learning_rate": 4.912354097811991e-06, + "loss": 0.6748, + "step": 18033 + }, + { + "epoch": 2.26, + "grad_norm": 11.606377601623535, + "learning_rate": 4.911517382755303e-06, + "loss": 1.2015, + "step": 18034 + }, + { + "epoch": 2.26, + "grad_norm": 36.61336135864258, + "learning_rate": 4.9106806676986155e-06, + "loss": 1.6687, + "step": 18035 + }, + { + "epoch": 2.26, + "grad_norm": 43.68743896484375, + "learning_rate": 4.909843952641928e-06, + "loss": 1.9454, + "step": 18036 + }, + { + "epoch": 2.26, + "grad_norm": 18.23581886291504, + "learning_rate": 4.909007237585241e-06, + "loss": 0.4798, + "step": 18037 + }, + { + "epoch": 2.26, + "grad_norm": 15.893588066101074, + "learning_rate": 4.908170522528553e-06, + "loss": 1.0806, + "step": 18038 + }, + { + "epoch": 2.26, + "grad_norm": 18.41132164001465, + "learning_rate": 4.907333807471866e-06, + "loss": 1.9558, + "step": 18039 + }, + { + "epoch": 2.26, + "grad_norm": 17.931055068969727, + "learning_rate": 4.906497092415179e-06, + "loss": 0.9484, + "step": 18040 + }, + { + "epoch": 2.26, + "grad_norm": 15.66031265258789, + "learning_rate": 4.905660377358491e-06, + "loss": 0.4607, + "step": 18041 + }, + { + "epoch": 2.26, + "grad_norm": 14.757575035095215, + "learning_rate": 4.9048236623018034e-06, + "loss": 1.4596, + "step": 18042 + }, + { + "epoch": 2.26, + "grad_norm": 17.42569351196289, + "learning_rate": 4.903986947245116e-06, + "loss": 1.3227, + "step": 18043 + }, + { + "epoch": 2.26, + "grad_norm": 28.305334091186523, + "learning_rate": 4.903150232188429e-06, + "loss": 1.1253, + "step": 18044 + }, + { + "epoch": 2.26, + "grad_norm": 15.629451751708984, + "learning_rate": 4.902313517131741e-06, + "loss": 1.5697, + "step": 18045 + }, + { + "epoch": 2.26, + "grad_norm": 11.656510353088379, + "learning_rate": 4.901476802075054e-06, + "loss": 1.5753, + "step": 18046 + }, + { + "epoch": 2.26, + "grad_norm": 12.482386589050293, + "learning_rate": 4.900640087018367e-06, + "loss": 0.7014, + "step": 18047 + }, + { + "epoch": 2.26, + "grad_norm": 7.773298263549805, + "learning_rate": 4.8998033719616786e-06, + "loss": 0.4116, + "step": 18048 + }, + { + "epoch": 2.27, + "grad_norm": 9.567391395568848, + "learning_rate": 4.898966656904991e-06, + "loss": 0.2018, + "step": 18049 + }, + { + "epoch": 2.27, + "grad_norm": 724.5193481445312, + "learning_rate": 4.898129941848304e-06, + "loss": 0.6798, + "step": 18050 + }, + { + "epoch": 2.27, + "grad_norm": 6.985085487365723, + "learning_rate": 4.897293226791617e-06, + "loss": 0.6674, + "step": 18051 + }, + { + "epoch": 2.27, + "grad_norm": 20.335052490234375, + "learning_rate": 4.896456511734929e-06, + "loss": 0.6555, + "step": 18052 + }, + { + "epoch": 2.27, + "grad_norm": 16.140939712524414, + "learning_rate": 4.895619796678242e-06, + "loss": 1.3869, + "step": 18053 + }, + { + "epoch": 2.27, + "grad_norm": 13.487977027893066, + "learning_rate": 4.8947830816215545e-06, + "loss": 0.9246, + "step": 18054 + }, + { + "epoch": 2.27, + "grad_norm": 31.696489334106445, + "learning_rate": 4.8939463665648665e-06, + "loss": 1.5, + "step": 18055 + }, + { + "epoch": 2.27, + "grad_norm": 7.162998676300049, + "learning_rate": 4.893109651508179e-06, + "loss": 1.1, + "step": 18056 + }, + { + "epoch": 2.27, + "grad_norm": 17.811870574951172, + "learning_rate": 4.892272936451492e-06, + "loss": 1.477, + "step": 18057 + }, + { + "epoch": 2.27, + "grad_norm": 14.948601722717285, + "learning_rate": 4.891436221394805e-06, + "loss": 0.6962, + "step": 18058 + }, + { + "epoch": 2.27, + "grad_norm": 20.97901725769043, + "learning_rate": 4.890599506338117e-06, + "loss": 2.3629, + "step": 18059 + }, + { + "epoch": 2.27, + "grad_norm": 38.151309967041016, + "learning_rate": 4.88976279128143e-06, + "loss": 0.5859, + "step": 18060 + }, + { + "epoch": 2.27, + "grad_norm": 29.360761642456055, + "learning_rate": 4.888926076224742e-06, + "loss": 2.9119, + "step": 18061 + }, + { + "epoch": 2.27, + "grad_norm": 5.248826503753662, + "learning_rate": 4.888089361168054e-06, + "loss": 0.2653, + "step": 18062 + }, + { + "epoch": 2.27, + "grad_norm": 18.261001586914062, + "learning_rate": 4.887252646111367e-06, + "loss": 1.687, + "step": 18063 + }, + { + "epoch": 2.27, + "grad_norm": 4.85291862487793, + "learning_rate": 4.88641593105468e-06, + "loss": 0.2157, + "step": 18064 + }, + { + "epoch": 2.27, + "grad_norm": 8.904922485351562, + "learning_rate": 4.885579215997992e-06, + "loss": 0.2832, + "step": 18065 + }, + { + "epoch": 2.27, + "grad_norm": 36.32822799682617, + "learning_rate": 4.884742500941305e-06, + "loss": 1.5748, + "step": 18066 + }, + { + "epoch": 2.27, + "grad_norm": 7.538064479827881, + "learning_rate": 4.8839057858846176e-06, + "loss": 1.3308, + "step": 18067 + }, + { + "epoch": 2.27, + "grad_norm": 6.013849258422852, + "learning_rate": 4.8830690708279295e-06, + "loss": 0.3816, + "step": 18068 + }, + { + "epoch": 2.27, + "grad_norm": 53.89246368408203, + "learning_rate": 4.882232355771242e-06, + "loss": 4.6914, + "step": 18069 + }, + { + "epoch": 2.27, + "grad_norm": 8.806126594543457, + "learning_rate": 4.881395640714555e-06, + "loss": 0.7824, + "step": 18070 + }, + { + "epoch": 2.27, + "grad_norm": 49.080078125, + "learning_rate": 4.880558925657868e-06, + "loss": 1.7653, + "step": 18071 + }, + { + "epoch": 2.27, + "grad_norm": 51.791839599609375, + "learning_rate": 4.87972221060118e-06, + "loss": 1.6891, + "step": 18072 + }, + { + "epoch": 2.27, + "grad_norm": 18.03871726989746, + "learning_rate": 4.878885495544493e-06, + "loss": 1.208, + "step": 18073 + }, + { + "epoch": 2.27, + "grad_norm": 12.533004760742188, + "learning_rate": 4.8780487804878055e-06, + "loss": 2.1779, + "step": 18074 + }, + { + "epoch": 2.27, + "grad_norm": 13.060806274414062, + "learning_rate": 4.8772120654311174e-06, + "loss": 2.3596, + "step": 18075 + }, + { + "epoch": 2.27, + "grad_norm": 83.15618896484375, + "learning_rate": 4.87637535037443e-06, + "loss": 0.9586, + "step": 18076 + }, + { + "epoch": 2.27, + "grad_norm": 48.380252838134766, + "learning_rate": 4.875538635317743e-06, + "loss": 1.1134, + "step": 18077 + }, + { + "epoch": 2.27, + "grad_norm": 11.484421730041504, + "learning_rate": 4.874701920261056e-06, + "loss": 0.6175, + "step": 18078 + }, + { + "epoch": 2.27, + "grad_norm": 5.373561382293701, + "learning_rate": 4.873865205204368e-06, + "loss": 0.5831, + "step": 18079 + }, + { + "epoch": 2.27, + "grad_norm": 140.86215209960938, + "learning_rate": 4.873028490147681e-06, + "loss": 1.3971, + "step": 18080 + }, + { + "epoch": 2.27, + "grad_norm": 27.262065887451172, + "learning_rate": 4.872191775090993e-06, + "loss": 1.3046, + "step": 18081 + }, + { + "epoch": 2.27, + "grad_norm": 17.172199249267578, + "learning_rate": 4.871355060034305e-06, + "loss": 0.6865, + "step": 18082 + }, + { + "epoch": 2.27, + "grad_norm": 13.459043502807617, + "learning_rate": 4.870518344977618e-06, + "loss": 0.4893, + "step": 18083 + }, + { + "epoch": 2.27, + "grad_norm": 13.085651397705078, + "learning_rate": 4.869681629920931e-06, + "loss": 1.2103, + "step": 18084 + }, + { + "epoch": 2.27, + "grad_norm": 14.714868545532227, + "learning_rate": 4.868844914864244e-06, + "loss": 0.2914, + "step": 18085 + }, + { + "epoch": 2.27, + "grad_norm": 13.607343673706055, + "learning_rate": 4.868008199807556e-06, + "loss": 0.7138, + "step": 18086 + }, + { + "epoch": 2.27, + "grad_norm": 11.629634857177734, + "learning_rate": 4.8671714847508685e-06, + "loss": 0.7351, + "step": 18087 + }, + { + "epoch": 2.27, + "grad_norm": 6.835043907165527, + "learning_rate": 4.866334769694181e-06, + "loss": 0.704, + "step": 18088 + }, + { + "epoch": 2.27, + "grad_norm": 11.061873435974121, + "learning_rate": 4.865498054637493e-06, + "loss": 1.0773, + "step": 18089 + }, + { + "epoch": 2.27, + "grad_norm": 10.615815162658691, + "learning_rate": 4.864661339580806e-06, + "loss": 1.0865, + "step": 18090 + }, + { + "epoch": 2.27, + "grad_norm": 23.319171905517578, + "learning_rate": 4.863824624524119e-06, + "loss": 2.2188, + "step": 18091 + }, + { + "epoch": 2.27, + "grad_norm": 3.291536569595337, + "learning_rate": 4.862987909467432e-06, + "loss": 0.1462, + "step": 18092 + }, + { + "epoch": 2.27, + "grad_norm": 18.689645767211914, + "learning_rate": 4.862151194410744e-06, + "loss": 0.9393, + "step": 18093 + }, + { + "epoch": 2.27, + "grad_norm": 11.912179946899414, + "learning_rate": 4.8613144793540564e-06, + "loss": 1.312, + "step": 18094 + }, + { + "epoch": 2.27, + "grad_norm": 11.882221221923828, + "learning_rate": 4.860477764297369e-06, + "loss": 0.436, + "step": 18095 + }, + { + "epoch": 2.27, + "grad_norm": 31.453899383544922, + "learning_rate": 4.859641049240681e-06, + "loss": 3.0691, + "step": 18096 + }, + { + "epoch": 2.27, + "grad_norm": 39.959293365478516, + "learning_rate": 4.858804334183994e-06, + "loss": 2.2514, + "step": 18097 + }, + { + "epoch": 2.27, + "grad_norm": 11.969451904296875, + "learning_rate": 4.857967619127307e-06, + "loss": 0.5848, + "step": 18098 + }, + { + "epoch": 2.27, + "grad_norm": 19.55484390258789, + "learning_rate": 4.85713090407062e-06, + "loss": 1.8364, + "step": 18099 + }, + { + "epoch": 2.27, + "grad_norm": 27.38494300842285, + "learning_rate": 4.8562941890139316e-06, + "loss": 1.1832, + "step": 18100 + }, + { + "epoch": 2.27, + "grad_norm": 24.023799896240234, + "learning_rate": 4.855457473957244e-06, + "loss": 1.5347, + "step": 18101 + }, + { + "epoch": 2.27, + "grad_norm": 3.8433549404144287, + "learning_rate": 4.854620758900556e-06, + "loss": 0.0844, + "step": 18102 + }, + { + "epoch": 2.27, + "grad_norm": 7.295028209686279, + "learning_rate": 4.853784043843869e-06, + "loss": 2.2588, + "step": 18103 + }, + { + "epoch": 2.27, + "grad_norm": 18.740617752075195, + "learning_rate": 4.852947328787182e-06, + "loss": 1.6035, + "step": 18104 + }, + { + "epoch": 2.27, + "grad_norm": 16.479990005493164, + "learning_rate": 4.852110613730495e-06, + "loss": 1.7165, + "step": 18105 + }, + { + "epoch": 2.27, + "grad_norm": 11.476269721984863, + "learning_rate": 4.8512738986738075e-06, + "loss": 0.7912, + "step": 18106 + }, + { + "epoch": 2.27, + "grad_norm": 17.679611206054688, + "learning_rate": 4.8504371836171195e-06, + "loss": 1.3959, + "step": 18107 + }, + { + "epoch": 2.27, + "grad_norm": 20.310611724853516, + "learning_rate": 4.849600468560432e-06, + "loss": 2.0444, + "step": 18108 + }, + { + "epoch": 2.27, + "grad_norm": 6.861479759216309, + "learning_rate": 4.848763753503744e-06, + "loss": 0.1372, + "step": 18109 + }, + { + "epoch": 2.27, + "grad_norm": 8.993551254272461, + "learning_rate": 4.847927038447057e-06, + "loss": 2.1999, + "step": 18110 + }, + { + "epoch": 2.27, + "grad_norm": 12.034762382507324, + "learning_rate": 4.84709032339037e-06, + "loss": 0.7087, + "step": 18111 + }, + { + "epoch": 2.27, + "grad_norm": 3.3666305541992188, + "learning_rate": 4.846253608333683e-06, + "loss": 0.0713, + "step": 18112 + }, + { + "epoch": 2.27, + "grad_norm": 18.19227409362793, + "learning_rate": 4.8454168932769954e-06, + "loss": 1.681, + "step": 18113 + }, + { + "epoch": 2.27, + "grad_norm": 13.807527542114258, + "learning_rate": 4.844580178220307e-06, + "loss": 0.3026, + "step": 18114 + }, + { + "epoch": 2.27, + "grad_norm": 22.87261199951172, + "learning_rate": 4.84374346316362e-06, + "loss": 0.8078, + "step": 18115 + }, + { + "epoch": 2.27, + "grad_norm": 657.290283203125, + "learning_rate": 4.842906748106932e-06, + "loss": 1.5545, + "step": 18116 + }, + { + "epoch": 2.27, + "grad_norm": 12.585835456848145, + "learning_rate": 4.842070033050245e-06, + "loss": 1.2678, + "step": 18117 + }, + { + "epoch": 2.27, + "grad_norm": 24.29072380065918, + "learning_rate": 4.841233317993558e-06, + "loss": 1.8188, + "step": 18118 + }, + { + "epoch": 2.27, + "grad_norm": 9.583120346069336, + "learning_rate": 4.8403966029368706e-06, + "loss": 0.9413, + "step": 18119 + }, + { + "epoch": 2.27, + "grad_norm": 21.73714256286621, + "learning_rate": 4.839559887880183e-06, + "loss": 2.7989, + "step": 18120 + }, + { + "epoch": 2.27, + "grad_norm": 3.3437893390655518, + "learning_rate": 4.838723172823495e-06, + "loss": 0.0507, + "step": 18121 + }, + { + "epoch": 2.27, + "grad_norm": 41.34368896484375, + "learning_rate": 4.837886457766808e-06, + "loss": 0.5975, + "step": 18122 + }, + { + "epoch": 2.27, + "grad_norm": 29.665691375732422, + "learning_rate": 4.83704974271012e-06, + "loss": 2.3017, + "step": 18123 + }, + { + "epoch": 2.27, + "grad_norm": 5.171306133270264, + "learning_rate": 4.836213027653433e-06, + "loss": 0.8568, + "step": 18124 + }, + { + "epoch": 2.27, + "grad_norm": 8.918929100036621, + "learning_rate": 4.835376312596746e-06, + "loss": 0.4179, + "step": 18125 + }, + { + "epoch": 2.27, + "grad_norm": 100.70255279541016, + "learning_rate": 4.8345395975400585e-06, + "loss": 1.7599, + "step": 18126 + }, + { + "epoch": 2.27, + "grad_norm": 19.132160186767578, + "learning_rate": 4.833702882483371e-06, + "loss": 2.4708, + "step": 18127 + }, + { + "epoch": 2.28, + "grad_norm": 4.666585445404053, + "learning_rate": 4.832866167426683e-06, + "loss": 0.2085, + "step": 18128 + }, + { + "epoch": 2.28, + "grad_norm": 19.80629539489746, + "learning_rate": 4.832029452369996e-06, + "loss": 0.4924, + "step": 18129 + }, + { + "epoch": 2.28, + "grad_norm": 58.904666900634766, + "learning_rate": 4.831192737313308e-06, + "loss": 0.7678, + "step": 18130 + }, + { + "epoch": 2.28, + "grad_norm": 10.534483909606934, + "learning_rate": 4.830356022256621e-06, + "loss": 0.9355, + "step": 18131 + }, + { + "epoch": 2.28, + "grad_norm": 18.848360061645508, + "learning_rate": 4.829519307199934e-06, + "loss": 0.8069, + "step": 18132 + }, + { + "epoch": 2.28, + "grad_norm": 14.515966415405273, + "learning_rate": 4.828682592143246e-06, + "loss": 1.9142, + "step": 18133 + }, + { + "epoch": 2.28, + "grad_norm": 17.976545333862305, + "learning_rate": 4.827845877086559e-06, + "loss": 2.0127, + "step": 18134 + }, + { + "epoch": 2.28, + "grad_norm": 31.835487365722656, + "learning_rate": 4.827009162029871e-06, + "loss": 1.7525, + "step": 18135 + }, + { + "epoch": 2.28, + "grad_norm": 94.18576049804688, + "learning_rate": 4.826172446973184e-06, + "loss": 3.045, + "step": 18136 + }, + { + "epoch": 2.28, + "grad_norm": 12.619370460510254, + "learning_rate": 4.825335731916496e-06, + "loss": 0.5362, + "step": 18137 + }, + { + "epoch": 2.28, + "grad_norm": 3.5252768993377686, + "learning_rate": 4.824499016859809e-06, + "loss": 0.1731, + "step": 18138 + }, + { + "epoch": 2.28, + "grad_norm": 6.68712043762207, + "learning_rate": 4.8236623018031215e-06, + "loss": 1.0455, + "step": 18139 + }, + { + "epoch": 2.28, + "grad_norm": 4.003234386444092, + "learning_rate": 4.822825586746434e-06, + "loss": 0.396, + "step": 18140 + }, + { + "epoch": 2.28, + "grad_norm": 7.79044246673584, + "learning_rate": 4.821988871689747e-06, + "loss": 0.9295, + "step": 18141 + }, + { + "epoch": 2.28, + "grad_norm": 26.89201545715332, + "learning_rate": 4.821152156633059e-06, + "loss": 1.0806, + "step": 18142 + }, + { + "epoch": 2.28, + "grad_norm": 15.89769172668457, + "learning_rate": 4.820315441576371e-06, + "loss": 1.8888, + "step": 18143 + }, + { + "epoch": 2.28, + "grad_norm": 5.993673324584961, + "learning_rate": 4.819478726519684e-06, + "loss": 2.3131, + "step": 18144 + }, + { + "epoch": 2.28, + "grad_norm": 8.432538032531738, + "learning_rate": 4.818642011462997e-06, + "loss": 0.524, + "step": 18145 + }, + { + "epoch": 2.28, + "grad_norm": 5.847290515899658, + "learning_rate": 4.8178052964063094e-06, + "loss": 0.535, + "step": 18146 + }, + { + "epoch": 2.28, + "grad_norm": 15.385242462158203, + "learning_rate": 4.816968581349622e-06, + "loss": 1.5261, + "step": 18147 + }, + { + "epoch": 2.28, + "grad_norm": 16.392929077148438, + "learning_rate": 4.816131866292935e-06, + "loss": 1.4277, + "step": 18148 + }, + { + "epoch": 2.28, + "grad_norm": 10.277228355407715, + "learning_rate": 4.815295151236247e-06, + "loss": 0.6431, + "step": 18149 + }, + { + "epoch": 2.28, + "grad_norm": 13.693373680114746, + "learning_rate": 4.814458436179559e-06, + "loss": 1.8388, + "step": 18150 + }, + { + "epoch": 2.28, + "grad_norm": 9.805331230163574, + "learning_rate": 4.813621721122872e-06, + "loss": 0.3037, + "step": 18151 + }, + { + "epoch": 2.28, + "grad_norm": 17.375608444213867, + "learning_rate": 4.8127850060661845e-06, + "loss": 0.5494, + "step": 18152 + }, + { + "epoch": 2.28, + "grad_norm": 4.424117088317871, + "learning_rate": 4.811948291009497e-06, + "loss": 0.2041, + "step": 18153 + }, + { + "epoch": 2.28, + "grad_norm": 14.018604278564453, + "learning_rate": 4.81111157595281e-06, + "loss": 1.1859, + "step": 18154 + }, + { + "epoch": 2.28, + "grad_norm": 7.766754150390625, + "learning_rate": 4.810274860896122e-06, + "loss": 0.4826, + "step": 18155 + }, + { + "epoch": 2.28, + "grad_norm": 6.565408229827881, + "learning_rate": 4.809438145839435e-06, + "loss": 0.5125, + "step": 18156 + }, + { + "epoch": 2.28, + "grad_norm": 40.847049713134766, + "learning_rate": 4.808601430782747e-06, + "loss": 2.2631, + "step": 18157 + }, + { + "epoch": 2.28, + "grad_norm": 148.59217834472656, + "learning_rate": 4.80776471572606e-06, + "loss": 2.0997, + "step": 18158 + }, + { + "epoch": 2.28, + "grad_norm": 9.460793495178223, + "learning_rate": 4.8069280006693725e-06, + "loss": 0.4907, + "step": 18159 + }, + { + "epoch": 2.28, + "grad_norm": 5.744081020355225, + "learning_rate": 4.806091285612685e-06, + "loss": 0.2731, + "step": 18160 + }, + { + "epoch": 2.28, + "grad_norm": 27.180835723876953, + "learning_rate": 4.805254570555998e-06, + "loss": 1.5487, + "step": 18161 + }, + { + "epoch": 2.28, + "grad_norm": 22.261077880859375, + "learning_rate": 4.80441785549931e-06, + "loss": 1.4461, + "step": 18162 + }, + { + "epoch": 2.28, + "grad_norm": 13.491981506347656, + "learning_rate": 4.803581140442623e-06, + "loss": 1.6574, + "step": 18163 + }, + { + "epoch": 2.28, + "grad_norm": 18.313154220581055, + "learning_rate": 4.802744425385935e-06, + "loss": 0.9745, + "step": 18164 + }, + { + "epoch": 2.28, + "grad_norm": 15.0607328414917, + "learning_rate": 4.801907710329248e-06, + "loss": 0.5434, + "step": 18165 + }, + { + "epoch": 2.28, + "grad_norm": 22.887287139892578, + "learning_rate": 4.80107099527256e-06, + "loss": 0.5663, + "step": 18166 + }, + { + "epoch": 2.28, + "grad_norm": 19.480941772460938, + "learning_rate": 4.800234280215873e-06, + "loss": 0.8002, + "step": 18167 + }, + { + "epoch": 2.28, + "grad_norm": 25.959819793701172, + "learning_rate": 4.799397565159186e-06, + "loss": 1.2098, + "step": 18168 + }, + { + "epoch": 2.28, + "grad_norm": 17.326189041137695, + "learning_rate": 4.798560850102498e-06, + "loss": 1.2297, + "step": 18169 + }, + { + "epoch": 2.28, + "grad_norm": 11.509465217590332, + "learning_rate": 4.797724135045811e-06, + "loss": 1.7532, + "step": 18170 + }, + { + "epoch": 2.28, + "grad_norm": 6.875067234039307, + "learning_rate": 4.796887419989123e-06, + "loss": 1.0477, + "step": 18171 + }, + { + "epoch": 2.28, + "grad_norm": 14.742862701416016, + "learning_rate": 4.7960507049324355e-06, + "loss": 0.5861, + "step": 18172 + }, + { + "epoch": 2.28, + "grad_norm": 6.9289679527282715, + "learning_rate": 4.795213989875748e-06, + "loss": 0.2185, + "step": 18173 + }, + { + "epoch": 2.28, + "grad_norm": 16.963138580322266, + "learning_rate": 4.794377274819061e-06, + "loss": 0.7712, + "step": 18174 + }, + { + "epoch": 2.28, + "grad_norm": 13.129985809326172, + "learning_rate": 4.793540559762374e-06, + "loss": 0.7799, + "step": 18175 + }, + { + "epoch": 2.28, + "grad_norm": 15.65756607055664, + "learning_rate": 4.792703844705686e-06, + "loss": 1.0235, + "step": 18176 + }, + { + "epoch": 2.28, + "grad_norm": 23.486530303955078, + "learning_rate": 4.791867129648999e-06, + "loss": 0.6155, + "step": 18177 + }, + { + "epoch": 2.28, + "grad_norm": 17.494380950927734, + "learning_rate": 4.791030414592311e-06, + "loss": 0.9875, + "step": 18178 + }, + { + "epoch": 2.28, + "grad_norm": 12.82445240020752, + "learning_rate": 4.790193699535623e-06, + "loss": 0.5564, + "step": 18179 + }, + { + "epoch": 2.28, + "grad_norm": 12.545409202575684, + "learning_rate": 4.789356984478936e-06, + "loss": 0.9441, + "step": 18180 + }, + { + "epoch": 2.28, + "grad_norm": 10.12964916229248, + "learning_rate": 4.788520269422249e-06, + "loss": 0.5128, + "step": 18181 + }, + { + "epoch": 2.28, + "grad_norm": 11.175071716308594, + "learning_rate": 4.787683554365562e-06, + "loss": 0.5869, + "step": 18182 + }, + { + "epoch": 2.28, + "grad_norm": 10.429204940795898, + "learning_rate": 4.786846839308874e-06, + "loss": 0.9771, + "step": 18183 + }, + { + "epoch": 2.28, + "grad_norm": 7.880436897277832, + "learning_rate": 4.786010124252186e-06, + "loss": 0.5035, + "step": 18184 + }, + { + "epoch": 2.28, + "grad_norm": 6.829651355743408, + "learning_rate": 4.7851734091954985e-06, + "loss": 0.282, + "step": 18185 + }, + { + "epoch": 2.28, + "grad_norm": 9.256832122802734, + "learning_rate": 4.784336694138811e-06, + "loss": 0.9679, + "step": 18186 + }, + { + "epoch": 2.28, + "grad_norm": 82.81836700439453, + "learning_rate": 4.783499979082124e-06, + "loss": 2.0453, + "step": 18187 + }, + { + "epoch": 2.28, + "grad_norm": 13.147477149963379, + "learning_rate": 4.782663264025437e-06, + "loss": 0.5534, + "step": 18188 + }, + { + "epoch": 2.28, + "grad_norm": 11.433621406555176, + "learning_rate": 4.78182654896875e-06, + "loss": 0.9656, + "step": 18189 + }, + { + "epoch": 2.28, + "grad_norm": 20.815425872802734, + "learning_rate": 4.780989833912062e-06, + "loss": 1.3373, + "step": 18190 + }, + { + "epoch": 2.28, + "grad_norm": 29.052854537963867, + "learning_rate": 4.780153118855374e-06, + "loss": 0.7903, + "step": 18191 + }, + { + "epoch": 2.28, + "grad_norm": 14.508295059204102, + "learning_rate": 4.7793164037986864e-06, + "loss": 0.673, + "step": 18192 + }, + { + "epoch": 2.28, + "grad_norm": 17.395936965942383, + "learning_rate": 4.778479688741999e-06, + "loss": 0.8457, + "step": 18193 + }, + { + "epoch": 2.28, + "grad_norm": 17.987634658813477, + "learning_rate": 4.777642973685312e-06, + "loss": 0.7368, + "step": 18194 + }, + { + "epoch": 2.28, + "grad_norm": 20.08308982849121, + "learning_rate": 4.776806258628625e-06, + "loss": 2.3677, + "step": 18195 + }, + { + "epoch": 2.28, + "grad_norm": 9.142012596130371, + "learning_rate": 4.775969543571937e-06, + "loss": 1.2506, + "step": 18196 + }, + { + "epoch": 2.28, + "grad_norm": 21.252647399902344, + "learning_rate": 4.77513282851525e-06, + "loss": 1.0209, + "step": 18197 + }, + { + "epoch": 2.28, + "grad_norm": 18.741111755371094, + "learning_rate": 4.7742961134585616e-06, + "loss": 0.4748, + "step": 18198 + }, + { + "epoch": 2.28, + "grad_norm": 3.1713852882385254, + "learning_rate": 4.773459398401874e-06, + "loss": 0.1406, + "step": 18199 + }, + { + "epoch": 2.28, + "grad_norm": 13.61368465423584, + "learning_rate": 4.772622683345187e-06, + "loss": 0.6536, + "step": 18200 + }, + { + "epoch": 2.28, + "grad_norm": 13.201034545898438, + "learning_rate": 4.7717859682885e-06, + "loss": 0.8234, + "step": 18201 + }, + { + "epoch": 2.28, + "grad_norm": 12.031088829040527, + "learning_rate": 4.770949253231813e-06, + "loss": 1.6413, + "step": 18202 + }, + { + "epoch": 2.28, + "grad_norm": 31.76029396057129, + "learning_rate": 4.770112538175125e-06, + "loss": 3.4188, + "step": 18203 + }, + { + "epoch": 2.28, + "grad_norm": 11.155171394348145, + "learning_rate": 4.7692758231184375e-06, + "loss": 1.4523, + "step": 18204 + }, + { + "epoch": 2.28, + "grad_norm": 67.39051818847656, + "learning_rate": 4.7684391080617495e-06, + "loss": 1.882, + "step": 18205 + }, + { + "epoch": 2.28, + "grad_norm": 19.147981643676758, + "learning_rate": 4.767602393005062e-06, + "loss": 1.1568, + "step": 18206 + }, + { + "epoch": 2.28, + "grad_norm": 6.553123950958252, + "learning_rate": 4.766765677948375e-06, + "loss": 1.0164, + "step": 18207 + }, + { + "epoch": 2.29, + "grad_norm": 12.664617538452148, + "learning_rate": 4.765928962891688e-06, + "loss": 0.9444, + "step": 18208 + }, + { + "epoch": 2.29, + "grad_norm": 16.09284210205078, + "learning_rate": 4.765092247835001e-06, + "loss": 0.5439, + "step": 18209 + }, + { + "epoch": 2.29, + "grad_norm": 8.732881546020508, + "learning_rate": 4.764255532778313e-06, + "loss": 0.7299, + "step": 18210 + }, + { + "epoch": 2.29, + "grad_norm": 45.31431579589844, + "learning_rate": 4.7634188177216254e-06, + "loss": 1.6899, + "step": 18211 + }, + { + "epoch": 2.29, + "grad_norm": 24.906145095825195, + "learning_rate": 4.762582102664937e-06, + "loss": 1.6587, + "step": 18212 + }, + { + "epoch": 2.29, + "grad_norm": 16.11197280883789, + "learning_rate": 4.76174538760825e-06, + "loss": 1.0412, + "step": 18213 + }, + { + "epoch": 2.29, + "grad_norm": 9.287578582763672, + "learning_rate": 4.760908672551563e-06, + "loss": 1.0026, + "step": 18214 + }, + { + "epoch": 2.29, + "grad_norm": 16.759666442871094, + "learning_rate": 4.760071957494876e-06, + "loss": 0.8681, + "step": 18215 + }, + { + "epoch": 2.29, + "grad_norm": 20.16606903076172, + "learning_rate": 4.759235242438189e-06, + "loss": 0.8982, + "step": 18216 + }, + { + "epoch": 2.29, + "grad_norm": 9.273433685302734, + "learning_rate": 4.7583985273815006e-06, + "loss": 1.404, + "step": 18217 + }, + { + "epoch": 2.29, + "grad_norm": 34.79890441894531, + "learning_rate": 4.757561812324813e-06, + "loss": 3.0047, + "step": 18218 + }, + { + "epoch": 2.29, + "grad_norm": 31.379962921142578, + "learning_rate": 4.756725097268125e-06, + "loss": 0.9652, + "step": 18219 + }, + { + "epoch": 2.29, + "grad_norm": 19.541475296020508, + "learning_rate": 4.755888382211438e-06, + "loss": 0.7403, + "step": 18220 + }, + { + "epoch": 2.29, + "grad_norm": 15.139787673950195, + "learning_rate": 4.755051667154751e-06, + "loss": 0.4795, + "step": 18221 + }, + { + "epoch": 2.29, + "grad_norm": 6.544916152954102, + "learning_rate": 4.754214952098064e-06, + "loss": 0.9741, + "step": 18222 + }, + { + "epoch": 2.29, + "grad_norm": 30.706077575683594, + "learning_rate": 4.7533782370413765e-06, + "loss": 1.6226, + "step": 18223 + }, + { + "epoch": 2.29, + "grad_norm": 7.352999210357666, + "learning_rate": 4.7525415219846885e-06, + "loss": 0.7358, + "step": 18224 + }, + { + "epoch": 2.29, + "grad_norm": 10.688814163208008, + "learning_rate": 4.7517048069280004e-06, + "loss": 1.2113, + "step": 18225 + }, + { + "epoch": 2.29, + "grad_norm": 6.308061599731445, + "learning_rate": 4.750868091871313e-06, + "loss": 0.3193, + "step": 18226 + }, + { + "epoch": 2.29, + "grad_norm": 18.884119033813477, + "learning_rate": 4.750031376814626e-06, + "loss": 1.5468, + "step": 18227 + }, + { + "epoch": 2.29, + "grad_norm": 18.070905685424805, + "learning_rate": 4.749194661757939e-06, + "loss": 1.1872, + "step": 18228 + }, + { + "epoch": 2.29, + "grad_norm": 6.125032901763916, + "learning_rate": 4.748357946701252e-06, + "loss": 1.0369, + "step": 18229 + }, + { + "epoch": 2.29, + "grad_norm": 8.317168235778809, + "learning_rate": 4.747521231644564e-06, + "loss": 0.38, + "step": 18230 + }, + { + "epoch": 2.29, + "grad_norm": 5.549513816833496, + "learning_rate": 4.746684516587876e-06, + "loss": 0.5862, + "step": 18231 + }, + { + "epoch": 2.29, + "grad_norm": 6.371419906616211, + "learning_rate": 4.745847801531188e-06, + "loss": 0.5998, + "step": 18232 + }, + { + "epoch": 2.29, + "grad_norm": 15.010042190551758, + "learning_rate": 4.745011086474501e-06, + "loss": 0.6881, + "step": 18233 + }, + { + "epoch": 2.29, + "grad_norm": 32.16387939453125, + "learning_rate": 4.744174371417814e-06, + "loss": 1.2754, + "step": 18234 + }, + { + "epoch": 2.29, + "grad_norm": 4.978853225708008, + "learning_rate": 4.743337656361127e-06, + "loss": 0.3576, + "step": 18235 + }, + { + "epoch": 2.29, + "grad_norm": 16.613771438598633, + "learning_rate": 4.7425009413044396e-06, + "loss": 1.196, + "step": 18236 + }, + { + "epoch": 2.29, + "grad_norm": 13.595804214477539, + "learning_rate": 4.7416642262477515e-06, + "loss": 0.8008, + "step": 18237 + }, + { + "epoch": 2.29, + "grad_norm": 92.11951446533203, + "learning_rate": 4.740827511191064e-06, + "loss": 1.0349, + "step": 18238 + }, + { + "epoch": 2.29, + "grad_norm": 6.98962926864624, + "learning_rate": 4.739990796134376e-06, + "loss": 0.2019, + "step": 18239 + }, + { + "epoch": 2.29, + "grad_norm": 6.2486162185668945, + "learning_rate": 4.739154081077689e-06, + "loss": 0.2373, + "step": 18240 + }, + { + "epoch": 2.29, + "grad_norm": 10.029749870300293, + "learning_rate": 4.738317366021002e-06, + "loss": 1.55, + "step": 18241 + }, + { + "epoch": 2.29, + "grad_norm": 18.082401275634766, + "learning_rate": 4.737480650964315e-06, + "loss": 0.5872, + "step": 18242 + }, + { + "epoch": 2.29, + "grad_norm": 18.084856033325195, + "learning_rate": 4.7366439359076275e-06, + "loss": 1.0304, + "step": 18243 + }, + { + "epoch": 2.29, + "grad_norm": 11.300666809082031, + "learning_rate": 4.7358072208509394e-06, + "loss": 0.6484, + "step": 18244 + }, + { + "epoch": 2.29, + "grad_norm": 8.962103843688965, + "learning_rate": 4.734970505794252e-06, + "loss": 1.3226, + "step": 18245 + }, + { + "epoch": 2.29, + "grad_norm": 21.546951293945312, + "learning_rate": 4.734133790737564e-06, + "loss": 0.226, + "step": 18246 + }, + { + "epoch": 2.29, + "grad_norm": 9.672987937927246, + "learning_rate": 4.733297075680877e-06, + "loss": 0.9477, + "step": 18247 + }, + { + "epoch": 2.29, + "grad_norm": 19.209959030151367, + "learning_rate": 4.73246036062419e-06, + "loss": 4.4495, + "step": 18248 + }, + { + "epoch": 2.29, + "grad_norm": 6.786340236663818, + "learning_rate": 4.731623645567503e-06, + "loss": 1.3741, + "step": 18249 + }, + { + "epoch": 2.29, + "grad_norm": 6.299871921539307, + "learning_rate": 4.730786930510815e-06, + "loss": 0.6948, + "step": 18250 + }, + { + "epoch": 2.29, + "grad_norm": 43.83751678466797, + "learning_rate": 4.729950215454127e-06, + "loss": 3.2393, + "step": 18251 + }, + { + "epoch": 2.29, + "grad_norm": 6.27683162689209, + "learning_rate": 4.72911350039744e-06, + "loss": 0.1366, + "step": 18252 + }, + { + "epoch": 2.29, + "grad_norm": 9.838420867919922, + "learning_rate": 4.728276785340752e-06, + "loss": 0.7473, + "step": 18253 + }, + { + "epoch": 2.29, + "grad_norm": 16.24065399169922, + "learning_rate": 4.727440070284065e-06, + "loss": 0.6801, + "step": 18254 + }, + { + "epoch": 2.29, + "grad_norm": 14.735170364379883, + "learning_rate": 4.726603355227378e-06, + "loss": 1.2568, + "step": 18255 + }, + { + "epoch": 2.29, + "grad_norm": 10.91494083404541, + "learning_rate": 4.7257666401706905e-06, + "loss": 0.3934, + "step": 18256 + }, + { + "epoch": 2.29, + "grad_norm": 25.261390686035156, + "learning_rate": 4.724929925114003e-06, + "loss": 2.6683, + "step": 18257 + }, + { + "epoch": 2.29, + "grad_norm": 18.75349235534668, + "learning_rate": 4.724093210057315e-06, + "loss": 1.5721, + "step": 18258 + }, + { + "epoch": 2.29, + "grad_norm": 9.904398918151855, + "learning_rate": 4.723256495000628e-06, + "loss": 0.417, + "step": 18259 + }, + { + "epoch": 2.29, + "grad_norm": 34.27606964111328, + "learning_rate": 4.72241977994394e-06, + "loss": 0.6526, + "step": 18260 + }, + { + "epoch": 2.29, + "grad_norm": 51.746421813964844, + "learning_rate": 4.721583064887253e-06, + "loss": 2.392, + "step": 18261 + }, + { + "epoch": 2.29, + "grad_norm": 13.128945350646973, + "learning_rate": 4.720746349830566e-06, + "loss": 0.4282, + "step": 18262 + }, + { + "epoch": 2.29, + "grad_norm": 11.959078788757324, + "learning_rate": 4.7199096347738784e-06, + "loss": 0.9903, + "step": 18263 + }, + { + "epoch": 2.29, + "grad_norm": 13.84642219543457, + "learning_rate": 4.719072919717191e-06, + "loss": 1.7438, + "step": 18264 + }, + { + "epoch": 2.29, + "grad_norm": 17.713260650634766, + "learning_rate": 4.718236204660503e-06, + "loss": 1.294, + "step": 18265 + }, + { + "epoch": 2.29, + "grad_norm": 12.827953338623047, + "learning_rate": 4.717399489603815e-06, + "loss": 0.7201, + "step": 18266 + }, + { + "epoch": 2.29, + "grad_norm": 6.821693420410156, + "learning_rate": 4.716562774547128e-06, + "loss": 0.5782, + "step": 18267 + }, + { + "epoch": 2.29, + "grad_norm": 7.889687538146973, + "learning_rate": 4.715726059490441e-06, + "loss": 0.2661, + "step": 18268 + }, + { + "epoch": 2.29, + "grad_norm": 16.12978172302246, + "learning_rate": 4.7148893444337536e-06, + "loss": 1.2605, + "step": 18269 + }, + { + "epoch": 2.29, + "grad_norm": 7.606167316436768, + "learning_rate": 4.714052629377066e-06, + "loss": 0.5176, + "step": 18270 + }, + { + "epoch": 2.29, + "grad_norm": 12.436556816101074, + "learning_rate": 4.713215914320378e-06, + "loss": 1.8853, + "step": 18271 + }, + { + "epoch": 2.29, + "grad_norm": 150.93994140625, + "learning_rate": 4.712379199263691e-06, + "loss": 1.0326, + "step": 18272 + }, + { + "epoch": 2.29, + "grad_norm": 16.166446685791016, + "learning_rate": 4.711542484207003e-06, + "loss": 0.3729, + "step": 18273 + }, + { + "epoch": 2.29, + "grad_norm": 16.094757080078125, + "learning_rate": 4.710705769150316e-06, + "loss": 0.5804, + "step": 18274 + }, + { + "epoch": 2.29, + "grad_norm": 18.42324447631836, + "learning_rate": 4.709869054093629e-06, + "loss": 0.9788, + "step": 18275 + }, + { + "epoch": 2.29, + "grad_norm": 7.328489303588867, + "learning_rate": 4.7090323390369415e-06, + "loss": 0.2255, + "step": 18276 + }, + { + "epoch": 2.29, + "grad_norm": 30.318105697631836, + "learning_rate": 4.708195623980254e-06, + "loss": 1.2918, + "step": 18277 + }, + { + "epoch": 2.29, + "grad_norm": 8.805867195129395, + "learning_rate": 4.707358908923566e-06, + "loss": 2.0253, + "step": 18278 + }, + { + "epoch": 2.29, + "grad_norm": 27.83954620361328, + "learning_rate": 4.706522193866879e-06, + "loss": 0.7115, + "step": 18279 + }, + { + "epoch": 2.29, + "grad_norm": 17.944978713989258, + "learning_rate": 4.705685478810191e-06, + "loss": 2.4743, + "step": 18280 + }, + { + "epoch": 2.29, + "grad_norm": 11.114306449890137, + "learning_rate": 4.704848763753504e-06, + "loss": 0.944, + "step": 18281 + }, + { + "epoch": 2.29, + "grad_norm": 14.47452449798584, + "learning_rate": 4.704012048696817e-06, + "loss": 0.3426, + "step": 18282 + }, + { + "epoch": 2.29, + "grad_norm": 9.617793083190918, + "learning_rate": 4.703175333640129e-06, + "loss": 0.9436, + "step": 18283 + }, + { + "epoch": 2.29, + "grad_norm": 9.082965850830078, + "learning_rate": 4.702338618583442e-06, + "loss": 0.39, + "step": 18284 + }, + { + "epoch": 2.29, + "grad_norm": 11.9785795211792, + "learning_rate": 4.701501903526754e-06, + "loss": 0.3713, + "step": 18285 + }, + { + "epoch": 2.29, + "grad_norm": 6.59641170501709, + "learning_rate": 4.700665188470067e-06, + "loss": 0.614, + "step": 18286 + }, + { + "epoch": 2.29, + "grad_norm": 12.665901184082031, + "learning_rate": 4.699828473413379e-06, + "loss": 0.4018, + "step": 18287 + }, + { + "epoch": 2.3, + "grad_norm": 3.5972509384155273, + "learning_rate": 4.698991758356692e-06, + "loss": 0.087, + "step": 18288 + }, + { + "epoch": 2.3, + "grad_norm": 22.941980361938477, + "learning_rate": 4.6981550433000045e-06, + "loss": 2.0695, + "step": 18289 + }, + { + "epoch": 2.3, + "grad_norm": 5.258419036865234, + "learning_rate": 4.697318328243317e-06, + "loss": 0.2443, + "step": 18290 + }, + { + "epoch": 2.3, + "grad_norm": 11.865102767944336, + "learning_rate": 4.69648161318663e-06, + "loss": 0.6209, + "step": 18291 + }, + { + "epoch": 2.3, + "grad_norm": 21.162181854248047, + "learning_rate": 4.695644898129942e-06, + "loss": 2.312, + "step": 18292 + }, + { + "epoch": 2.3, + "grad_norm": 5.515084743499756, + "learning_rate": 4.694808183073255e-06, + "loss": 0.4826, + "step": 18293 + }, + { + "epoch": 2.3, + "grad_norm": 11.08837890625, + "learning_rate": 4.693971468016567e-06, + "loss": 1.0752, + "step": 18294 + }, + { + "epoch": 2.3, + "grad_norm": 95.66429138183594, + "learning_rate": 4.69313475295988e-06, + "loss": 1.3982, + "step": 18295 + }, + { + "epoch": 2.3, + "grad_norm": 19.24129867553711, + "learning_rate": 4.6922980379031924e-06, + "loss": 0.3854, + "step": 18296 + }, + { + "epoch": 2.3, + "grad_norm": 22.71967887878418, + "learning_rate": 4.691461322846505e-06, + "loss": 1.989, + "step": 18297 + }, + { + "epoch": 2.3, + "grad_norm": 8.208455085754395, + "learning_rate": 4.690624607789818e-06, + "loss": 0.253, + "step": 18298 + }, + { + "epoch": 2.3, + "grad_norm": 22.519729614257812, + "learning_rate": 4.68978789273313e-06, + "loss": 1.0901, + "step": 18299 + }, + { + "epoch": 2.3, + "grad_norm": 11.487154960632324, + "learning_rate": 4.688951177676443e-06, + "loss": 2.3736, + "step": 18300 + }, + { + "epoch": 2.3, + "grad_norm": 17.971080780029297, + "learning_rate": 4.688114462619755e-06, + "loss": 2.133, + "step": 18301 + }, + { + "epoch": 2.3, + "grad_norm": 19.557613372802734, + "learning_rate": 4.6872777475630675e-06, + "loss": 0.9644, + "step": 18302 + }, + { + "epoch": 2.3, + "grad_norm": 9.254117012023926, + "learning_rate": 4.68644103250638e-06, + "loss": 0.6961, + "step": 18303 + }, + { + "epoch": 2.3, + "grad_norm": 15.289875030517578, + "learning_rate": 4.685604317449693e-06, + "loss": 0.338, + "step": 18304 + }, + { + "epoch": 2.3, + "grad_norm": 163.48834228515625, + "learning_rate": 4.684767602393006e-06, + "loss": 1.5944, + "step": 18305 + }, + { + "epoch": 2.3, + "grad_norm": 7.755233287811279, + "learning_rate": 4.683930887336318e-06, + "loss": 1.7118, + "step": 18306 + }, + { + "epoch": 2.3, + "grad_norm": 7.73464298248291, + "learning_rate": 4.683094172279631e-06, + "loss": 0.4349, + "step": 18307 + }, + { + "epoch": 2.3, + "grad_norm": 22.640239715576172, + "learning_rate": 4.682257457222943e-06, + "loss": 1.8137, + "step": 18308 + }, + { + "epoch": 2.3, + "grad_norm": 8.948088645935059, + "learning_rate": 4.6814207421662555e-06, + "loss": 0.481, + "step": 18309 + }, + { + "epoch": 2.3, + "grad_norm": 14.048690795898438, + "learning_rate": 4.680584027109568e-06, + "loss": 1.3658, + "step": 18310 + }, + { + "epoch": 2.3, + "grad_norm": 15.745485305786133, + "learning_rate": 4.679747312052881e-06, + "loss": 0.4563, + "step": 18311 + }, + { + "epoch": 2.3, + "grad_norm": 11.852276802062988, + "learning_rate": 4.678910596996193e-06, + "loss": 0.5245, + "step": 18312 + }, + { + "epoch": 2.3, + "grad_norm": 41.801578521728516, + "learning_rate": 4.678073881939506e-06, + "loss": 0.6852, + "step": 18313 + }, + { + "epoch": 2.3, + "grad_norm": 3.1565005779266357, + "learning_rate": 4.677237166882819e-06, + "loss": 0.1697, + "step": 18314 + }, + { + "epoch": 2.3, + "grad_norm": 19.444913864135742, + "learning_rate": 4.676400451826131e-06, + "loss": 0.5341, + "step": 18315 + }, + { + "epoch": 2.3, + "grad_norm": 6.4638800621032715, + "learning_rate": 4.675563736769443e-06, + "loss": 0.3213, + "step": 18316 + }, + { + "epoch": 2.3, + "grad_norm": 26.27005386352539, + "learning_rate": 4.674727021712756e-06, + "loss": 1.0434, + "step": 18317 + }, + { + "epoch": 2.3, + "grad_norm": 15.692587852478027, + "learning_rate": 4.673890306656069e-06, + "loss": 0.9039, + "step": 18318 + }, + { + "epoch": 2.3, + "grad_norm": 13.831748008728027, + "learning_rate": 4.673053591599381e-06, + "loss": 1.1145, + "step": 18319 + }, + { + "epoch": 2.3, + "grad_norm": 41.15101623535156, + "learning_rate": 4.672216876542694e-06, + "loss": 1.4555, + "step": 18320 + }, + { + "epoch": 2.3, + "grad_norm": 8.727827072143555, + "learning_rate": 4.6713801614860065e-06, + "loss": 0.6902, + "step": 18321 + }, + { + "epoch": 2.3, + "grad_norm": 26.913434982299805, + "learning_rate": 4.6705434464293185e-06, + "loss": 1.9168, + "step": 18322 + }, + { + "epoch": 2.3, + "grad_norm": 19.45763397216797, + "learning_rate": 4.669706731372631e-06, + "loss": 0.8972, + "step": 18323 + }, + { + "epoch": 2.3, + "grad_norm": 16.61663246154785, + "learning_rate": 4.668870016315944e-06, + "loss": 0.5028, + "step": 18324 + }, + { + "epoch": 2.3, + "grad_norm": 36.733219146728516, + "learning_rate": 4.668033301259257e-06, + "loss": 1.1269, + "step": 18325 + }, + { + "epoch": 2.3, + "grad_norm": 16.540794372558594, + "learning_rate": 4.667196586202569e-06, + "loss": 0.585, + "step": 18326 + }, + { + "epoch": 2.3, + "grad_norm": 9.48679256439209, + "learning_rate": 4.666359871145882e-06, + "loss": 0.4882, + "step": 18327 + }, + { + "epoch": 2.3, + "grad_norm": 7.973689079284668, + "learning_rate": 4.6655231560891945e-06, + "loss": 0.63, + "step": 18328 + }, + { + "epoch": 2.3, + "grad_norm": 10.795661926269531, + "learning_rate": 4.664686441032506e-06, + "loss": 0.9274, + "step": 18329 + }, + { + "epoch": 2.3, + "grad_norm": 11.213825225830078, + "learning_rate": 4.663849725975819e-06, + "loss": 2.3832, + "step": 18330 + }, + { + "epoch": 2.3, + "grad_norm": 10.533309936523438, + "learning_rate": 4.663013010919132e-06, + "loss": 1.4456, + "step": 18331 + }, + { + "epoch": 2.3, + "grad_norm": 17.412017822265625, + "learning_rate": 4.662176295862445e-06, + "loss": 1.477, + "step": 18332 + }, + { + "epoch": 2.3, + "grad_norm": 22.808351516723633, + "learning_rate": 4.661339580805757e-06, + "loss": 1.0247, + "step": 18333 + }, + { + "epoch": 2.3, + "grad_norm": 17.05181884765625, + "learning_rate": 4.6605028657490696e-06, + "loss": 0.617, + "step": 18334 + }, + { + "epoch": 2.3, + "grad_norm": 14.080843925476074, + "learning_rate": 4.6596661506923815e-06, + "loss": 0.6881, + "step": 18335 + }, + { + "epoch": 2.3, + "grad_norm": 12.616730690002441, + "learning_rate": 4.658829435635694e-06, + "loss": 0.4547, + "step": 18336 + }, + { + "epoch": 2.3, + "grad_norm": 13.244577407836914, + "learning_rate": 4.657992720579007e-06, + "loss": 0.5683, + "step": 18337 + }, + { + "epoch": 2.3, + "grad_norm": 10.909563064575195, + "learning_rate": 4.65715600552232e-06, + "loss": 0.9421, + "step": 18338 + }, + { + "epoch": 2.3, + "grad_norm": 99.05635070800781, + "learning_rate": 4.656319290465633e-06, + "loss": 2.5841, + "step": 18339 + }, + { + "epoch": 2.3, + "grad_norm": 20.45166778564453, + "learning_rate": 4.655482575408945e-06, + "loss": 1.4672, + "step": 18340 + }, + { + "epoch": 2.3, + "grad_norm": 33.56233596801758, + "learning_rate": 4.6546458603522575e-06, + "loss": 1.5021, + "step": 18341 + }, + { + "epoch": 2.3, + "grad_norm": 8.5343656539917, + "learning_rate": 4.6538091452955694e-06, + "loss": 0.3094, + "step": 18342 + }, + { + "epoch": 2.3, + "grad_norm": 12.456859588623047, + "learning_rate": 4.652972430238882e-06, + "loss": 0.4925, + "step": 18343 + }, + { + "epoch": 2.3, + "grad_norm": 14.237940788269043, + "learning_rate": 4.652135715182195e-06, + "loss": 0.6235, + "step": 18344 + }, + { + "epoch": 2.3, + "grad_norm": 22.024959564208984, + "learning_rate": 4.651299000125508e-06, + "loss": 0.6861, + "step": 18345 + }, + { + "epoch": 2.3, + "grad_norm": 9.857805252075195, + "learning_rate": 4.650462285068821e-06, + "loss": 1.6765, + "step": 18346 + }, + { + "epoch": 2.3, + "grad_norm": 7.209195137023926, + "learning_rate": 4.649625570012133e-06, + "loss": 0.4586, + "step": 18347 + }, + { + "epoch": 2.3, + "grad_norm": 15.141064643859863, + "learning_rate": 4.648788854955445e-06, + "loss": 1.2863, + "step": 18348 + }, + { + "epoch": 2.3, + "grad_norm": 20.099699020385742, + "learning_rate": 4.647952139898757e-06, + "loss": 1.2703, + "step": 18349 + }, + { + "epoch": 2.3, + "grad_norm": 18.63115882873535, + "learning_rate": 4.64711542484207e-06, + "loss": 1.5081, + "step": 18350 + }, + { + "epoch": 2.3, + "grad_norm": 13.551779747009277, + "learning_rate": 4.646278709785383e-06, + "loss": 0.6705, + "step": 18351 + }, + { + "epoch": 2.3, + "grad_norm": 20.256752014160156, + "learning_rate": 4.645441994728696e-06, + "loss": 2.3797, + "step": 18352 + }, + { + "epoch": 2.3, + "grad_norm": 4.774133682250977, + "learning_rate": 4.644605279672008e-06, + "loss": 0.1291, + "step": 18353 + }, + { + "epoch": 2.3, + "grad_norm": 10.925572395324707, + "learning_rate": 4.6437685646153205e-06, + "loss": 0.9005, + "step": 18354 + }, + { + "epoch": 2.3, + "grad_norm": 12.335229873657227, + "learning_rate": 4.642931849558633e-06, + "loss": 0.8607, + "step": 18355 + }, + { + "epoch": 2.3, + "grad_norm": 10.979912757873535, + "learning_rate": 4.642095134501945e-06, + "loss": 1.9296, + "step": 18356 + }, + { + "epoch": 2.3, + "grad_norm": 23.525379180908203, + "learning_rate": 4.641258419445258e-06, + "loss": 1.5312, + "step": 18357 + }, + { + "epoch": 2.3, + "grad_norm": 18.172266006469727, + "learning_rate": 4.640421704388571e-06, + "loss": 1.4989, + "step": 18358 + }, + { + "epoch": 2.3, + "grad_norm": 12.313436508178711, + "learning_rate": 4.639584989331884e-06, + "loss": 0.8825, + "step": 18359 + }, + { + "epoch": 2.3, + "grad_norm": 103.46768188476562, + "learning_rate": 4.638748274275196e-06, + "loss": 1.6756, + "step": 18360 + }, + { + "epoch": 2.3, + "grad_norm": 10.648122787475586, + "learning_rate": 4.6379115592185084e-06, + "loss": 0.7992, + "step": 18361 + }, + { + "epoch": 2.3, + "grad_norm": 11.252740859985352, + "learning_rate": 4.637074844161821e-06, + "loss": 0.7059, + "step": 18362 + }, + { + "epoch": 2.3, + "grad_norm": 97.60618591308594, + "learning_rate": 4.636238129105133e-06, + "loss": 1.5978, + "step": 18363 + }, + { + "epoch": 2.3, + "grad_norm": 10.907837867736816, + "learning_rate": 4.635401414048446e-06, + "loss": 0.5228, + "step": 18364 + }, + { + "epoch": 2.3, + "grad_norm": 14.384221076965332, + "learning_rate": 4.634564698991759e-06, + "loss": 1.6483, + "step": 18365 + }, + { + "epoch": 2.3, + "grad_norm": 41.83057403564453, + "learning_rate": 4.633727983935072e-06, + "loss": 1.7768, + "step": 18366 + }, + { + "epoch": 2.31, + "grad_norm": 24.17619514465332, + "learning_rate": 4.6328912688783836e-06, + "loss": 0.3494, + "step": 18367 + }, + { + "epoch": 2.31, + "grad_norm": 6.7195563316345215, + "learning_rate": 4.632054553821696e-06, + "loss": 1.5434, + "step": 18368 + }, + { + "epoch": 2.31, + "grad_norm": 30.15358543395996, + "learning_rate": 4.631217838765009e-06, + "loss": 2.3482, + "step": 18369 + }, + { + "epoch": 2.31, + "grad_norm": 6.931991100311279, + "learning_rate": 4.630381123708321e-06, + "loss": 0.6997, + "step": 18370 + }, + { + "epoch": 2.31, + "grad_norm": 11.640711784362793, + "learning_rate": 4.629544408651634e-06, + "loss": 1.0447, + "step": 18371 + }, + { + "epoch": 2.31, + "grad_norm": 13.369149208068848, + "learning_rate": 4.628707693594947e-06, + "loss": 0.4712, + "step": 18372 + }, + { + "epoch": 2.31, + "grad_norm": 14.961252212524414, + "learning_rate": 4.6278709785382595e-06, + "loss": 1.9146, + "step": 18373 + }, + { + "epoch": 2.31, + "grad_norm": 9.926864624023438, + "learning_rate": 4.6270342634815715e-06, + "loss": 0.2889, + "step": 18374 + }, + { + "epoch": 2.31, + "grad_norm": 4.883518695831299, + "learning_rate": 4.626197548424884e-06, + "loss": 0.2071, + "step": 18375 + }, + { + "epoch": 2.31, + "grad_norm": 14.534547805786133, + "learning_rate": 4.625360833368197e-06, + "loss": 1.4073, + "step": 18376 + }, + { + "epoch": 2.31, + "grad_norm": 28.588939666748047, + "learning_rate": 4.624524118311509e-06, + "loss": 0.9256, + "step": 18377 + }, + { + "epoch": 2.31, + "grad_norm": 17.355749130249023, + "learning_rate": 4.623687403254822e-06, + "loss": 0.3963, + "step": 18378 + }, + { + "epoch": 2.31, + "grad_norm": 11.834649085998535, + "learning_rate": 4.622850688198135e-06, + "loss": 0.6306, + "step": 18379 + }, + { + "epoch": 2.31, + "grad_norm": 5.803377151489258, + "learning_rate": 4.6220139731414474e-06, + "loss": 0.364, + "step": 18380 + }, + { + "epoch": 2.31, + "grad_norm": 22.250408172607422, + "learning_rate": 4.621177258084759e-06, + "loss": 1.0079, + "step": 18381 + }, + { + "epoch": 2.31, + "grad_norm": 116.30833435058594, + "learning_rate": 4.620340543028072e-06, + "loss": 2.0143, + "step": 18382 + }, + { + "epoch": 2.31, + "grad_norm": 16.099084854125977, + "learning_rate": 4.619503827971385e-06, + "loss": 1.2797, + "step": 18383 + }, + { + "epoch": 2.31, + "grad_norm": 8.6182222366333, + "learning_rate": 4.618667112914697e-06, + "loss": 1.729, + "step": 18384 + }, + { + "epoch": 2.31, + "grad_norm": 12.962079048156738, + "learning_rate": 4.61783039785801e-06, + "loss": 0.3484, + "step": 18385 + }, + { + "epoch": 2.31, + "grad_norm": 22.328760147094727, + "learning_rate": 4.6169936828013226e-06, + "loss": 1.6485, + "step": 18386 + }, + { + "epoch": 2.31, + "grad_norm": 20.145177841186523, + "learning_rate": 4.616156967744635e-06, + "loss": 1.6653, + "step": 18387 + }, + { + "epoch": 2.31, + "grad_norm": 24.25278663635254, + "learning_rate": 4.615320252687947e-06, + "loss": 0.9849, + "step": 18388 + }, + { + "epoch": 2.31, + "grad_norm": 15.110502243041992, + "learning_rate": 4.61448353763126e-06, + "loss": 1.1584, + "step": 18389 + }, + { + "epoch": 2.31, + "grad_norm": 6.980915546417236, + "learning_rate": 4.613646822574573e-06, + "loss": 0.6934, + "step": 18390 + }, + { + "epoch": 2.31, + "grad_norm": 103.16082763671875, + "learning_rate": 4.612810107517885e-06, + "loss": 1.3066, + "step": 18391 + }, + { + "epoch": 2.31, + "grad_norm": 9.676926612854004, + "learning_rate": 4.611973392461198e-06, + "loss": 0.4227, + "step": 18392 + }, + { + "epoch": 2.31, + "grad_norm": 41.39181900024414, + "learning_rate": 4.6111366774045105e-06, + "loss": 1.914, + "step": 18393 + }, + { + "epoch": 2.31, + "grad_norm": 51.430519104003906, + "learning_rate": 4.6102999623478224e-06, + "loss": 1.3135, + "step": 18394 + }, + { + "epoch": 2.31, + "grad_norm": 12.843317031860352, + "learning_rate": 4.609463247291135e-06, + "loss": 2.5327, + "step": 18395 + }, + { + "epoch": 2.31, + "grad_norm": 60.07124710083008, + "learning_rate": 4.608626532234448e-06, + "loss": 1.1639, + "step": 18396 + }, + { + "epoch": 2.31, + "grad_norm": 18.975269317626953, + "learning_rate": 4.607789817177761e-06, + "loss": 0.9971, + "step": 18397 + }, + { + "epoch": 2.31, + "grad_norm": 17.61123275756836, + "learning_rate": 4.606953102121073e-06, + "loss": 0.5639, + "step": 18398 + }, + { + "epoch": 2.31, + "grad_norm": 17.708599090576172, + "learning_rate": 4.606116387064386e-06, + "loss": 1.7949, + "step": 18399 + }, + { + "epoch": 2.31, + "grad_norm": 12.286175727844238, + "learning_rate": 4.605279672007698e-06, + "loss": 0.6957, + "step": 18400 + }, + { + "epoch": 2.31, + "eval_loss": 0.0826125219464302, + "eval_runtime": 95.608, + "eval_samples_per_second": 37.047, + "eval_steps_per_second": 37.047, + "step": 18400 + }, + { + "epoch": 2.31, + "grad_norm": 18.678001403808594, + "learning_rate": 4.60444295695101e-06, + "loss": 1.502, + "step": 18401 + }, + { + "epoch": 2.31, + "grad_norm": 18.006704330444336, + "learning_rate": 4.603606241894323e-06, + "loss": 1.5553, + "step": 18402 + }, + { + "epoch": 2.31, + "grad_norm": 13.556264877319336, + "learning_rate": 4.602769526837636e-06, + "loss": 1.2797, + "step": 18403 + }, + { + "epoch": 2.31, + "grad_norm": 85.38465118408203, + "learning_rate": 4.601932811780949e-06, + "loss": 3.1793, + "step": 18404 + }, + { + "epoch": 2.31, + "grad_norm": 48.9052848815918, + "learning_rate": 4.601096096724261e-06, + "loss": 1.9324, + "step": 18405 + }, + { + "epoch": 2.31, + "grad_norm": 15.101606369018555, + "learning_rate": 4.6002593816675735e-06, + "loss": 0.5255, + "step": 18406 + }, + { + "epoch": 2.31, + "grad_norm": 15.13419246673584, + "learning_rate": 4.599422666610886e-06, + "loss": 0.7577, + "step": 18407 + }, + { + "epoch": 2.31, + "grad_norm": 14.737383842468262, + "learning_rate": 4.598585951554198e-06, + "loss": 1.2584, + "step": 18408 + }, + { + "epoch": 2.31, + "grad_norm": 7.4333295822143555, + "learning_rate": 4.597749236497511e-06, + "loss": 0.402, + "step": 18409 + }, + { + "epoch": 2.31, + "grad_norm": 61.6894645690918, + "learning_rate": 4.596912521440824e-06, + "loss": 1.2618, + "step": 18410 + }, + { + "epoch": 2.31, + "grad_norm": 8.353595733642578, + "learning_rate": 4.596075806384137e-06, + "loss": 0.5188, + "step": 18411 + }, + { + "epoch": 2.31, + "grad_norm": 12.857970237731934, + "learning_rate": 4.595239091327449e-06, + "loss": 1.075, + "step": 18412 + }, + { + "epoch": 2.31, + "grad_norm": 10.973597526550293, + "learning_rate": 4.5944023762707614e-06, + "loss": 1.0918, + "step": 18413 + }, + { + "epoch": 2.31, + "grad_norm": 19.118207931518555, + "learning_rate": 4.593565661214074e-06, + "loss": 1.0175, + "step": 18414 + }, + { + "epoch": 2.31, + "grad_norm": 23.525999069213867, + "learning_rate": 4.592728946157386e-06, + "loss": 1.0776, + "step": 18415 + }, + { + "epoch": 2.31, + "grad_norm": 5.870167255401611, + "learning_rate": 4.591892231100699e-06, + "loss": 1.8001, + "step": 18416 + }, + { + "epoch": 2.31, + "grad_norm": 7.887999057769775, + "learning_rate": 4.591055516044012e-06, + "loss": 1.8062, + "step": 18417 + }, + { + "epoch": 2.31, + "grad_norm": 10.264204025268555, + "learning_rate": 4.590218800987325e-06, + "loss": 0.6321, + "step": 18418 + }, + { + "epoch": 2.31, + "grad_norm": 17.61655616760254, + "learning_rate": 4.5893820859306366e-06, + "loss": 1.7462, + "step": 18419 + }, + { + "epoch": 2.31, + "grad_norm": 39.52225875854492, + "learning_rate": 4.588545370873949e-06, + "loss": 0.6963, + "step": 18420 + }, + { + "epoch": 2.31, + "grad_norm": 14.193148612976074, + "learning_rate": 4.587708655817262e-06, + "loss": 1.3271, + "step": 18421 + }, + { + "epoch": 2.31, + "grad_norm": 14.687078475952148, + "learning_rate": 4.586871940760574e-06, + "loss": 0.9129, + "step": 18422 + }, + { + "epoch": 2.31, + "grad_norm": 42.63068771362305, + "learning_rate": 4.586035225703887e-06, + "loss": 0.7915, + "step": 18423 + }, + { + "epoch": 2.31, + "grad_norm": 92.52960968017578, + "learning_rate": 4.5851985106472e-06, + "loss": 1.8236, + "step": 18424 + }, + { + "epoch": 2.31, + "grad_norm": 11.0745267868042, + "learning_rate": 4.584361795590512e-06, + "loss": 0.7362, + "step": 18425 + }, + { + "epoch": 2.31, + "grad_norm": 7.666187763214111, + "learning_rate": 4.5835250805338245e-06, + "loss": 0.2181, + "step": 18426 + }, + { + "epoch": 2.31, + "grad_norm": 41.562591552734375, + "learning_rate": 4.582688365477137e-06, + "loss": 1.4681, + "step": 18427 + }, + { + "epoch": 2.31, + "grad_norm": 11.841303825378418, + "learning_rate": 4.58185165042045e-06, + "loss": 1.4834, + "step": 18428 + }, + { + "epoch": 2.31, + "grad_norm": 19.387245178222656, + "learning_rate": 4.581014935363762e-06, + "loss": 1.9407, + "step": 18429 + }, + { + "epoch": 2.31, + "grad_norm": 7.08765172958374, + "learning_rate": 4.580178220307075e-06, + "loss": 0.6848, + "step": 18430 + }, + { + "epoch": 2.31, + "grad_norm": 16.13463592529297, + "learning_rate": 4.579341505250388e-06, + "loss": 0.6158, + "step": 18431 + }, + { + "epoch": 2.31, + "grad_norm": 16.1623477935791, + "learning_rate": 4.5785047901937e-06, + "loss": 1.6799, + "step": 18432 + }, + { + "epoch": 2.31, + "grad_norm": 16.288482666015625, + "learning_rate": 4.577668075137012e-06, + "loss": 0.726, + "step": 18433 + }, + { + "epoch": 2.31, + "grad_norm": 34.15134811401367, + "learning_rate": 4.576831360080325e-06, + "loss": 0.9051, + "step": 18434 + }, + { + "epoch": 2.31, + "grad_norm": 48.93670654296875, + "learning_rate": 4.575994645023637e-06, + "loss": 1.1806, + "step": 18435 + }, + { + "epoch": 2.31, + "grad_norm": 67.82980346679688, + "learning_rate": 4.57515792996695e-06, + "loss": 1.4423, + "step": 18436 + }, + { + "epoch": 2.31, + "grad_norm": 7.230655670166016, + "learning_rate": 4.574321214910263e-06, + "loss": 0.3903, + "step": 18437 + }, + { + "epoch": 2.31, + "grad_norm": 18.597454071044922, + "learning_rate": 4.5734844998535756e-06, + "loss": 1.1491, + "step": 18438 + }, + { + "epoch": 2.31, + "grad_norm": 6.0403242111206055, + "learning_rate": 4.5726477847968875e-06, + "loss": 0.9506, + "step": 18439 + }, + { + "epoch": 2.31, + "grad_norm": 18.730148315429688, + "learning_rate": 4.5718110697402e-06, + "loss": 1.3363, + "step": 18440 + }, + { + "epoch": 2.31, + "grad_norm": 30.04512596130371, + "learning_rate": 4.570974354683513e-06, + "loss": 1.4447, + "step": 18441 + }, + { + "epoch": 2.31, + "grad_norm": 9.06648063659668, + "learning_rate": 4.570137639626825e-06, + "loss": 0.5994, + "step": 18442 + }, + { + "epoch": 2.31, + "grad_norm": 194.48912048339844, + "learning_rate": 4.569300924570138e-06, + "loss": 1.0488, + "step": 18443 + }, + { + "epoch": 2.31, + "grad_norm": 12.337285995483398, + "learning_rate": 4.568464209513451e-06, + "loss": 0.8462, + "step": 18444 + }, + { + "epoch": 2.31, + "grad_norm": 15.318734169006348, + "learning_rate": 4.5676274944567635e-06, + "loss": 0.4775, + "step": 18445 + }, + { + "epoch": 2.31, + "grad_norm": 15.811105728149414, + "learning_rate": 4.5667907794000754e-06, + "loss": 0.4122, + "step": 18446 + }, + { + "epoch": 2.32, + "grad_norm": 10.545247077941895, + "learning_rate": 4.565954064343388e-06, + "loss": 1.5772, + "step": 18447 + }, + { + "epoch": 2.32, + "grad_norm": 9.892328262329102, + "learning_rate": 4.565117349286701e-06, + "loss": 0.3988, + "step": 18448 + }, + { + "epoch": 2.32, + "grad_norm": 5.395293235778809, + "learning_rate": 4.564280634230013e-06, + "loss": 1.4455, + "step": 18449 + }, + { + "epoch": 2.32, + "grad_norm": 5.257930278778076, + "learning_rate": 4.563443919173326e-06, + "loss": 0.6387, + "step": 18450 + }, + { + "epoch": 2.32, + "grad_norm": 23.21833038330078, + "learning_rate": 4.562607204116639e-06, + "loss": 1.8555, + "step": 18451 + }, + { + "epoch": 2.32, + "grad_norm": 39.41880416870117, + "learning_rate": 4.561770489059951e-06, + "loss": 0.9496, + "step": 18452 + }, + { + "epoch": 2.32, + "grad_norm": 30.933761596679688, + "learning_rate": 4.560933774003263e-06, + "loss": 0.8729, + "step": 18453 + }, + { + "epoch": 2.32, + "grad_norm": 70.10780334472656, + "learning_rate": 4.560097058946576e-06, + "loss": 1.2494, + "step": 18454 + }, + { + "epoch": 2.32, + "grad_norm": 14.925946235656738, + "learning_rate": 4.559260343889889e-06, + "loss": 0.3035, + "step": 18455 + }, + { + "epoch": 2.32, + "grad_norm": 6.578594207763672, + "learning_rate": 4.558423628833201e-06, + "loss": 0.5906, + "step": 18456 + }, + { + "epoch": 2.32, + "grad_norm": 17.706449508666992, + "learning_rate": 4.557586913776514e-06, + "loss": 0.6118, + "step": 18457 + }, + { + "epoch": 2.32, + "grad_norm": 10.585360527038574, + "learning_rate": 4.5567501987198265e-06, + "loss": 0.2812, + "step": 18458 + }, + { + "epoch": 2.32, + "grad_norm": 13.294159889221191, + "learning_rate": 4.555913483663139e-06, + "loss": 0.7182, + "step": 18459 + }, + { + "epoch": 2.32, + "grad_norm": 6.902012348175049, + "learning_rate": 4.555076768606451e-06, + "loss": 0.3839, + "step": 18460 + }, + { + "epoch": 2.32, + "grad_norm": 12.276602745056152, + "learning_rate": 4.554240053549764e-06, + "loss": 0.9393, + "step": 18461 + }, + { + "epoch": 2.32, + "grad_norm": 7.4398651123046875, + "learning_rate": 4.553403338493077e-06, + "loss": 0.6542, + "step": 18462 + }, + { + "epoch": 2.32, + "grad_norm": 35.33179473876953, + "learning_rate": 4.552566623436389e-06, + "loss": 0.4444, + "step": 18463 + }, + { + "epoch": 2.32, + "grad_norm": 24.448781967163086, + "learning_rate": 4.551729908379702e-06, + "loss": 0.9474, + "step": 18464 + }, + { + "epoch": 2.32, + "grad_norm": 3.1484103202819824, + "learning_rate": 4.5508931933230144e-06, + "loss": 0.0949, + "step": 18465 + }, + { + "epoch": 2.32, + "grad_norm": 26.908098220825195, + "learning_rate": 4.550056478266327e-06, + "loss": 1.1604, + "step": 18466 + }, + { + "epoch": 2.32, + "grad_norm": 31.584924697875977, + "learning_rate": 4.549219763209639e-06, + "loss": 1.0898, + "step": 18467 + }, + { + "epoch": 2.32, + "grad_norm": 8.06916332244873, + "learning_rate": 4.548383048152952e-06, + "loss": 0.2024, + "step": 18468 + }, + { + "epoch": 2.32, + "grad_norm": 53.92453384399414, + "learning_rate": 4.547546333096265e-06, + "loss": 0.8322, + "step": 18469 + }, + { + "epoch": 2.32, + "grad_norm": 10.829703330993652, + "learning_rate": 4.546709618039577e-06, + "loss": 0.546, + "step": 18470 + }, + { + "epoch": 2.32, + "grad_norm": 15.220983505249023, + "learning_rate": 4.5458729029828895e-06, + "loss": 1.5201, + "step": 18471 + }, + { + "epoch": 2.32, + "grad_norm": 48.43095779418945, + "learning_rate": 4.545036187926202e-06, + "loss": 1.083, + "step": 18472 + }, + { + "epoch": 2.32, + "grad_norm": 34.10857391357422, + "learning_rate": 4.544199472869515e-06, + "loss": 1.2666, + "step": 18473 + }, + { + "epoch": 2.32, + "grad_norm": 38.154232025146484, + "learning_rate": 4.543362757812827e-06, + "loss": 2.316, + "step": 18474 + }, + { + "epoch": 2.32, + "grad_norm": 19.14337921142578, + "learning_rate": 4.54252604275614e-06, + "loss": 1.2413, + "step": 18475 + }, + { + "epoch": 2.32, + "grad_norm": 60.2432975769043, + "learning_rate": 4.541689327699452e-06, + "loss": 1.3024, + "step": 18476 + }, + { + "epoch": 2.32, + "grad_norm": 12.001728057861328, + "learning_rate": 4.540852612642765e-06, + "loss": 1.1734, + "step": 18477 + }, + { + "epoch": 2.32, + "grad_norm": 6.172059535980225, + "learning_rate": 4.5400158975860775e-06, + "loss": 0.5006, + "step": 18478 + }, + { + "epoch": 2.32, + "grad_norm": 17.862943649291992, + "learning_rate": 4.53917918252939e-06, + "loss": 0.7339, + "step": 18479 + }, + { + "epoch": 2.32, + "grad_norm": 6.126031398773193, + "learning_rate": 4.538342467472703e-06, + "loss": 0.3133, + "step": 18480 + }, + { + "epoch": 2.32, + "grad_norm": 4.250203609466553, + "learning_rate": 4.537505752416015e-06, + "loss": 0.1674, + "step": 18481 + }, + { + "epoch": 2.32, + "grad_norm": 9.197067260742188, + "learning_rate": 4.536669037359328e-06, + "loss": 0.5205, + "step": 18482 + }, + { + "epoch": 2.32, + "grad_norm": 9.302022933959961, + "learning_rate": 4.53583232230264e-06, + "loss": 0.3086, + "step": 18483 + }, + { + "epoch": 2.32, + "grad_norm": 24.13341522216797, + "learning_rate": 4.5349956072459526e-06, + "loss": 0.71, + "step": 18484 + }, + { + "epoch": 2.32, + "grad_norm": 4.410804748535156, + "learning_rate": 4.534158892189265e-06, + "loss": 0.2536, + "step": 18485 + }, + { + "epoch": 2.32, + "grad_norm": 16.58905792236328, + "learning_rate": 4.533322177132578e-06, + "loss": 0.5471, + "step": 18486 + }, + { + "epoch": 2.32, + "grad_norm": 9.57160472869873, + "learning_rate": 4.532485462075891e-06, + "loss": 0.5688, + "step": 18487 + }, + { + "epoch": 2.32, + "grad_norm": 86.57952117919922, + "learning_rate": 4.531648747019203e-06, + "loss": 1.9876, + "step": 18488 + }, + { + "epoch": 2.32, + "grad_norm": 29.423912048339844, + "learning_rate": 4.530812031962516e-06, + "loss": 0.0888, + "step": 18489 + }, + { + "epoch": 2.32, + "grad_norm": 3.2133326530456543, + "learning_rate": 4.529975316905828e-06, + "loss": 0.2878, + "step": 18490 + }, + { + "epoch": 2.32, + "grad_norm": 44.775272369384766, + "learning_rate": 4.5291386018491405e-06, + "loss": 0.8185, + "step": 18491 + }, + { + "epoch": 2.32, + "grad_norm": 22.77064323425293, + "learning_rate": 4.528301886792453e-06, + "loss": 1.3059, + "step": 18492 + }, + { + "epoch": 2.32, + "grad_norm": 18.844072341918945, + "learning_rate": 4.527465171735766e-06, + "loss": 1.5743, + "step": 18493 + }, + { + "epoch": 2.32, + "grad_norm": 4.893050670623779, + "learning_rate": 4.526628456679079e-06, + "loss": 0.1682, + "step": 18494 + }, + { + "epoch": 2.32, + "grad_norm": 9.104093551635742, + "learning_rate": 4.525791741622391e-06, + "loss": 0.1361, + "step": 18495 + }, + { + "epoch": 2.32, + "grad_norm": 14.211118698120117, + "learning_rate": 4.524955026565704e-06, + "loss": 0.8007, + "step": 18496 + }, + { + "epoch": 2.32, + "grad_norm": 19.555896759033203, + "learning_rate": 4.524118311509016e-06, + "loss": 2.5899, + "step": 18497 + }, + { + "epoch": 2.32, + "grad_norm": 24.445524215698242, + "learning_rate": 4.523281596452328e-06, + "loss": 1.299, + "step": 18498 + }, + { + "epoch": 2.32, + "grad_norm": 9.455183982849121, + "learning_rate": 4.522444881395641e-06, + "loss": 0.2394, + "step": 18499 + }, + { + "epoch": 2.32, + "grad_norm": 15.775891304016113, + "learning_rate": 4.521608166338954e-06, + "loss": 1.1262, + "step": 18500 + }, + { + "epoch": 2.32, + "grad_norm": 8.089310646057129, + "learning_rate": 4.520771451282267e-06, + "loss": 0.1442, + "step": 18501 + }, + { + "epoch": 2.32, + "grad_norm": 16.34964942932129, + "learning_rate": 4.519934736225579e-06, + "loss": 1.0115, + "step": 18502 + }, + { + "epoch": 2.32, + "grad_norm": 28.648630142211914, + "learning_rate": 4.5190980211688916e-06, + "loss": 0.2435, + "step": 18503 + }, + { + "epoch": 2.32, + "grad_norm": 7.589015483856201, + "learning_rate": 4.5182613061122035e-06, + "loss": 1.3313, + "step": 18504 + }, + { + "epoch": 2.32, + "grad_norm": 16.20108413696289, + "learning_rate": 4.517424591055516e-06, + "loss": 0.7038, + "step": 18505 + }, + { + "epoch": 2.32, + "grad_norm": 9.961381912231445, + "learning_rate": 4.516587875998829e-06, + "loss": 0.3543, + "step": 18506 + }, + { + "epoch": 2.32, + "grad_norm": 15.873948097229004, + "learning_rate": 4.515751160942142e-06, + "loss": 2.2536, + "step": 18507 + }, + { + "epoch": 2.32, + "grad_norm": 15.497950553894043, + "learning_rate": 4.514914445885454e-06, + "loss": 0.6082, + "step": 18508 + }, + { + "epoch": 2.32, + "grad_norm": 5.966700553894043, + "learning_rate": 4.514077730828767e-06, + "loss": 0.3721, + "step": 18509 + }, + { + "epoch": 2.32, + "grad_norm": 56.19346237182617, + "learning_rate": 4.5132410157720795e-06, + "loss": 1.6361, + "step": 18510 + }, + { + "epoch": 2.32, + "grad_norm": 76.994873046875, + "learning_rate": 4.5124043007153914e-06, + "loss": 1.6389, + "step": 18511 + }, + { + "epoch": 2.32, + "grad_norm": 18.834623336791992, + "learning_rate": 4.511567585658704e-06, + "loss": 0.6549, + "step": 18512 + }, + { + "epoch": 2.32, + "grad_norm": 37.12870788574219, + "learning_rate": 4.510730870602017e-06, + "loss": 1.274, + "step": 18513 + }, + { + "epoch": 2.32, + "grad_norm": 18.4162654876709, + "learning_rate": 4.50989415554533e-06, + "loss": 0.626, + "step": 18514 + }, + { + "epoch": 2.32, + "grad_norm": 183.23519897460938, + "learning_rate": 4.509057440488642e-06, + "loss": 0.9285, + "step": 18515 + }, + { + "epoch": 2.32, + "grad_norm": 31.22499656677246, + "learning_rate": 4.508220725431955e-06, + "loss": 0.8322, + "step": 18516 + }, + { + "epoch": 2.32, + "grad_norm": 43.19390869140625, + "learning_rate": 4.5073840103752666e-06, + "loss": 1.4242, + "step": 18517 + }, + { + "epoch": 2.32, + "grad_norm": 10.707412719726562, + "learning_rate": 4.506547295318579e-06, + "loss": 1.2822, + "step": 18518 + }, + { + "epoch": 2.32, + "grad_norm": 18.027921676635742, + "learning_rate": 4.505710580261892e-06, + "loss": 0.9674, + "step": 18519 + }, + { + "epoch": 2.32, + "grad_norm": 24.536752700805664, + "learning_rate": 4.504873865205205e-06, + "loss": 1.9647, + "step": 18520 + }, + { + "epoch": 2.32, + "grad_norm": 36.74241256713867, + "learning_rate": 4.504037150148518e-06, + "loss": 1.7273, + "step": 18521 + }, + { + "epoch": 2.32, + "grad_norm": 21.1619873046875, + "learning_rate": 4.50320043509183e-06, + "loss": 0.9094, + "step": 18522 + }, + { + "epoch": 2.32, + "grad_norm": 92.76055145263672, + "learning_rate": 4.5023637200351425e-06, + "loss": 2.894, + "step": 18523 + }, + { + "epoch": 2.32, + "grad_norm": 13.162256240844727, + "learning_rate": 4.5015270049784545e-06, + "loss": 0.9359, + "step": 18524 + }, + { + "epoch": 2.32, + "grad_norm": 19.254579544067383, + "learning_rate": 4.500690289921767e-06, + "loss": 3.4928, + "step": 18525 + }, + { + "epoch": 2.32, + "grad_norm": 33.88831329345703, + "learning_rate": 4.49985357486508e-06, + "loss": 3.0827, + "step": 18526 + }, + { + "epoch": 2.33, + "grad_norm": 8.00139045715332, + "learning_rate": 4.499016859808393e-06, + "loss": 0.3537, + "step": 18527 + }, + { + "epoch": 2.33, + "grad_norm": 18.81523895263672, + "learning_rate": 4.498180144751706e-06, + "loss": 1.4291, + "step": 18528 + }, + { + "epoch": 2.33, + "grad_norm": 28.315595626831055, + "learning_rate": 4.497343429695018e-06, + "loss": 1.0837, + "step": 18529 + }, + { + "epoch": 2.33, + "grad_norm": 11.951425552368164, + "learning_rate": 4.4965067146383304e-06, + "loss": 1.8781, + "step": 18530 + }, + { + "epoch": 2.33, + "grad_norm": 27.449447631835938, + "learning_rate": 4.495669999581642e-06, + "loss": 1.3868, + "step": 18531 + }, + { + "epoch": 2.33, + "grad_norm": 16.96086883544922, + "learning_rate": 4.494833284524955e-06, + "loss": 0.7793, + "step": 18532 + }, + { + "epoch": 2.33, + "grad_norm": 19.07918930053711, + "learning_rate": 4.493996569468268e-06, + "loss": 2.4194, + "step": 18533 + }, + { + "epoch": 2.33, + "grad_norm": 18.13289451599121, + "learning_rate": 4.493159854411581e-06, + "loss": 1.1208, + "step": 18534 + }, + { + "epoch": 2.33, + "grad_norm": 7.961818218231201, + "learning_rate": 4.492323139354894e-06, + "loss": 0.3971, + "step": 18535 + }, + { + "epoch": 2.33, + "grad_norm": 9.359652519226074, + "learning_rate": 4.4914864242982056e-06, + "loss": 1.1236, + "step": 18536 + }, + { + "epoch": 2.33, + "grad_norm": 10.581709861755371, + "learning_rate": 4.490649709241518e-06, + "loss": 0.3395, + "step": 18537 + }, + { + "epoch": 2.33, + "grad_norm": 10.72530746459961, + "learning_rate": 4.48981299418483e-06, + "loss": 0.5704, + "step": 18538 + }, + { + "epoch": 2.33, + "grad_norm": 19.760509490966797, + "learning_rate": 4.488976279128143e-06, + "loss": 0.24, + "step": 18539 + }, + { + "epoch": 2.33, + "grad_norm": 63.88747024536133, + "learning_rate": 4.488139564071456e-06, + "loss": 1.9412, + "step": 18540 + }, + { + "epoch": 2.33, + "grad_norm": 8.674443244934082, + "learning_rate": 4.487302849014769e-06, + "loss": 0.7178, + "step": 18541 + }, + { + "epoch": 2.33, + "grad_norm": 12.277080535888672, + "learning_rate": 4.4864661339580815e-06, + "loss": 0.3274, + "step": 18542 + }, + { + "epoch": 2.33, + "grad_norm": 19.937883377075195, + "learning_rate": 4.4856294189013935e-06, + "loss": 1.2808, + "step": 18543 + }, + { + "epoch": 2.33, + "grad_norm": 12.608200073242188, + "learning_rate": 4.484792703844706e-06, + "loss": 1.8176, + "step": 18544 + }, + { + "epoch": 2.33, + "grad_norm": 11.698739051818848, + "learning_rate": 4.483955988788018e-06, + "loss": 0.6196, + "step": 18545 + }, + { + "epoch": 2.33, + "grad_norm": 9.430562019348145, + "learning_rate": 4.483119273731331e-06, + "loss": 0.9752, + "step": 18546 + }, + { + "epoch": 2.33, + "grad_norm": 6.099495887756348, + "learning_rate": 4.482282558674644e-06, + "loss": 0.3059, + "step": 18547 + }, + { + "epoch": 2.33, + "grad_norm": 11.467565536499023, + "learning_rate": 4.481445843617957e-06, + "loss": 1.7283, + "step": 18548 + }, + { + "epoch": 2.33, + "grad_norm": 177.7603302001953, + "learning_rate": 4.4806091285612694e-06, + "loss": 2.3344, + "step": 18549 + }, + { + "epoch": 2.33, + "grad_norm": 19.29451560974121, + "learning_rate": 4.479772413504581e-06, + "loss": 0.4299, + "step": 18550 + }, + { + "epoch": 2.33, + "grad_norm": 20.672033309936523, + "learning_rate": 4.478935698447893e-06, + "loss": 0.8978, + "step": 18551 + }, + { + "epoch": 2.33, + "grad_norm": 9.668931007385254, + "learning_rate": 4.478098983391206e-06, + "loss": 0.6274, + "step": 18552 + }, + { + "epoch": 2.33, + "grad_norm": 8.82039737701416, + "learning_rate": 4.477262268334519e-06, + "loss": 1.276, + "step": 18553 + }, + { + "epoch": 2.33, + "grad_norm": 24.4532527923584, + "learning_rate": 4.476425553277832e-06, + "loss": 1.1815, + "step": 18554 + }, + { + "epoch": 2.33, + "grad_norm": 27.42000389099121, + "learning_rate": 4.4755888382211446e-06, + "loss": 1.7023, + "step": 18555 + }, + { + "epoch": 2.33, + "grad_norm": 19.05655860900879, + "learning_rate": 4.474752123164457e-06, + "loss": 1.3701, + "step": 18556 + }, + { + "epoch": 2.33, + "grad_norm": 31.000120162963867, + "learning_rate": 4.473915408107769e-06, + "loss": 0.8982, + "step": 18557 + }, + { + "epoch": 2.33, + "grad_norm": 8.06107234954834, + "learning_rate": 4.473078693051081e-06, + "loss": 1.0744, + "step": 18558 + }, + { + "epoch": 2.33, + "grad_norm": 5.505249977111816, + "learning_rate": 4.472241977994394e-06, + "loss": 0.4442, + "step": 18559 + }, + { + "epoch": 2.33, + "grad_norm": 31.695676803588867, + "learning_rate": 4.471405262937707e-06, + "loss": 0.9121, + "step": 18560 + }, + { + "epoch": 2.33, + "grad_norm": 130.29441833496094, + "learning_rate": 4.47056854788102e-06, + "loss": 0.6397, + "step": 18561 + }, + { + "epoch": 2.33, + "grad_norm": 25.62512969970703, + "learning_rate": 4.4697318328243325e-06, + "loss": 2.3557, + "step": 18562 + }, + { + "epoch": 2.33, + "grad_norm": 3.5183045864105225, + "learning_rate": 4.4688951177676444e-06, + "loss": 0.2687, + "step": 18563 + }, + { + "epoch": 2.33, + "grad_norm": 1.5712084770202637, + "learning_rate": 4.468058402710957e-06, + "loss": 0.0653, + "step": 18564 + }, + { + "epoch": 2.33, + "grad_norm": 1.8861128091812134, + "learning_rate": 4.467221687654269e-06, + "loss": 0.0338, + "step": 18565 + }, + { + "epoch": 2.33, + "grad_norm": 11.420167922973633, + "learning_rate": 4.466384972597582e-06, + "loss": 1.7012, + "step": 18566 + }, + { + "epoch": 2.33, + "grad_norm": 13.980409622192383, + "learning_rate": 4.465548257540895e-06, + "loss": 0.7014, + "step": 18567 + }, + { + "epoch": 2.33, + "grad_norm": 23.168249130249023, + "learning_rate": 4.464711542484208e-06, + "loss": 0.7185, + "step": 18568 + }, + { + "epoch": 2.33, + "grad_norm": 8.879084587097168, + "learning_rate": 4.46387482742752e-06, + "loss": 0.6503, + "step": 18569 + }, + { + "epoch": 2.33, + "grad_norm": 8.347853660583496, + "learning_rate": 4.463038112370832e-06, + "loss": 0.7873, + "step": 18570 + }, + { + "epoch": 2.33, + "grad_norm": 27.89021110534668, + "learning_rate": 4.462201397314145e-06, + "loss": 3.0137, + "step": 18571 + }, + { + "epoch": 2.33, + "grad_norm": 24.764833450317383, + "learning_rate": 4.461364682257457e-06, + "loss": 0.7871, + "step": 18572 + }, + { + "epoch": 2.33, + "grad_norm": 6.496964454650879, + "learning_rate": 4.46052796720077e-06, + "loss": 0.9638, + "step": 18573 + }, + { + "epoch": 2.33, + "grad_norm": 7.120611667633057, + "learning_rate": 4.459691252144083e-06, + "loss": 0.2862, + "step": 18574 + }, + { + "epoch": 2.33, + "grad_norm": 24.407320022583008, + "learning_rate": 4.4588545370873955e-06, + "loss": 0.5231, + "step": 18575 + }, + { + "epoch": 2.33, + "grad_norm": 7.571482181549072, + "learning_rate": 4.458017822030708e-06, + "loss": 0.6017, + "step": 18576 + }, + { + "epoch": 2.33, + "grad_norm": 20.153623580932617, + "learning_rate": 4.45718110697402e-06, + "loss": 0.7772, + "step": 18577 + }, + { + "epoch": 2.33, + "grad_norm": 10.780214309692383, + "learning_rate": 4.456344391917333e-06, + "loss": 0.4093, + "step": 18578 + }, + { + "epoch": 2.33, + "grad_norm": 134.1143035888672, + "learning_rate": 4.455507676860645e-06, + "loss": 2.7139, + "step": 18579 + }, + { + "epoch": 2.33, + "grad_norm": 22.914356231689453, + "learning_rate": 4.454670961803958e-06, + "loss": 1.1819, + "step": 18580 + }, + { + "epoch": 2.33, + "grad_norm": 16.470748901367188, + "learning_rate": 4.453834246747271e-06, + "loss": 1.6799, + "step": 18581 + }, + { + "epoch": 2.33, + "grad_norm": 28.637752532958984, + "learning_rate": 4.4529975316905834e-06, + "loss": 0.5893, + "step": 18582 + }, + { + "epoch": 2.33, + "grad_norm": 17.95598793029785, + "learning_rate": 4.452160816633896e-06, + "loss": 1.992, + "step": 18583 + }, + { + "epoch": 2.33, + "grad_norm": 16.640546798706055, + "learning_rate": 4.451324101577208e-06, + "loss": 1.4794, + "step": 18584 + }, + { + "epoch": 2.33, + "grad_norm": 12.653728485107422, + "learning_rate": 4.450487386520521e-06, + "loss": 0.5668, + "step": 18585 + }, + { + "epoch": 2.33, + "grad_norm": 21.595863342285156, + "learning_rate": 4.449650671463833e-06, + "loss": 0.7837, + "step": 18586 + }, + { + "epoch": 2.33, + "grad_norm": 5.787606239318848, + "learning_rate": 4.448813956407146e-06, + "loss": 1.5406, + "step": 18587 + }, + { + "epoch": 2.33, + "grad_norm": 21.696575164794922, + "learning_rate": 4.4479772413504586e-06, + "loss": 0.848, + "step": 18588 + }, + { + "epoch": 2.33, + "grad_norm": 11.923542022705078, + "learning_rate": 4.447140526293771e-06, + "loss": 1.5221, + "step": 18589 + }, + { + "epoch": 2.33, + "grad_norm": 8.494603157043457, + "learning_rate": 4.446303811237084e-06, + "loss": 0.9219, + "step": 18590 + }, + { + "epoch": 2.33, + "grad_norm": 15.257559776306152, + "learning_rate": 4.445467096180396e-06, + "loss": 1.4104, + "step": 18591 + }, + { + "epoch": 2.33, + "grad_norm": 8.365575790405273, + "learning_rate": 4.444630381123708e-06, + "loss": 1.4944, + "step": 18592 + }, + { + "epoch": 2.33, + "grad_norm": 10.471525192260742, + "learning_rate": 4.443793666067021e-06, + "loss": 1.1119, + "step": 18593 + }, + { + "epoch": 2.33, + "grad_norm": 16.419118881225586, + "learning_rate": 4.442956951010334e-06, + "loss": 0.9984, + "step": 18594 + }, + { + "epoch": 2.33, + "grad_norm": 29.719350814819336, + "learning_rate": 4.4421202359536465e-06, + "loss": 3.2967, + "step": 18595 + }, + { + "epoch": 2.33, + "grad_norm": 12.81863021850586, + "learning_rate": 4.441283520896959e-06, + "loss": 0.3091, + "step": 18596 + }, + { + "epoch": 2.33, + "grad_norm": 24.091392517089844, + "learning_rate": 4.440446805840272e-06, + "loss": 0.2168, + "step": 18597 + }, + { + "epoch": 2.33, + "grad_norm": 14.132325172424316, + "learning_rate": 4.439610090783584e-06, + "loss": 0.8024, + "step": 18598 + }, + { + "epoch": 2.33, + "grad_norm": 26.080467224121094, + "learning_rate": 4.438773375726896e-06, + "loss": 2.0559, + "step": 18599 + }, + { + "epoch": 2.33, + "grad_norm": 14.235190391540527, + "learning_rate": 4.437936660670209e-06, + "loss": 1.5803, + "step": 18600 + }, + { + "epoch": 2.33, + "grad_norm": 10.642189979553223, + "learning_rate": 4.437099945613522e-06, + "loss": 1.5307, + "step": 18601 + }, + { + "epoch": 2.33, + "grad_norm": 10.55996036529541, + "learning_rate": 4.436263230556834e-06, + "loss": 0.5519, + "step": 18602 + }, + { + "epoch": 2.33, + "grad_norm": 17.189367294311523, + "learning_rate": 4.435426515500147e-06, + "loss": 1.8954, + "step": 18603 + }, + { + "epoch": 2.33, + "grad_norm": 17.33152961730957, + "learning_rate": 4.434589800443459e-06, + "loss": 1.3626, + "step": 18604 + }, + { + "epoch": 2.33, + "grad_norm": 16.388660430908203, + "learning_rate": 4.433753085386772e-06, + "loss": 0.9414, + "step": 18605 + }, + { + "epoch": 2.34, + "grad_norm": 13.897096633911133, + "learning_rate": 4.432916370330084e-06, + "loss": 1.7795, + "step": 18606 + }, + { + "epoch": 2.34, + "grad_norm": 74.13624572753906, + "learning_rate": 4.432079655273397e-06, + "loss": 0.8079, + "step": 18607 + }, + { + "epoch": 2.34, + "grad_norm": 23.349058151245117, + "learning_rate": 4.4312429402167095e-06, + "loss": 1.7441, + "step": 18608 + }, + { + "epoch": 2.34, + "grad_norm": 8.103655815124512, + "learning_rate": 4.430406225160022e-06, + "loss": 1.6228, + "step": 18609 + }, + { + "epoch": 2.34, + "grad_norm": 67.40815734863281, + "learning_rate": 4.429569510103335e-06, + "loss": 2.4471, + "step": 18610 + }, + { + "epoch": 2.34, + "grad_norm": 12.260972023010254, + "learning_rate": 4.428732795046647e-06, + "loss": 0.5967, + "step": 18611 + }, + { + "epoch": 2.34, + "grad_norm": 10.175232887268066, + "learning_rate": 4.42789607998996e-06, + "loss": 0.8402, + "step": 18612 + }, + { + "epoch": 2.34, + "grad_norm": 90.39164733886719, + "learning_rate": 4.427059364933272e-06, + "loss": 1.1611, + "step": 18613 + }, + { + "epoch": 2.34, + "grad_norm": 47.40072250366211, + "learning_rate": 4.426222649876585e-06, + "loss": 1.7785, + "step": 18614 + }, + { + "epoch": 2.34, + "grad_norm": 7.856509685516357, + "learning_rate": 4.4253859348198974e-06, + "loss": 0.5466, + "step": 18615 + }, + { + "epoch": 2.34, + "grad_norm": 10.867964744567871, + "learning_rate": 4.42454921976321e-06, + "loss": 0.3609, + "step": 18616 + }, + { + "epoch": 2.34, + "grad_norm": 10.657840728759766, + "learning_rate": 4.423712504706523e-06, + "loss": 0.6803, + "step": 18617 + }, + { + "epoch": 2.34, + "grad_norm": 5.646101474761963, + "learning_rate": 4.422875789649835e-06, + "loss": 0.2756, + "step": 18618 + }, + { + "epoch": 2.34, + "grad_norm": 14.785065650939941, + "learning_rate": 4.422039074593148e-06, + "loss": 0.9255, + "step": 18619 + }, + { + "epoch": 2.34, + "grad_norm": 17.859464645385742, + "learning_rate": 4.42120235953646e-06, + "loss": 0.3477, + "step": 18620 + }, + { + "epoch": 2.34, + "grad_norm": 6.709313869476318, + "learning_rate": 4.4203656444797725e-06, + "loss": 0.2985, + "step": 18621 + }, + { + "epoch": 2.34, + "grad_norm": 3.3436636924743652, + "learning_rate": 4.419528929423085e-06, + "loss": 0.1535, + "step": 18622 + }, + { + "epoch": 2.34, + "grad_norm": 8.863362312316895, + "learning_rate": 4.418692214366398e-06, + "loss": 0.7028, + "step": 18623 + }, + { + "epoch": 2.34, + "grad_norm": 8.461028099060059, + "learning_rate": 4.417855499309711e-06, + "loss": 1.6624, + "step": 18624 + }, + { + "epoch": 2.34, + "grad_norm": 4.642300605773926, + "learning_rate": 4.417018784253023e-06, + "loss": 0.1512, + "step": 18625 + }, + { + "epoch": 2.34, + "grad_norm": 6.611247539520264, + "learning_rate": 4.416182069196336e-06, + "loss": 1.3047, + "step": 18626 + }, + { + "epoch": 2.34, + "grad_norm": 23.731626510620117, + "learning_rate": 4.415345354139648e-06, + "loss": 0.9144, + "step": 18627 + }, + { + "epoch": 2.34, + "grad_norm": 10.380583763122559, + "learning_rate": 4.4145086390829605e-06, + "loss": 0.186, + "step": 18628 + }, + { + "epoch": 2.34, + "grad_norm": 13.79790210723877, + "learning_rate": 4.413671924026273e-06, + "loss": 0.5185, + "step": 18629 + }, + { + "epoch": 2.34, + "grad_norm": 22.302764892578125, + "learning_rate": 4.412835208969586e-06, + "loss": 1.3934, + "step": 18630 + }, + { + "epoch": 2.34, + "grad_norm": 259.0495300292969, + "learning_rate": 4.411998493912899e-06, + "loss": 1.516, + "step": 18631 + }, + { + "epoch": 2.34, + "grad_norm": 11.745481491088867, + "learning_rate": 4.411161778856211e-06, + "loss": 2.1137, + "step": 18632 + }, + { + "epoch": 2.34, + "grad_norm": 72.0488510131836, + "learning_rate": 4.410325063799523e-06, + "loss": 1.3036, + "step": 18633 + }, + { + "epoch": 2.34, + "grad_norm": 8.15693473815918, + "learning_rate": 4.4094883487428356e-06, + "loss": 0.3817, + "step": 18634 + }, + { + "epoch": 2.34, + "grad_norm": 9.523024559020996, + "learning_rate": 4.408651633686148e-06, + "loss": 1.111, + "step": 18635 + }, + { + "epoch": 2.34, + "grad_norm": 10.763395309448242, + "learning_rate": 4.407814918629461e-06, + "loss": 1.9269, + "step": 18636 + }, + { + "epoch": 2.34, + "grad_norm": 80.72051239013672, + "learning_rate": 4.406978203572774e-06, + "loss": 1.6392, + "step": 18637 + }, + { + "epoch": 2.34, + "grad_norm": 16.553476333618164, + "learning_rate": 4.406141488516087e-06, + "loss": 0.6152, + "step": 18638 + }, + { + "epoch": 2.34, + "grad_norm": 4.874630928039551, + "learning_rate": 4.405304773459399e-06, + "loss": 0.456, + "step": 18639 + }, + { + "epoch": 2.34, + "grad_norm": 47.115177154541016, + "learning_rate": 4.404468058402711e-06, + "loss": 0.8628, + "step": 18640 + }, + { + "epoch": 2.34, + "grad_norm": 8.854296684265137, + "learning_rate": 4.4036313433460235e-06, + "loss": 0.135, + "step": 18641 + }, + { + "epoch": 2.34, + "grad_norm": 13.961197853088379, + "learning_rate": 4.402794628289336e-06, + "loss": 1.3367, + "step": 18642 + }, + { + "epoch": 2.34, + "grad_norm": 5.6081061363220215, + "learning_rate": 4.401957913232649e-06, + "loss": 0.7036, + "step": 18643 + }, + { + "epoch": 2.34, + "grad_norm": 13.064496994018555, + "learning_rate": 4.401121198175962e-06, + "loss": 0.8997, + "step": 18644 + }, + { + "epoch": 2.34, + "grad_norm": 15.707348823547363, + "learning_rate": 4.400284483119274e-06, + "loss": 0.9013, + "step": 18645 + }, + { + "epoch": 2.34, + "grad_norm": 11.664067268371582, + "learning_rate": 4.399447768062587e-06, + "loss": 0.6092, + "step": 18646 + }, + { + "epoch": 2.34, + "grad_norm": 25.071107864379883, + "learning_rate": 4.398611053005899e-06, + "loss": 0.6235, + "step": 18647 + }, + { + "epoch": 2.34, + "grad_norm": 9.626585006713867, + "learning_rate": 4.397774337949211e-06, + "loss": 0.6089, + "step": 18648 + }, + { + "epoch": 2.34, + "grad_norm": 5.4906086921691895, + "learning_rate": 4.396937622892524e-06, + "loss": 0.3915, + "step": 18649 + }, + { + "epoch": 2.34, + "grad_norm": 21.848379135131836, + "learning_rate": 4.396100907835837e-06, + "loss": 2.2813, + "step": 18650 + }, + { + "epoch": 2.34, + "grad_norm": 100.59906768798828, + "learning_rate": 4.39526419277915e-06, + "loss": 1.7135, + "step": 18651 + }, + { + "epoch": 2.34, + "grad_norm": 4.7957444190979, + "learning_rate": 4.394427477722462e-06, + "loss": 0.1458, + "step": 18652 + }, + { + "epoch": 2.34, + "grad_norm": 6.3409037590026855, + "learning_rate": 4.3935907626657746e-06, + "loss": 1.2853, + "step": 18653 + }, + { + "epoch": 2.34, + "grad_norm": 21.08176040649414, + "learning_rate": 4.3927540476090865e-06, + "loss": 2.1313, + "step": 18654 + }, + { + "epoch": 2.34, + "grad_norm": 15.40987777709961, + "learning_rate": 4.391917332552399e-06, + "loss": 0.9029, + "step": 18655 + }, + { + "epoch": 2.34, + "grad_norm": 11.423988342285156, + "learning_rate": 4.391080617495712e-06, + "loss": 0.675, + "step": 18656 + }, + { + "epoch": 2.34, + "grad_norm": 12.644664764404297, + "learning_rate": 4.390243902439025e-06, + "loss": 0.3895, + "step": 18657 + }, + { + "epoch": 2.34, + "grad_norm": 8.19571304321289, + "learning_rate": 4.389407187382338e-06, + "loss": 0.1933, + "step": 18658 + }, + { + "epoch": 2.34, + "grad_norm": 16.45088005065918, + "learning_rate": 4.38857047232565e-06, + "loss": 0.5284, + "step": 18659 + }, + { + "epoch": 2.34, + "grad_norm": 43.56208419799805, + "learning_rate": 4.3877337572689625e-06, + "loss": 0.7396, + "step": 18660 + }, + { + "epoch": 2.34, + "grad_norm": 6.200404644012451, + "learning_rate": 4.3868970422122744e-06, + "loss": 0.8841, + "step": 18661 + }, + { + "epoch": 2.34, + "grad_norm": 18.50584602355957, + "learning_rate": 4.386060327155587e-06, + "loss": 2.0774, + "step": 18662 + }, + { + "epoch": 2.34, + "grad_norm": 122.0450668334961, + "learning_rate": 4.3852236120989e-06, + "loss": 0.9103, + "step": 18663 + }, + { + "epoch": 2.34, + "grad_norm": 15.36992073059082, + "learning_rate": 4.384386897042213e-06, + "loss": 0.8376, + "step": 18664 + }, + { + "epoch": 2.34, + "grad_norm": 15.341522216796875, + "learning_rate": 4.383550181985526e-06, + "loss": 1.083, + "step": 18665 + }, + { + "epoch": 2.34, + "grad_norm": 4.4258575439453125, + "learning_rate": 4.382713466928838e-06, + "loss": 0.4676, + "step": 18666 + }, + { + "epoch": 2.34, + "grad_norm": 2.513173818588257, + "learning_rate": 4.38187675187215e-06, + "loss": 0.0561, + "step": 18667 + }, + { + "epoch": 2.34, + "grad_norm": 16.44916534423828, + "learning_rate": 4.381040036815462e-06, + "loss": 1.1331, + "step": 18668 + }, + { + "epoch": 2.34, + "grad_norm": 27.556068420410156, + "learning_rate": 4.380203321758775e-06, + "loss": 2.3297, + "step": 18669 + }, + { + "epoch": 2.34, + "grad_norm": 19.5424861907959, + "learning_rate": 4.379366606702088e-06, + "loss": 1.7086, + "step": 18670 + }, + { + "epoch": 2.34, + "grad_norm": 17.760272979736328, + "learning_rate": 4.378529891645401e-06, + "loss": 0.8763, + "step": 18671 + }, + { + "epoch": 2.34, + "grad_norm": 8.427785873413086, + "learning_rate": 4.3776931765887136e-06, + "loss": 2.0768, + "step": 18672 + }, + { + "epoch": 2.34, + "grad_norm": 21.770397186279297, + "learning_rate": 4.3768564615320255e-06, + "loss": 0.9538, + "step": 18673 + }, + { + "epoch": 2.34, + "grad_norm": 14.882251739501953, + "learning_rate": 4.376019746475338e-06, + "loss": 0.4401, + "step": 18674 + }, + { + "epoch": 2.34, + "grad_norm": 24.103191375732422, + "learning_rate": 4.37518303141865e-06, + "loss": 3.058, + "step": 18675 + }, + { + "epoch": 2.34, + "grad_norm": 6.864442825317383, + "learning_rate": 4.374346316361963e-06, + "loss": 0.479, + "step": 18676 + }, + { + "epoch": 2.34, + "grad_norm": 45.01991271972656, + "learning_rate": 4.373509601305276e-06, + "loss": 1.2495, + "step": 18677 + }, + { + "epoch": 2.34, + "grad_norm": 33.04206085205078, + "learning_rate": 4.372672886248589e-06, + "loss": 0.9721, + "step": 18678 + }, + { + "epoch": 2.34, + "grad_norm": 83.6139144897461, + "learning_rate": 4.371836171191901e-06, + "loss": 2.5946, + "step": 18679 + }, + { + "epoch": 2.34, + "grad_norm": 26.812896728515625, + "learning_rate": 4.3709994561352134e-06, + "loss": 0.6091, + "step": 18680 + }, + { + "epoch": 2.34, + "grad_norm": 13.539735794067383, + "learning_rate": 4.370162741078526e-06, + "loss": 0.6702, + "step": 18681 + }, + { + "epoch": 2.34, + "grad_norm": 7.730309963226318, + "learning_rate": 4.369326026021838e-06, + "loss": 0.4636, + "step": 18682 + }, + { + "epoch": 2.34, + "grad_norm": 13.025385856628418, + "learning_rate": 4.368489310965151e-06, + "loss": 1.4847, + "step": 18683 + }, + { + "epoch": 2.34, + "grad_norm": 6.699886798858643, + "learning_rate": 4.367652595908464e-06, + "loss": 0.309, + "step": 18684 + }, + { + "epoch": 2.34, + "grad_norm": 10.381994247436523, + "learning_rate": 4.366815880851777e-06, + "loss": 0.4492, + "step": 18685 + }, + { + "epoch": 2.35, + "grad_norm": 13.109643936157227, + "learning_rate": 4.3659791657950886e-06, + "loss": 1.0229, + "step": 18686 + }, + { + "epoch": 2.35, + "grad_norm": 8.719829559326172, + "learning_rate": 4.365142450738401e-06, + "loss": 0.4225, + "step": 18687 + }, + { + "epoch": 2.35, + "grad_norm": 6.568873405456543, + "learning_rate": 4.364305735681713e-06, + "loss": 0.3474, + "step": 18688 + }, + { + "epoch": 2.35, + "grad_norm": 5.817454814910889, + "learning_rate": 4.363469020625026e-06, + "loss": 1.064, + "step": 18689 + }, + { + "epoch": 2.35, + "grad_norm": 43.017364501953125, + "learning_rate": 4.362632305568339e-06, + "loss": 1.0785, + "step": 18690 + }, + { + "epoch": 2.35, + "grad_norm": 13.391928672790527, + "learning_rate": 4.361795590511652e-06, + "loss": 0.4956, + "step": 18691 + }, + { + "epoch": 2.35, + "grad_norm": 19.47562599182129, + "learning_rate": 4.3609588754549645e-06, + "loss": 0.5848, + "step": 18692 + }, + { + "epoch": 2.35, + "grad_norm": 34.49109649658203, + "learning_rate": 4.3601221603982765e-06, + "loss": 2.0613, + "step": 18693 + }, + { + "epoch": 2.35, + "grad_norm": 9.013361930847168, + "learning_rate": 4.359285445341589e-06, + "loss": 0.4564, + "step": 18694 + }, + { + "epoch": 2.35, + "grad_norm": 9.572412490844727, + "learning_rate": 4.358448730284901e-06, + "loss": 0.7554, + "step": 18695 + }, + { + "epoch": 2.35, + "grad_norm": 11.869797706604004, + "learning_rate": 4.357612015228214e-06, + "loss": 0.4906, + "step": 18696 + }, + { + "epoch": 2.35, + "grad_norm": 30.388763427734375, + "learning_rate": 4.356775300171527e-06, + "loss": 1.9539, + "step": 18697 + }, + { + "epoch": 2.35, + "grad_norm": 13.3001070022583, + "learning_rate": 4.35593858511484e-06, + "loss": 0.5377, + "step": 18698 + }, + { + "epoch": 2.35, + "grad_norm": 8.962652206420898, + "learning_rate": 4.3551018700581524e-06, + "loss": 0.4298, + "step": 18699 + }, + { + "epoch": 2.35, + "grad_norm": 14.28447437286377, + "learning_rate": 4.354265155001464e-06, + "loss": 1.4159, + "step": 18700 + }, + { + "epoch": 2.35, + "grad_norm": 18.35455894470215, + "learning_rate": 4.353428439944777e-06, + "loss": 0.6595, + "step": 18701 + }, + { + "epoch": 2.35, + "grad_norm": 18.493106842041016, + "learning_rate": 4.352591724888089e-06, + "loss": 0.7556, + "step": 18702 + }, + { + "epoch": 2.35, + "grad_norm": 18.51642417907715, + "learning_rate": 4.351755009831402e-06, + "loss": 0.7741, + "step": 18703 + }, + { + "epoch": 2.35, + "grad_norm": 20.658401489257812, + "learning_rate": 4.350918294774715e-06, + "loss": 2.6664, + "step": 18704 + }, + { + "epoch": 2.35, + "grad_norm": 22.133594512939453, + "learning_rate": 4.3500815797180276e-06, + "loss": 2.4023, + "step": 18705 + }, + { + "epoch": 2.35, + "grad_norm": 8.598179817199707, + "learning_rate": 4.34924486466134e-06, + "loss": 0.282, + "step": 18706 + }, + { + "epoch": 2.35, + "grad_norm": 14.833199501037598, + "learning_rate": 4.348408149604652e-06, + "loss": 0.4488, + "step": 18707 + }, + { + "epoch": 2.35, + "grad_norm": 16.867813110351562, + "learning_rate": 4.347571434547965e-06, + "loss": 1.0466, + "step": 18708 + }, + { + "epoch": 2.35, + "grad_norm": 18.45764923095703, + "learning_rate": 4.346734719491277e-06, + "loss": 1.0179, + "step": 18709 + }, + { + "epoch": 2.35, + "grad_norm": 5.755275726318359, + "learning_rate": 4.34589800443459e-06, + "loss": 0.7205, + "step": 18710 + }, + { + "epoch": 2.35, + "grad_norm": 17.404754638671875, + "learning_rate": 4.345061289377903e-06, + "loss": 0.7211, + "step": 18711 + }, + { + "epoch": 2.35, + "grad_norm": 19.29561996459961, + "learning_rate": 4.3442245743212155e-06, + "loss": 3.2392, + "step": 18712 + }, + { + "epoch": 2.35, + "grad_norm": 27.662883758544922, + "learning_rate": 4.343387859264528e-06, + "loss": 1.2601, + "step": 18713 + }, + { + "epoch": 2.35, + "grad_norm": 14.027308464050293, + "learning_rate": 4.34255114420784e-06, + "loss": 0.3639, + "step": 18714 + }, + { + "epoch": 2.35, + "grad_norm": 10.645654678344727, + "learning_rate": 4.341714429151153e-06, + "loss": 1.0314, + "step": 18715 + }, + { + "epoch": 2.35, + "grad_norm": 23.480266571044922, + "learning_rate": 4.340877714094465e-06, + "loss": 0.5757, + "step": 18716 + }, + { + "epoch": 2.35, + "grad_norm": 3.5198709964752197, + "learning_rate": 4.340040999037778e-06, + "loss": 0.3255, + "step": 18717 + }, + { + "epoch": 2.35, + "grad_norm": 21.50994300842285, + "learning_rate": 4.339204283981091e-06, + "loss": 1.4896, + "step": 18718 + }, + { + "epoch": 2.35, + "grad_norm": 13.328341484069824, + "learning_rate": 4.338367568924403e-06, + "loss": 0.7514, + "step": 18719 + }, + { + "epoch": 2.35, + "grad_norm": 19.248104095458984, + "learning_rate": 4.337530853867715e-06, + "loss": 0.8781, + "step": 18720 + }, + { + "epoch": 2.35, + "grad_norm": 11.229989051818848, + "learning_rate": 4.336694138811028e-06, + "loss": 0.5211, + "step": 18721 + }, + { + "epoch": 2.35, + "grad_norm": 12.407174110412598, + "learning_rate": 4.335857423754341e-06, + "loss": 0.6381, + "step": 18722 + }, + { + "epoch": 2.35, + "grad_norm": 10.882194519042969, + "learning_rate": 4.335020708697653e-06, + "loss": 0.6095, + "step": 18723 + }, + { + "epoch": 2.35, + "grad_norm": 18.214351654052734, + "learning_rate": 4.334183993640966e-06, + "loss": 0.5243, + "step": 18724 + }, + { + "epoch": 2.35, + "grad_norm": 4.372857570648193, + "learning_rate": 4.3333472785842785e-06, + "loss": 0.3892, + "step": 18725 + }, + { + "epoch": 2.35, + "grad_norm": 3.732825756072998, + "learning_rate": 4.332510563527591e-06, + "loss": 0.1582, + "step": 18726 + }, + { + "epoch": 2.35, + "grad_norm": 14.107687950134277, + "learning_rate": 4.331673848470903e-06, + "loss": 0.8475, + "step": 18727 + }, + { + "epoch": 2.35, + "grad_norm": 16.790781021118164, + "learning_rate": 4.330837133414216e-06, + "loss": 1.5795, + "step": 18728 + }, + { + "epoch": 2.35, + "grad_norm": 13.135433197021484, + "learning_rate": 4.330000418357529e-06, + "loss": 1.7949, + "step": 18729 + }, + { + "epoch": 2.35, + "grad_norm": 17.481163024902344, + "learning_rate": 4.329163703300841e-06, + "loss": 0.461, + "step": 18730 + }, + { + "epoch": 2.35, + "grad_norm": 10.736077308654785, + "learning_rate": 4.328326988244154e-06, + "loss": 0.5273, + "step": 18731 + }, + { + "epoch": 2.35, + "grad_norm": 16.15587043762207, + "learning_rate": 4.3274902731874664e-06, + "loss": 0.436, + "step": 18732 + }, + { + "epoch": 2.35, + "grad_norm": 7.515250205993652, + "learning_rate": 4.326653558130779e-06, + "loss": 0.9062, + "step": 18733 + }, + { + "epoch": 2.35, + "grad_norm": 28.70311164855957, + "learning_rate": 4.325816843074091e-06, + "loss": 1.467, + "step": 18734 + }, + { + "epoch": 2.35, + "grad_norm": 505.750732421875, + "learning_rate": 4.324980128017404e-06, + "loss": 1.8738, + "step": 18735 + }, + { + "epoch": 2.35, + "grad_norm": 14.68479061126709, + "learning_rate": 4.324143412960717e-06, + "loss": 0.5343, + "step": 18736 + }, + { + "epoch": 2.35, + "grad_norm": 15.57192325592041, + "learning_rate": 4.323306697904029e-06, + "loss": 0.7605, + "step": 18737 + }, + { + "epoch": 2.35, + "grad_norm": 6.905477046966553, + "learning_rate": 4.3224699828473416e-06, + "loss": 0.3782, + "step": 18738 + }, + { + "epoch": 2.35, + "grad_norm": 6.218075275421143, + "learning_rate": 4.321633267790654e-06, + "loss": 0.601, + "step": 18739 + }, + { + "epoch": 2.35, + "grad_norm": 64.63713073730469, + "learning_rate": 4.320796552733967e-06, + "loss": 1.1643, + "step": 18740 + }, + { + "epoch": 2.35, + "grad_norm": 13.204005241394043, + "learning_rate": 4.319959837677279e-06, + "loss": 0.649, + "step": 18741 + }, + { + "epoch": 2.35, + "grad_norm": 36.709774017333984, + "learning_rate": 4.319123122620592e-06, + "loss": 0.6826, + "step": 18742 + }, + { + "epoch": 2.35, + "grad_norm": 16.327075958251953, + "learning_rate": 4.318286407563905e-06, + "loss": 0.9353, + "step": 18743 + }, + { + "epoch": 2.35, + "grad_norm": 13.860923767089844, + "learning_rate": 4.317449692507217e-06, + "loss": 1.3019, + "step": 18744 + }, + { + "epoch": 2.35, + "grad_norm": 25.233509063720703, + "learning_rate": 4.3166129774505295e-06, + "loss": 0.9742, + "step": 18745 + }, + { + "epoch": 2.35, + "grad_norm": 41.156368255615234, + "learning_rate": 4.315776262393842e-06, + "loss": 0.8902, + "step": 18746 + }, + { + "epoch": 2.35, + "grad_norm": 22.470033645629883, + "learning_rate": 4.314939547337155e-06, + "loss": 0.7067, + "step": 18747 + }, + { + "epoch": 2.35, + "grad_norm": 14.529524803161621, + "learning_rate": 4.314102832280467e-06, + "loss": 0.7574, + "step": 18748 + }, + { + "epoch": 2.35, + "grad_norm": 10.950799942016602, + "learning_rate": 4.31326611722378e-06, + "loss": 0.5338, + "step": 18749 + }, + { + "epoch": 2.35, + "grad_norm": 16.41820526123047, + "learning_rate": 4.312429402167093e-06, + "loss": 0.3557, + "step": 18750 + }, + { + "epoch": 2.35, + "grad_norm": 14.858510971069336, + "learning_rate": 4.311592687110405e-06, + "loss": 1.075, + "step": 18751 + }, + { + "epoch": 2.35, + "grad_norm": 9.458222389221191, + "learning_rate": 4.310755972053717e-06, + "loss": 1.0552, + "step": 18752 + }, + { + "epoch": 2.35, + "grad_norm": 23.35594367980957, + "learning_rate": 4.30991925699703e-06, + "loss": 0.7038, + "step": 18753 + }, + { + "epoch": 2.35, + "grad_norm": 40.7728157043457, + "learning_rate": 4.309082541940343e-06, + "loss": 0.3855, + "step": 18754 + }, + { + "epoch": 2.35, + "grad_norm": 20.873706817626953, + "learning_rate": 4.308245826883655e-06, + "loss": 1.2008, + "step": 18755 + }, + { + "epoch": 2.35, + "grad_norm": 90.79370880126953, + "learning_rate": 4.307409111826968e-06, + "loss": 2.0384, + "step": 18756 + }, + { + "epoch": 2.35, + "grad_norm": 11.139209747314453, + "learning_rate": 4.3065723967702806e-06, + "loss": 0.581, + "step": 18757 + }, + { + "epoch": 2.35, + "grad_norm": 5.312123775482178, + "learning_rate": 4.3057356817135925e-06, + "loss": 0.1415, + "step": 18758 + }, + { + "epoch": 2.35, + "grad_norm": 22.708269119262695, + "learning_rate": 4.304898966656905e-06, + "loss": 1.34, + "step": 18759 + }, + { + "epoch": 2.35, + "grad_norm": 28.71159553527832, + "learning_rate": 4.304062251600218e-06, + "loss": 1.1707, + "step": 18760 + }, + { + "epoch": 2.35, + "grad_norm": 8.738513946533203, + "learning_rate": 4.30322553654353e-06, + "loss": 0.621, + "step": 18761 + }, + { + "epoch": 2.35, + "grad_norm": 15.694025993347168, + "learning_rate": 4.302388821486843e-06, + "loss": 0.8531, + "step": 18762 + }, + { + "epoch": 2.35, + "grad_norm": 13.099310874938965, + "learning_rate": 4.301552106430156e-06, + "loss": 1.1724, + "step": 18763 + }, + { + "epoch": 2.35, + "grad_norm": 12.900650978088379, + "learning_rate": 4.3007153913734685e-06, + "loss": 0.8502, + "step": 18764 + }, + { + "epoch": 2.35, + "grad_norm": 10.844757080078125, + "learning_rate": 4.2998786763167804e-06, + "loss": 0.7511, + "step": 18765 + }, + { + "epoch": 2.36, + "grad_norm": 10.86279010772705, + "learning_rate": 4.299041961260093e-06, + "loss": 0.4016, + "step": 18766 + }, + { + "epoch": 2.36, + "grad_norm": 30.131702423095703, + "learning_rate": 4.298205246203406e-06, + "loss": 1.2946, + "step": 18767 + }, + { + "epoch": 2.36, + "grad_norm": 11.3428316116333, + "learning_rate": 4.297368531146718e-06, + "loss": 0.44, + "step": 18768 + }, + { + "epoch": 2.36, + "grad_norm": 17.472829818725586, + "learning_rate": 4.296531816090031e-06, + "loss": 0.6456, + "step": 18769 + }, + { + "epoch": 2.36, + "grad_norm": 28.170934677124023, + "learning_rate": 4.295695101033344e-06, + "loss": 2.0749, + "step": 18770 + }, + { + "epoch": 2.36, + "grad_norm": 11.849146842956543, + "learning_rate": 4.294858385976656e-06, + "loss": 1.1018, + "step": 18771 + }, + { + "epoch": 2.36, + "grad_norm": 26.870027542114258, + "learning_rate": 4.294021670919968e-06, + "loss": 1.7141, + "step": 18772 + }, + { + "epoch": 2.36, + "grad_norm": 32.702880859375, + "learning_rate": 4.293184955863281e-06, + "loss": 1.5337, + "step": 18773 + }, + { + "epoch": 2.36, + "grad_norm": 22.25931167602539, + "learning_rate": 4.292348240806594e-06, + "loss": 2.4034, + "step": 18774 + }, + { + "epoch": 2.36, + "grad_norm": 566.87744140625, + "learning_rate": 4.291511525749906e-06, + "loss": 2.0262, + "step": 18775 + }, + { + "epoch": 2.36, + "grad_norm": 3.847529411315918, + "learning_rate": 4.290674810693219e-06, + "loss": 0.593, + "step": 18776 + }, + { + "epoch": 2.36, + "grad_norm": 8.884088516235352, + "learning_rate": 4.2898380956365315e-06, + "loss": 0.5754, + "step": 18777 + }, + { + "epoch": 2.36, + "grad_norm": 9.90809154510498, + "learning_rate": 4.2890013805798435e-06, + "loss": 0.5576, + "step": 18778 + }, + { + "epoch": 2.36, + "grad_norm": 4.6491923332214355, + "learning_rate": 4.288164665523156e-06, + "loss": 0.1102, + "step": 18779 + }, + { + "epoch": 2.36, + "grad_norm": 57.226985931396484, + "learning_rate": 4.287327950466469e-06, + "loss": 1.3927, + "step": 18780 + }, + { + "epoch": 2.36, + "grad_norm": 17.85209083557129, + "learning_rate": 4.286491235409782e-06, + "loss": 1.3865, + "step": 18781 + }, + { + "epoch": 2.36, + "grad_norm": 14.462690353393555, + "learning_rate": 4.285654520353094e-06, + "loss": 1.7501, + "step": 18782 + }, + { + "epoch": 2.36, + "grad_norm": 17.78639030456543, + "learning_rate": 4.284817805296407e-06, + "loss": 0.9139, + "step": 18783 + }, + { + "epoch": 2.36, + "grad_norm": 11.665761947631836, + "learning_rate": 4.283981090239719e-06, + "loss": 0.6823, + "step": 18784 + }, + { + "epoch": 2.36, + "grad_norm": 11.269731521606445, + "learning_rate": 4.283144375183031e-06, + "loss": 0.2417, + "step": 18785 + }, + { + "epoch": 2.36, + "grad_norm": 9.31551742553711, + "learning_rate": 4.282307660126344e-06, + "loss": 0.2929, + "step": 18786 + }, + { + "epoch": 2.36, + "grad_norm": 15.734121322631836, + "learning_rate": 4.281470945069657e-06, + "loss": 1.9836, + "step": 18787 + }, + { + "epoch": 2.36, + "grad_norm": 12.540379524230957, + "learning_rate": 4.28063423001297e-06, + "loss": 0.3442, + "step": 18788 + }, + { + "epoch": 2.36, + "grad_norm": 21.346942901611328, + "learning_rate": 4.279797514956282e-06, + "loss": 0.7449, + "step": 18789 + }, + { + "epoch": 2.36, + "grad_norm": 12.596646308898926, + "learning_rate": 4.2789607998995945e-06, + "loss": 1.1474, + "step": 18790 + }, + { + "epoch": 2.36, + "grad_norm": 18.674962997436523, + "learning_rate": 4.278124084842907e-06, + "loss": 0.7168, + "step": 18791 + }, + { + "epoch": 2.36, + "grad_norm": 21.470760345458984, + "learning_rate": 4.277287369786219e-06, + "loss": 1.6533, + "step": 18792 + }, + { + "epoch": 2.36, + "grad_norm": 16.76324462890625, + "learning_rate": 4.276450654729532e-06, + "loss": 0.7499, + "step": 18793 + }, + { + "epoch": 2.36, + "grad_norm": 8.946023941040039, + "learning_rate": 4.275613939672845e-06, + "loss": 0.4861, + "step": 18794 + }, + { + "epoch": 2.36, + "grad_norm": 11.926936149597168, + "learning_rate": 4.274777224616158e-06, + "loss": 1.6696, + "step": 18795 + }, + { + "epoch": 2.36, + "grad_norm": 8.531956672668457, + "learning_rate": 4.27394050955947e-06, + "loss": 0.7272, + "step": 18796 + }, + { + "epoch": 2.36, + "grad_norm": 45.68627166748047, + "learning_rate": 4.2731037945027825e-06, + "loss": 1.5998, + "step": 18797 + }, + { + "epoch": 2.36, + "grad_norm": 49.42822265625, + "learning_rate": 4.272267079446095e-06, + "loss": 0.6409, + "step": 18798 + }, + { + "epoch": 2.36, + "grad_norm": 5.116422653198242, + "learning_rate": 4.271430364389407e-06, + "loss": 0.1939, + "step": 18799 + }, + { + "epoch": 2.36, + "grad_norm": 16.58919906616211, + "learning_rate": 4.27059364933272e-06, + "loss": 0.7827, + "step": 18800 + }, + { + "epoch": 2.36, + "eval_loss": 0.08024092018604279, + "eval_runtime": 94.8209, + "eval_samples_per_second": 37.355, + "eval_steps_per_second": 37.355, + "step": 18800 + }, + { + "epoch": 2.36, + "grad_norm": 16.0667724609375, + "learning_rate": 4.269756934276033e-06, + "loss": 1.9254, + "step": 18801 + }, + { + "epoch": 2.36, + "grad_norm": 193.6348876953125, + "learning_rate": 4.268920219219345e-06, + "loss": 1.5117, + "step": 18802 + }, + { + "epoch": 2.36, + "grad_norm": 12.501949310302734, + "learning_rate": 4.2680835041626576e-06, + "loss": 1.2545, + "step": 18803 + }, + { + "epoch": 2.36, + "grad_norm": 7.939141750335693, + "learning_rate": 4.26724678910597e-06, + "loss": 0.6291, + "step": 18804 + }, + { + "epoch": 2.36, + "grad_norm": 10.727766036987305, + "learning_rate": 4.266410074049283e-06, + "loss": 0.5534, + "step": 18805 + }, + { + "epoch": 2.36, + "grad_norm": 20.324636459350586, + "learning_rate": 4.265573358992595e-06, + "loss": 0.537, + "step": 18806 + }, + { + "epoch": 2.36, + "grad_norm": 5.044099807739258, + "learning_rate": 4.264736643935908e-06, + "loss": 0.1942, + "step": 18807 + }, + { + "epoch": 2.36, + "grad_norm": 26.941146850585938, + "learning_rate": 4.263899928879221e-06, + "loss": 1.7258, + "step": 18808 + }, + { + "epoch": 2.36, + "grad_norm": 58.14222717285156, + "learning_rate": 4.263063213822533e-06, + "loss": 1.8982, + "step": 18809 + }, + { + "epoch": 2.36, + "grad_norm": 9.319318771362305, + "learning_rate": 4.2622264987658455e-06, + "loss": 0.6623, + "step": 18810 + }, + { + "epoch": 2.36, + "grad_norm": 21.797510147094727, + "learning_rate": 4.261389783709158e-06, + "loss": 1.3951, + "step": 18811 + }, + { + "epoch": 2.36, + "grad_norm": 32.75272750854492, + "learning_rate": 4.260553068652471e-06, + "loss": 1.5829, + "step": 18812 + }, + { + "epoch": 2.36, + "grad_norm": 4.681191921234131, + "learning_rate": 4.259716353595783e-06, + "loss": 0.5909, + "step": 18813 + }, + { + "epoch": 2.36, + "grad_norm": 10.198139190673828, + "learning_rate": 4.258879638539096e-06, + "loss": 0.2836, + "step": 18814 + }, + { + "epoch": 2.36, + "grad_norm": 18.730148315429688, + "learning_rate": 4.258042923482409e-06, + "loss": 2.1234, + "step": 18815 + }, + { + "epoch": 2.36, + "grad_norm": 12.210789680480957, + "learning_rate": 4.257206208425721e-06, + "loss": 0.394, + "step": 18816 + }, + { + "epoch": 2.36, + "grad_norm": 8.160884857177734, + "learning_rate": 4.256369493369033e-06, + "loss": 0.9246, + "step": 18817 + }, + { + "epoch": 2.36, + "grad_norm": 12.022786140441895, + "learning_rate": 4.255532778312346e-06, + "loss": 0.6435, + "step": 18818 + }, + { + "epoch": 2.36, + "grad_norm": 17.289894104003906, + "learning_rate": 4.254696063255659e-06, + "loss": 1.4093, + "step": 18819 + }, + { + "epoch": 2.36, + "grad_norm": 16.079078674316406, + "learning_rate": 4.253859348198971e-06, + "loss": 1.842, + "step": 18820 + }, + { + "epoch": 2.36, + "grad_norm": 13.1908540725708, + "learning_rate": 4.253022633142284e-06, + "loss": 0.8832, + "step": 18821 + }, + { + "epoch": 2.36, + "grad_norm": 25.93243408203125, + "learning_rate": 4.2521859180855966e-06, + "loss": 1.1607, + "step": 18822 + }, + { + "epoch": 2.36, + "grad_norm": 10.018243789672852, + "learning_rate": 4.2513492030289085e-06, + "loss": 0.3035, + "step": 18823 + }, + { + "epoch": 2.36, + "grad_norm": 9.124788284301758, + "learning_rate": 4.250512487972221e-06, + "loss": 0.9549, + "step": 18824 + }, + { + "epoch": 2.36, + "grad_norm": 2.4602513313293457, + "learning_rate": 4.249675772915534e-06, + "loss": 0.0339, + "step": 18825 + }, + { + "epoch": 2.36, + "grad_norm": 18.3439998626709, + "learning_rate": 4.248839057858847e-06, + "loss": 0.6064, + "step": 18826 + }, + { + "epoch": 2.36, + "grad_norm": 16.524682998657227, + "learning_rate": 4.248002342802159e-06, + "loss": 0.5423, + "step": 18827 + }, + { + "epoch": 2.36, + "grad_norm": 15.11779499053955, + "learning_rate": 4.247165627745472e-06, + "loss": 0.4983, + "step": 18828 + }, + { + "epoch": 2.36, + "grad_norm": 9.199140548706055, + "learning_rate": 4.2463289126887845e-06, + "loss": 0.3444, + "step": 18829 + }, + { + "epoch": 2.36, + "grad_norm": 12.126413345336914, + "learning_rate": 4.2454921976320964e-06, + "loss": 0.4077, + "step": 18830 + }, + { + "epoch": 2.36, + "grad_norm": 8.376360893249512, + "learning_rate": 4.244655482575409e-06, + "loss": 1.4381, + "step": 18831 + }, + { + "epoch": 2.36, + "grad_norm": 14.740618705749512, + "learning_rate": 4.243818767518722e-06, + "loss": 1.1537, + "step": 18832 + }, + { + "epoch": 2.36, + "grad_norm": 21.037717819213867, + "learning_rate": 4.242982052462035e-06, + "loss": 1.0074, + "step": 18833 + }, + { + "epoch": 2.36, + "grad_norm": 14.283814430236816, + "learning_rate": 4.242145337405347e-06, + "loss": 0.6166, + "step": 18834 + }, + { + "epoch": 2.36, + "grad_norm": 5.688212871551514, + "learning_rate": 4.24130862234866e-06, + "loss": 0.4822, + "step": 18835 + }, + { + "epoch": 2.36, + "grad_norm": 23.023244857788086, + "learning_rate": 4.240471907291972e-06, + "loss": 1.8606, + "step": 18836 + }, + { + "epoch": 2.36, + "grad_norm": 6.0786662101745605, + "learning_rate": 4.239635192235284e-06, + "loss": 0.2515, + "step": 18837 + }, + { + "epoch": 2.36, + "grad_norm": 24.129573822021484, + "learning_rate": 4.238798477178597e-06, + "loss": 0.8353, + "step": 18838 + }, + { + "epoch": 2.36, + "grad_norm": 113.80068969726562, + "learning_rate": 4.23796176212191e-06, + "loss": 0.8471, + "step": 18839 + }, + { + "epoch": 2.36, + "grad_norm": 25.95708656311035, + "learning_rate": 4.237125047065223e-06, + "loss": 1.8908, + "step": 18840 + }, + { + "epoch": 2.36, + "grad_norm": 9.736194610595703, + "learning_rate": 4.236288332008535e-06, + "loss": 1.5688, + "step": 18841 + }, + { + "epoch": 2.36, + "grad_norm": 25.874277114868164, + "learning_rate": 4.2354516169518475e-06, + "loss": 2.0212, + "step": 18842 + }, + { + "epoch": 2.36, + "grad_norm": 8.549764633178711, + "learning_rate": 4.2346149018951595e-06, + "loss": 1.0517, + "step": 18843 + }, + { + "epoch": 2.36, + "grad_norm": 31.702428817749023, + "learning_rate": 4.233778186838472e-06, + "loss": 1.0722, + "step": 18844 + }, + { + "epoch": 2.37, + "grad_norm": 4.378050804138184, + "learning_rate": 4.232941471781785e-06, + "loss": 0.5427, + "step": 18845 + }, + { + "epoch": 2.37, + "grad_norm": 14.59392261505127, + "learning_rate": 4.232104756725098e-06, + "loss": 0.6592, + "step": 18846 + }, + { + "epoch": 2.37, + "grad_norm": 11.767300605773926, + "learning_rate": 4.231268041668411e-06, + "loss": 0.2533, + "step": 18847 + }, + { + "epoch": 2.37, + "grad_norm": 17.252201080322266, + "learning_rate": 4.230431326611723e-06, + "loss": 0.4975, + "step": 18848 + }, + { + "epoch": 2.37, + "grad_norm": 3.5346503257751465, + "learning_rate": 4.2295946115550354e-06, + "loss": 0.1133, + "step": 18849 + }, + { + "epoch": 2.37, + "grad_norm": 44.0077018737793, + "learning_rate": 4.228757896498347e-06, + "loss": 1.6376, + "step": 18850 + }, + { + "epoch": 2.37, + "grad_norm": 25.099565505981445, + "learning_rate": 4.22792118144166e-06, + "loss": 1.5878, + "step": 18851 + }, + { + "epoch": 2.37, + "grad_norm": 7.18458890914917, + "learning_rate": 4.227084466384973e-06, + "loss": 1.1702, + "step": 18852 + }, + { + "epoch": 2.37, + "grad_norm": 9.28117561340332, + "learning_rate": 4.226247751328286e-06, + "loss": 0.7456, + "step": 18853 + }, + { + "epoch": 2.37, + "grad_norm": 16.449676513671875, + "learning_rate": 4.225411036271599e-06, + "loss": 1.1489, + "step": 18854 + }, + { + "epoch": 2.37, + "grad_norm": 3.339559316635132, + "learning_rate": 4.2245743212149106e-06, + "loss": 0.0947, + "step": 18855 + }, + { + "epoch": 2.37, + "grad_norm": 26.09206771850586, + "learning_rate": 4.223737606158223e-06, + "loss": 2.6489, + "step": 18856 + }, + { + "epoch": 2.37, + "grad_norm": 15.931502342224121, + "learning_rate": 4.222900891101535e-06, + "loss": 0.9297, + "step": 18857 + }, + { + "epoch": 2.37, + "grad_norm": 19.132822036743164, + "learning_rate": 4.222064176044848e-06, + "loss": 0.3773, + "step": 18858 + }, + { + "epoch": 2.37, + "grad_norm": 27.903467178344727, + "learning_rate": 4.221227460988161e-06, + "loss": 0.9293, + "step": 18859 + }, + { + "epoch": 2.37, + "grad_norm": 15.282549858093262, + "learning_rate": 4.220390745931474e-06, + "loss": 0.7332, + "step": 18860 + }, + { + "epoch": 2.37, + "grad_norm": 9.827350616455078, + "learning_rate": 4.2195540308747865e-06, + "loss": 1.2795, + "step": 18861 + }, + { + "epoch": 2.37, + "grad_norm": 11.952923774719238, + "learning_rate": 4.2187173158180985e-06, + "loss": 1.1583, + "step": 18862 + }, + { + "epoch": 2.37, + "grad_norm": 8.408980369567871, + "learning_rate": 4.217880600761411e-06, + "loss": 0.4653, + "step": 18863 + }, + { + "epoch": 2.37, + "grad_norm": 51.14714431762695, + "learning_rate": 4.217043885704723e-06, + "loss": 0.923, + "step": 18864 + }, + { + "epoch": 2.37, + "grad_norm": 10.286130905151367, + "learning_rate": 4.216207170648036e-06, + "loss": 0.9121, + "step": 18865 + }, + { + "epoch": 2.37, + "grad_norm": 23.672351837158203, + "learning_rate": 4.215370455591349e-06, + "loss": 0.8421, + "step": 18866 + }, + { + "epoch": 2.37, + "grad_norm": 17.699554443359375, + "learning_rate": 4.214533740534662e-06, + "loss": 2.0303, + "step": 18867 + }, + { + "epoch": 2.37, + "grad_norm": 15.324867248535156, + "learning_rate": 4.213697025477974e-06, + "loss": 2.3224, + "step": 18868 + }, + { + "epoch": 2.37, + "grad_norm": 6.052308559417725, + "learning_rate": 4.212860310421286e-06, + "loss": 0.9046, + "step": 18869 + }, + { + "epoch": 2.37, + "grad_norm": 18.945478439331055, + "learning_rate": 4.212023595364599e-06, + "loss": 1.6304, + "step": 18870 + }, + { + "epoch": 2.37, + "grad_norm": 10.311890602111816, + "learning_rate": 4.211186880307911e-06, + "loss": 0.23, + "step": 18871 + }, + { + "epoch": 2.37, + "grad_norm": 4.769211769104004, + "learning_rate": 4.210350165251224e-06, + "loss": 0.1743, + "step": 18872 + }, + { + "epoch": 2.37, + "grad_norm": 14.367249488830566, + "learning_rate": 4.209513450194537e-06, + "loss": 0.8107, + "step": 18873 + }, + { + "epoch": 2.37, + "grad_norm": 20.644311904907227, + "learning_rate": 4.2086767351378496e-06, + "loss": 2.0886, + "step": 18874 + }, + { + "epoch": 2.37, + "grad_norm": 68.03089904785156, + "learning_rate": 4.2078400200811615e-06, + "loss": 3.0096, + "step": 18875 + }, + { + "epoch": 2.37, + "grad_norm": 29.12495994567871, + "learning_rate": 4.207003305024474e-06, + "loss": 0.7108, + "step": 18876 + }, + { + "epoch": 2.37, + "grad_norm": 19.217121124267578, + "learning_rate": 4.206166589967787e-06, + "loss": 0.7231, + "step": 18877 + }, + { + "epoch": 2.37, + "grad_norm": 115.30744171142578, + "learning_rate": 4.205329874911099e-06, + "loss": 1.8465, + "step": 18878 + }, + { + "epoch": 2.37, + "grad_norm": 23.02408218383789, + "learning_rate": 4.204493159854412e-06, + "loss": 1.339, + "step": 18879 + }, + { + "epoch": 2.37, + "grad_norm": 87.8171615600586, + "learning_rate": 4.203656444797725e-06, + "loss": 0.7673, + "step": 18880 + }, + { + "epoch": 2.37, + "grad_norm": 2.9575674533843994, + "learning_rate": 4.2028197297410375e-06, + "loss": 0.1289, + "step": 18881 + }, + { + "epoch": 2.37, + "grad_norm": 10.533477783203125, + "learning_rate": 4.2019830146843494e-06, + "loss": 0.6978, + "step": 18882 + }, + { + "epoch": 2.37, + "grad_norm": 12.552096366882324, + "learning_rate": 4.201146299627662e-06, + "loss": 0.348, + "step": 18883 + }, + { + "epoch": 2.37, + "grad_norm": 13.85686206817627, + "learning_rate": 4.200309584570974e-06, + "loss": 1.8697, + "step": 18884 + }, + { + "epoch": 2.37, + "grad_norm": 17.216442108154297, + "learning_rate": 4.199472869514287e-06, + "loss": 0.3324, + "step": 18885 + }, + { + "epoch": 2.37, + "grad_norm": 13.309085845947266, + "learning_rate": 4.1986361544576e-06, + "loss": 0.4239, + "step": 18886 + }, + { + "epoch": 2.37, + "grad_norm": 20.115856170654297, + "learning_rate": 4.197799439400913e-06, + "loss": 0.7594, + "step": 18887 + }, + { + "epoch": 2.37, + "grad_norm": 17.214750289916992, + "learning_rate": 4.196962724344225e-06, + "loss": 0.2819, + "step": 18888 + }, + { + "epoch": 2.37, + "grad_norm": 6.4202775955200195, + "learning_rate": 4.196126009287537e-06, + "loss": 1.2622, + "step": 18889 + }, + { + "epoch": 2.37, + "grad_norm": 8.798897743225098, + "learning_rate": 4.19528929423085e-06, + "loss": 0.3203, + "step": 18890 + }, + { + "epoch": 2.37, + "grad_norm": 14.010072708129883, + "learning_rate": 4.194452579174162e-06, + "loss": 0.4699, + "step": 18891 + }, + { + "epoch": 2.37, + "grad_norm": 2.101485013961792, + "learning_rate": 4.193615864117475e-06, + "loss": 0.097, + "step": 18892 + }, + { + "epoch": 2.37, + "grad_norm": 71.57234191894531, + "learning_rate": 4.192779149060788e-06, + "loss": 2.0226, + "step": 18893 + }, + { + "epoch": 2.37, + "grad_norm": 3.3469460010528564, + "learning_rate": 4.1919424340041005e-06, + "loss": 0.2548, + "step": 18894 + }, + { + "epoch": 2.37, + "grad_norm": 6.6450934410095215, + "learning_rate": 4.191105718947413e-06, + "loss": 0.385, + "step": 18895 + }, + { + "epoch": 2.37, + "grad_norm": 16.732669830322266, + "learning_rate": 4.190269003890725e-06, + "loss": 1.6495, + "step": 18896 + }, + { + "epoch": 2.37, + "grad_norm": 9.60965633392334, + "learning_rate": 4.189432288834038e-06, + "loss": 1.7285, + "step": 18897 + }, + { + "epoch": 2.37, + "grad_norm": 8.528308868408203, + "learning_rate": 4.18859557377735e-06, + "loss": 0.4976, + "step": 18898 + }, + { + "epoch": 2.37, + "grad_norm": 46.66935729980469, + "learning_rate": 4.187758858720663e-06, + "loss": 1.3655, + "step": 18899 + }, + { + "epoch": 2.37, + "grad_norm": 87.4046859741211, + "learning_rate": 4.186922143663976e-06, + "loss": 2.0976, + "step": 18900 + }, + { + "epoch": 2.37, + "grad_norm": 15.820965766906738, + "learning_rate": 4.1860854286072884e-06, + "loss": 1.2496, + "step": 18901 + }, + { + "epoch": 2.37, + "grad_norm": 17.30780601501465, + "learning_rate": 4.185248713550601e-06, + "loss": 1.9525, + "step": 18902 + }, + { + "epoch": 2.37, + "grad_norm": 10.334717750549316, + "learning_rate": 4.184411998493913e-06, + "loss": 0.5088, + "step": 18903 + }, + { + "epoch": 2.37, + "grad_norm": 18.360502243041992, + "learning_rate": 4.183575283437226e-06, + "loss": 0.9067, + "step": 18904 + }, + { + "epoch": 2.37, + "grad_norm": 13.89871883392334, + "learning_rate": 4.182738568380538e-06, + "loss": 2.7615, + "step": 18905 + }, + { + "epoch": 2.37, + "grad_norm": 16.200584411621094, + "learning_rate": 4.181901853323851e-06, + "loss": 0.6003, + "step": 18906 + }, + { + "epoch": 2.37, + "grad_norm": 7.320597171783447, + "learning_rate": 4.1810651382671636e-06, + "loss": 0.1048, + "step": 18907 + }, + { + "epoch": 2.37, + "grad_norm": 45.176151275634766, + "learning_rate": 4.180228423210476e-06, + "loss": 0.6831, + "step": 18908 + }, + { + "epoch": 2.37, + "grad_norm": 17.75602149963379, + "learning_rate": 4.179391708153789e-06, + "loss": 0.4882, + "step": 18909 + }, + { + "epoch": 2.37, + "grad_norm": 28.60691261291504, + "learning_rate": 4.178554993097101e-06, + "loss": 0.5252, + "step": 18910 + }, + { + "epoch": 2.37, + "grad_norm": 22.014205932617188, + "learning_rate": 4.177718278040414e-06, + "loss": 0.673, + "step": 18911 + }, + { + "epoch": 2.37, + "grad_norm": 16.342103958129883, + "learning_rate": 4.176881562983726e-06, + "loss": 1.3377, + "step": 18912 + }, + { + "epoch": 2.37, + "grad_norm": 23.191085815429688, + "learning_rate": 4.176044847927039e-06, + "loss": 1.2594, + "step": 18913 + }, + { + "epoch": 2.37, + "grad_norm": 3.5212202072143555, + "learning_rate": 4.1752081328703515e-06, + "loss": 0.0523, + "step": 18914 + }, + { + "epoch": 2.37, + "grad_norm": 20.338064193725586, + "learning_rate": 4.174371417813664e-06, + "loss": 0.7773, + "step": 18915 + }, + { + "epoch": 2.37, + "grad_norm": 10.315778732299805, + "learning_rate": 4.173534702756977e-06, + "loss": 0.6839, + "step": 18916 + }, + { + "epoch": 2.37, + "grad_norm": 18.50375747680664, + "learning_rate": 4.172697987700289e-06, + "loss": 1.3235, + "step": 18917 + }, + { + "epoch": 2.37, + "grad_norm": 5.669938564300537, + "learning_rate": 4.171861272643602e-06, + "loss": 0.2752, + "step": 18918 + }, + { + "epoch": 2.37, + "grad_norm": 5.4706034660339355, + "learning_rate": 4.171024557586914e-06, + "loss": 0.4694, + "step": 18919 + }, + { + "epoch": 2.37, + "grad_norm": 13.295501708984375, + "learning_rate": 4.170187842530227e-06, + "loss": 0.321, + "step": 18920 + }, + { + "epoch": 2.37, + "grad_norm": 1.5812788009643555, + "learning_rate": 4.169351127473539e-06, + "loss": 0.0755, + "step": 18921 + }, + { + "epoch": 2.37, + "grad_norm": 12.092533111572266, + "learning_rate": 4.168514412416852e-06, + "loss": 0.3939, + "step": 18922 + }, + { + "epoch": 2.37, + "grad_norm": 6.5334391593933105, + "learning_rate": 4.167677697360165e-06, + "loss": 0.4008, + "step": 18923 + }, + { + "epoch": 2.37, + "grad_norm": 6.879368782043457, + "learning_rate": 4.166840982303477e-06, + "loss": 0.4942, + "step": 18924 + }, + { + "epoch": 2.38, + "grad_norm": 17.075260162353516, + "learning_rate": 4.166004267246789e-06, + "loss": 0.6626, + "step": 18925 + }, + { + "epoch": 2.38, + "grad_norm": 12.275047302246094, + "learning_rate": 4.165167552190102e-06, + "loss": 0.9725, + "step": 18926 + }, + { + "epoch": 2.38, + "grad_norm": 22.844661712646484, + "learning_rate": 4.1643308371334145e-06, + "loss": 0.3083, + "step": 18927 + }, + { + "epoch": 2.38, + "grad_norm": 9.049379348754883, + "learning_rate": 4.163494122076727e-06, + "loss": 1.3238, + "step": 18928 + }, + { + "epoch": 2.38, + "grad_norm": 17.66362190246582, + "learning_rate": 4.16265740702004e-06, + "loss": 1.2612, + "step": 18929 + }, + { + "epoch": 2.38, + "grad_norm": 6.301072597503662, + "learning_rate": 4.161820691963352e-06, + "loss": 1.4726, + "step": 18930 + }, + { + "epoch": 2.38, + "grad_norm": 14.616764068603516, + "learning_rate": 4.160983976906665e-06, + "loss": 0.5514, + "step": 18931 + }, + { + "epoch": 2.38, + "grad_norm": 17.993520736694336, + "learning_rate": 4.160147261849977e-06, + "loss": 0.8817, + "step": 18932 + }, + { + "epoch": 2.38, + "grad_norm": 11.278755187988281, + "learning_rate": 4.15931054679329e-06, + "loss": 1.2055, + "step": 18933 + }, + { + "epoch": 2.38, + "grad_norm": 59.47795104980469, + "learning_rate": 4.158473831736602e-06, + "loss": 1.5626, + "step": 18934 + }, + { + "epoch": 2.38, + "grad_norm": 8.581803321838379, + "learning_rate": 4.157637116679915e-06, + "loss": 0.274, + "step": 18935 + }, + { + "epoch": 2.38, + "grad_norm": 6.019199371337891, + "learning_rate": 4.156800401623228e-06, + "loss": 0.4222, + "step": 18936 + }, + { + "epoch": 2.38, + "grad_norm": 14.822456359863281, + "learning_rate": 4.15596368656654e-06, + "loss": 0.8128, + "step": 18937 + }, + { + "epoch": 2.38, + "grad_norm": 17.66838836669922, + "learning_rate": 4.155126971509853e-06, + "loss": 1.3863, + "step": 18938 + }, + { + "epoch": 2.38, + "grad_norm": 24.690292358398438, + "learning_rate": 4.154290256453165e-06, + "loss": 2.2689, + "step": 18939 + }, + { + "epoch": 2.38, + "grad_norm": 9.440573692321777, + "learning_rate": 4.1534535413964775e-06, + "loss": 0.6847, + "step": 18940 + }, + { + "epoch": 2.38, + "grad_norm": 13.989275932312012, + "learning_rate": 4.15261682633979e-06, + "loss": 0.7083, + "step": 18941 + }, + { + "epoch": 2.38, + "grad_norm": 4.123847007751465, + "learning_rate": 4.151780111283103e-06, + "loss": 0.1782, + "step": 18942 + }, + { + "epoch": 2.38, + "grad_norm": 14.532522201538086, + "learning_rate": 4.150943396226416e-06, + "loss": 0.697, + "step": 18943 + }, + { + "epoch": 2.38, + "grad_norm": 19.20111846923828, + "learning_rate": 4.150106681169728e-06, + "loss": 0.7649, + "step": 18944 + }, + { + "epoch": 2.38, + "grad_norm": 37.36458969116211, + "learning_rate": 4.149269966113041e-06, + "loss": 3.0226, + "step": 18945 + }, + { + "epoch": 2.38, + "grad_norm": 7.685119152069092, + "learning_rate": 4.148433251056353e-06, + "loss": 0.3855, + "step": 18946 + }, + { + "epoch": 2.38, + "grad_norm": 14.21370792388916, + "learning_rate": 4.1475965359996655e-06, + "loss": 0.8942, + "step": 18947 + }, + { + "epoch": 2.38, + "grad_norm": 27.41675567626953, + "learning_rate": 4.146759820942978e-06, + "loss": 3.6937, + "step": 18948 + }, + { + "epoch": 2.38, + "grad_norm": 8.781967163085938, + "learning_rate": 4.145923105886291e-06, + "loss": 0.6549, + "step": 18949 + }, + { + "epoch": 2.38, + "grad_norm": 5.892064571380615, + "learning_rate": 4.145086390829604e-06, + "loss": 0.2349, + "step": 18950 + }, + { + "epoch": 2.38, + "grad_norm": 11.855024337768555, + "learning_rate": 4.144249675772916e-06, + "loss": 0.5825, + "step": 18951 + }, + { + "epoch": 2.38, + "grad_norm": 75.8827896118164, + "learning_rate": 4.143412960716229e-06, + "loss": 1.6483, + "step": 18952 + }, + { + "epoch": 2.38, + "grad_norm": 35.53962707519531, + "learning_rate": 4.1425762456595406e-06, + "loss": 1.5198, + "step": 18953 + }, + { + "epoch": 2.38, + "grad_norm": 5.657505989074707, + "learning_rate": 4.141739530602853e-06, + "loss": 0.3553, + "step": 18954 + }, + { + "epoch": 2.38, + "grad_norm": 12.064531326293945, + "learning_rate": 4.140902815546166e-06, + "loss": 0.2139, + "step": 18955 + }, + { + "epoch": 2.38, + "grad_norm": 5.048888206481934, + "learning_rate": 4.140066100489479e-06, + "loss": 0.3971, + "step": 18956 + }, + { + "epoch": 2.38, + "grad_norm": 9.145115852355957, + "learning_rate": 4.139229385432792e-06, + "loss": 0.4083, + "step": 18957 + }, + { + "epoch": 2.38, + "grad_norm": 6.697978973388672, + "learning_rate": 4.138392670376104e-06, + "loss": 0.4068, + "step": 18958 + }, + { + "epoch": 2.38, + "grad_norm": 10.511747360229492, + "learning_rate": 4.1375559553194165e-06, + "loss": 1.393, + "step": 18959 + }, + { + "epoch": 2.38, + "grad_norm": 3.4989471435546875, + "learning_rate": 4.1367192402627285e-06, + "loss": 0.0813, + "step": 18960 + }, + { + "epoch": 2.38, + "grad_norm": 14.240618705749512, + "learning_rate": 4.135882525206041e-06, + "loss": 0.3611, + "step": 18961 + }, + { + "epoch": 2.38, + "grad_norm": 12.395482063293457, + "learning_rate": 4.135045810149354e-06, + "loss": 1.7359, + "step": 18962 + }, + { + "epoch": 2.38, + "grad_norm": 92.8197250366211, + "learning_rate": 4.134209095092667e-06, + "loss": 2.6389, + "step": 18963 + }, + { + "epoch": 2.38, + "grad_norm": 19.586116790771484, + "learning_rate": 4.13337238003598e-06, + "loss": 0.3887, + "step": 18964 + }, + { + "epoch": 2.38, + "grad_norm": 12.205307960510254, + "learning_rate": 4.132535664979292e-06, + "loss": 1.1692, + "step": 18965 + }, + { + "epoch": 2.38, + "grad_norm": 59.4495964050293, + "learning_rate": 4.131698949922604e-06, + "loss": 2.6039, + "step": 18966 + }, + { + "epoch": 2.38, + "grad_norm": 5.818330764770508, + "learning_rate": 4.130862234865916e-06, + "loss": 0.5976, + "step": 18967 + }, + { + "epoch": 2.38, + "grad_norm": 12.355748176574707, + "learning_rate": 4.130025519809229e-06, + "loss": 1.4166, + "step": 18968 + }, + { + "epoch": 2.38, + "grad_norm": 21.618093490600586, + "learning_rate": 4.129188804752542e-06, + "loss": 1.5143, + "step": 18969 + }, + { + "epoch": 2.38, + "grad_norm": 21.394245147705078, + "learning_rate": 4.128352089695855e-06, + "loss": 0.7113, + "step": 18970 + }, + { + "epoch": 2.38, + "grad_norm": 15.48207950592041, + "learning_rate": 4.127515374639167e-06, + "loss": 0.1245, + "step": 18971 + }, + { + "epoch": 2.38, + "grad_norm": 11.195927619934082, + "learning_rate": 4.1266786595824796e-06, + "loss": 0.7492, + "step": 18972 + }, + { + "epoch": 2.38, + "grad_norm": 18.20484733581543, + "learning_rate": 4.1258419445257915e-06, + "loss": 0.6182, + "step": 18973 + }, + { + "epoch": 2.38, + "grad_norm": 13.016801834106445, + "learning_rate": 4.125005229469104e-06, + "loss": 0.7796, + "step": 18974 + }, + { + "epoch": 2.38, + "grad_norm": 12.392509460449219, + "learning_rate": 4.124168514412417e-06, + "loss": 1.2688, + "step": 18975 + }, + { + "epoch": 2.38, + "grad_norm": 27.529605865478516, + "learning_rate": 4.12333179935573e-06, + "loss": 1.6192, + "step": 18976 + }, + { + "epoch": 2.38, + "grad_norm": 14.227264404296875, + "learning_rate": 4.122495084299043e-06, + "loss": 0.5007, + "step": 18977 + }, + { + "epoch": 2.38, + "grad_norm": 6.663363456726074, + "learning_rate": 4.121658369242355e-06, + "loss": 0.3257, + "step": 18978 + }, + { + "epoch": 2.38, + "grad_norm": 16.237409591674805, + "learning_rate": 4.1208216541856675e-06, + "loss": 0.8368, + "step": 18979 + }, + { + "epoch": 2.38, + "grad_norm": 4.636362552642822, + "learning_rate": 4.1199849391289794e-06, + "loss": 0.9216, + "step": 18980 + }, + { + "epoch": 2.38, + "grad_norm": 15.949992179870605, + "learning_rate": 4.119148224072292e-06, + "loss": 0.783, + "step": 18981 + }, + { + "epoch": 2.38, + "grad_norm": 8.62476921081543, + "learning_rate": 4.118311509015605e-06, + "loss": 0.4269, + "step": 18982 + }, + { + "epoch": 2.38, + "grad_norm": 13.7176513671875, + "learning_rate": 4.117474793958918e-06, + "loss": 0.3969, + "step": 18983 + }, + { + "epoch": 2.38, + "grad_norm": 6.941378116607666, + "learning_rate": 4.116638078902231e-06, + "loss": 0.2093, + "step": 18984 + }, + { + "epoch": 2.38, + "grad_norm": 21.555898666381836, + "learning_rate": 4.115801363845543e-06, + "loss": 0.7242, + "step": 18985 + }, + { + "epoch": 2.38, + "grad_norm": 19.81019401550293, + "learning_rate": 4.114964648788855e-06, + "loss": 0.4602, + "step": 18986 + }, + { + "epoch": 2.38, + "grad_norm": 11.031840324401855, + "learning_rate": 4.114127933732167e-06, + "loss": 0.4644, + "step": 18987 + }, + { + "epoch": 2.38, + "grad_norm": 18.960458755493164, + "learning_rate": 4.11329121867548e-06, + "loss": 0.6422, + "step": 18988 + }, + { + "epoch": 2.38, + "grad_norm": 10.122111320495605, + "learning_rate": 4.112454503618793e-06, + "loss": 1.5367, + "step": 18989 + }, + { + "epoch": 2.38, + "grad_norm": 21.96918487548828, + "learning_rate": 4.111617788562106e-06, + "loss": 0.5202, + "step": 18990 + }, + { + "epoch": 2.38, + "grad_norm": 39.133480072021484, + "learning_rate": 4.1107810735054186e-06, + "loss": 1.3954, + "step": 18991 + }, + { + "epoch": 2.38, + "grad_norm": 14.091297149658203, + "learning_rate": 4.1099443584487305e-06, + "loss": 0.4381, + "step": 18992 + }, + { + "epoch": 2.38, + "grad_norm": 23.905752182006836, + "learning_rate": 4.109107643392043e-06, + "loss": 1.4625, + "step": 18993 + }, + { + "epoch": 2.38, + "grad_norm": 8.82052993774414, + "learning_rate": 4.108270928335355e-06, + "loss": 1.88, + "step": 18994 + }, + { + "epoch": 2.38, + "grad_norm": 31.28426742553711, + "learning_rate": 4.107434213278668e-06, + "loss": 0.8463, + "step": 18995 + }, + { + "epoch": 2.38, + "grad_norm": 25.97709083557129, + "learning_rate": 4.106597498221981e-06, + "loss": 1.4488, + "step": 18996 + }, + { + "epoch": 2.38, + "grad_norm": 6.084327220916748, + "learning_rate": 4.105760783165294e-06, + "loss": 0.4668, + "step": 18997 + }, + { + "epoch": 2.38, + "grad_norm": 6.0948405265808105, + "learning_rate": 4.1049240681086065e-06, + "loss": 0.7042, + "step": 18998 + }, + { + "epoch": 2.38, + "grad_norm": 458.3492431640625, + "learning_rate": 4.1040873530519184e-06, + "loss": 2.2982, + "step": 18999 + }, + { + "epoch": 2.38, + "grad_norm": 16.414735794067383, + "learning_rate": 4.10325063799523e-06, + "loss": 0.6207, + "step": 19000 + }, + { + "epoch": 2.38, + "grad_norm": 13.103593826293945, + "learning_rate": 4.102413922938543e-06, + "loss": 0.9018, + "step": 19001 + }, + { + "epoch": 2.38, + "grad_norm": 6.032532691955566, + "learning_rate": 4.101577207881856e-06, + "loss": 0.1367, + "step": 19002 + }, + { + "epoch": 2.38, + "grad_norm": 30.239131927490234, + "learning_rate": 4.100740492825169e-06, + "loss": 1.9775, + "step": 19003 + }, + { + "epoch": 2.38, + "grad_norm": 16.82728385925293, + "learning_rate": 4.099903777768482e-06, + "loss": 1.0877, + "step": 19004 + }, + { + "epoch": 2.39, + "grad_norm": 56.356014251708984, + "learning_rate": 4.099067062711794e-06, + "loss": 2.3591, + "step": 19005 + }, + { + "epoch": 2.39, + "grad_norm": 36.4920539855957, + "learning_rate": 4.098230347655106e-06, + "loss": 1.2582, + "step": 19006 + }, + { + "epoch": 2.39, + "grad_norm": 10.160242080688477, + "learning_rate": 4.097393632598418e-06, + "loss": 0.9966, + "step": 19007 + }, + { + "epoch": 2.39, + "grad_norm": 4.926279544830322, + "learning_rate": 4.096556917541731e-06, + "loss": 0.0569, + "step": 19008 + }, + { + "epoch": 2.39, + "grad_norm": 13.520187377929688, + "learning_rate": 4.095720202485044e-06, + "loss": 0.714, + "step": 19009 + }, + { + "epoch": 2.39, + "grad_norm": 6.462107181549072, + "learning_rate": 4.094883487428357e-06, + "loss": 0.1518, + "step": 19010 + }, + { + "epoch": 2.39, + "grad_norm": 124.68399810791016, + "learning_rate": 4.0940467723716695e-06, + "loss": 2.5772, + "step": 19011 + }, + { + "epoch": 2.39, + "grad_norm": 53.15134048461914, + "learning_rate": 4.0932100573149815e-06, + "loss": 0.9663, + "step": 19012 + }, + { + "epoch": 2.39, + "grad_norm": 38.00040817260742, + "learning_rate": 4.092373342258294e-06, + "loss": 1.5956, + "step": 19013 + }, + { + "epoch": 2.39, + "grad_norm": 9.098489761352539, + "learning_rate": 4.091536627201606e-06, + "loss": 0.3268, + "step": 19014 + }, + { + "epoch": 2.39, + "grad_norm": 46.67536544799805, + "learning_rate": 4.090699912144919e-06, + "loss": 3.1959, + "step": 19015 + }, + { + "epoch": 2.39, + "grad_norm": 50.20859146118164, + "learning_rate": 4.089863197088232e-06, + "loss": 0.9767, + "step": 19016 + }, + { + "epoch": 2.39, + "grad_norm": 11.621726036071777, + "learning_rate": 4.089026482031545e-06, + "loss": 0.9935, + "step": 19017 + }, + { + "epoch": 2.39, + "grad_norm": 6.445420742034912, + "learning_rate": 4.0881897669748574e-06, + "loss": 0.3812, + "step": 19018 + }, + { + "epoch": 2.39, + "grad_norm": 25.69889259338379, + "learning_rate": 4.087353051918169e-06, + "loss": 0.7589, + "step": 19019 + }, + { + "epoch": 2.39, + "grad_norm": 1.7841275930404663, + "learning_rate": 4.086516336861482e-06, + "loss": 0.1167, + "step": 19020 + }, + { + "epoch": 2.39, + "grad_norm": 6.596001625061035, + "learning_rate": 4.085679621804794e-06, + "loss": 0.5149, + "step": 19021 + }, + { + "epoch": 2.39, + "grad_norm": 17.258195877075195, + "learning_rate": 4.084842906748107e-06, + "loss": 0.4251, + "step": 19022 + }, + { + "epoch": 2.39, + "grad_norm": 18.278339385986328, + "learning_rate": 4.08400619169142e-06, + "loss": 0.9196, + "step": 19023 + }, + { + "epoch": 2.39, + "grad_norm": 15.891921043395996, + "learning_rate": 4.0831694766347326e-06, + "loss": 1.2198, + "step": 19024 + }, + { + "epoch": 2.39, + "grad_norm": 12.87957763671875, + "learning_rate": 4.082332761578045e-06, + "loss": 1.3263, + "step": 19025 + }, + { + "epoch": 2.39, + "grad_norm": 8.608296394348145, + "learning_rate": 4.081496046521357e-06, + "loss": 0.342, + "step": 19026 + }, + { + "epoch": 2.39, + "grad_norm": 17.928401947021484, + "learning_rate": 4.08065933146467e-06, + "loss": 1.6522, + "step": 19027 + }, + { + "epoch": 2.39, + "grad_norm": 9.473708152770996, + "learning_rate": 4.079822616407982e-06, + "loss": 0.8328, + "step": 19028 + }, + { + "epoch": 2.39, + "grad_norm": 10.671655654907227, + "learning_rate": 4.078985901351295e-06, + "loss": 1.4546, + "step": 19029 + }, + { + "epoch": 2.39, + "grad_norm": 69.41703033447266, + "learning_rate": 4.078149186294608e-06, + "loss": 1.2106, + "step": 19030 + }, + { + "epoch": 2.39, + "grad_norm": 28.889257431030273, + "learning_rate": 4.0773124712379205e-06, + "loss": 1.7755, + "step": 19031 + }, + { + "epoch": 2.39, + "grad_norm": 9.402454376220703, + "learning_rate": 4.076475756181233e-06, + "loss": 0.3341, + "step": 19032 + }, + { + "epoch": 2.39, + "grad_norm": 7.450305938720703, + "learning_rate": 4.075639041124545e-06, + "loss": 0.3173, + "step": 19033 + }, + { + "epoch": 2.39, + "grad_norm": 8.831427574157715, + "learning_rate": 4.074802326067858e-06, + "loss": 1.5428, + "step": 19034 + }, + { + "epoch": 2.39, + "grad_norm": 18.86667823791504, + "learning_rate": 4.07396561101117e-06, + "loss": 0.18, + "step": 19035 + }, + { + "epoch": 2.39, + "grad_norm": 16.75472068786621, + "learning_rate": 4.073128895954483e-06, + "loss": 0.6608, + "step": 19036 + }, + { + "epoch": 2.39, + "grad_norm": 23.9716854095459, + "learning_rate": 4.072292180897796e-06, + "loss": 1.7005, + "step": 19037 + }, + { + "epoch": 2.39, + "grad_norm": 10.454980850219727, + "learning_rate": 4.071455465841108e-06, + "loss": 1.2324, + "step": 19038 + }, + { + "epoch": 2.39, + "grad_norm": 33.47016143798828, + "learning_rate": 4.070618750784421e-06, + "loss": 0.7504, + "step": 19039 + }, + { + "epoch": 2.39, + "grad_norm": 18.456640243530273, + "learning_rate": 4.069782035727733e-06, + "loss": 1.3785, + "step": 19040 + }, + { + "epoch": 2.39, + "grad_norm": 19.197938919067383, + "learning_rate": 4.068945320671046e-06, + "loss": 1.41, + "step": 19041 + }, + { + "epoch": 2.39, + "grad_norm": 18.04644012451172, + "learning_rate": 4.068108605614358e-06, + "loss": 0.7376, + "step": 19042 + }, + { + "epoch": 2.39, + "grad_norm": 22.988388061523438, + "learning_rate": 4.067271890557671e-06, + "loss": 1.5383, + "step": 19043 + }, + { + "epoch": 2.39, + "grad_norm": 5.129838943481445, + "learning_rate": 4.0664351755009835e-06, + "loss": 0.1589, + "step": 19044 + }, + { + "epoch": 2.39, + "grad_norm": 15.090490341186523, + "learning_rate": 4.065598460444296e-06, + "loss": 0.4905, + "step": 19045 + }, + { + "epoch": 2.39, + "grad_norm": 21.809528350830078, + "learning_rate": 4.064761745387609e-06, + "loss": 2.1405, + "step": 19046 + }, + { + "epoch": 2.39, + "grad_norm": 40.71038055419922, + "learning_rate": 4.063925030330921e-06, + "loss": 0.5077, + "step": 19047 + }, + { + "epoch": 2.39, + "grad_norm": 15.136262893676758, + "learning_rate": 4.063088315274233e-06, + "loss": 1.4303, + "step": 19048 + }, + { + "epoch": 2.39, + "grad_norm": 27.37680435180664, + "learning_rate": 4.062251600217546e-06, + "loss": 2.7442, + "step": 19049 + }, + { + "epoch": 2.39, + "grad_norm": 7.498646259307861, + "learning_rate": 4.061414885160859e-06, + "loss": 1.2976, + "step": 19050 + }, + { + "epoch": 2.39, + "grad_norm": 12.314183235168457, + "learning_rate": 4.0605781701041714e-06, + "loss": 0.6545, + "step": 19051 + }, + { + "epoch": 2.39, + "grad_norm": 13.800204277038574, + "learning_rate": 4.059741455047484e-06, + "loss": 1.7657, + "step": 19052 + }, + { + "epoch": 2.39, + "grad_norm": 13.44913101196289, + "learning_rate": 4.058904739990796e-06, + "loss": 0.4265, + "step": 19053 + }, + { + "epoch": 2.39, + "grad_norm": 19.050992965698242, + "learning_rate": 4.058068024934109e-06, + "loss": 0.9761, + "step": 19054 + }, + { + "epoch": 2.39, + "grad_norm": 30.88131332397461, + "learning_rate": 4.057231309877421e-06, + "loss": 1.2247, + "step": 19055 + }, + { + "epoch": 2.39, + "grad_norm": 21.770322799682617, + "learning_rate": 4.056394594820734e-06, + "loss": 0.6648, + "step": 19056 + }, + { + "epoch": 2.39, + "grad_norm": 33.90979766845703, + "learning_rate": 4.0555578797640466e-06, + "loss": 1.006, + "step": 19057 + }, + { + "epoch": 2.39, + "grad_norm": 26.196422576904297, + "learning_rate": 4.054721164707359e-06, + "loss": 0.5683, + "step": 19058 + }, + { + "epoch": 2.39, + "grad_norm": 5.3883891105651855, + "learning_rate": 4.053884449650672e-06, + "loss": 0.5584, + "step": 19059 + }, + { + "epoch": 2.39, + "grad_norm": 22.187639236450195, + "learning_rate": 4.053047734593984e-06, + "loss": 2.7119, + "step": 19060 + }, + { + "epoch": 2.39, + "grad_norm": 40.63218688964844, + "learning_rate": 4.052211019537297e-06, + "loss": 1.6813, + "step": 19061 + }, + { + "epoch": 2.39, + "grad_norm": 9.716521263122559, + "learning_rate": 4.051374304480609e-06, + "loss": 0.2526, + "step": 19062 + }, + { + "epoch": 2.39, + "grad_norm": 9.759636878967285, + "learning_rate": 4.050537589423922e-06, + "loss": 0.4243, + "step": 19063 + }, + { + "epoch": 2.39, + "grad_norm": 27.638835906982422, + "learning_rate": 4.0497008743672345e-06, + "loss": 0.8442, + "step": 19064 + }, + { + "epoch": 2.39, + "grad_norm": 15.038436889648438, + "learning_rate": 4.048864159310547e-06, + "loss": 0.7551, + "step": 19065 + }, + { + "epoch": 2.39, + "grad_norm": 369.1249084472656, + "learning_rate": 4.04802744425386e-06, + "loss": 2.6777, + "step": 19066 + }, + { + "epoch": 2.39, + "grad_norm": 12.90273666381836, + "learning_rate": 4.047190729197172e-06, + "loss": 1.0085, + "step": 19067 + }, + { + "epoch": 2.39, + "grad_norm": 16.095760345458984, + "learning_rate": 4.046354014140485e-06, + "loss": 0.7467, + "step": 19068 + }, + { + "epoch": 2.39, + "grad_norm": 6.69637393951416, + "learning_rate": 4.045517299083797e-06, + "loss": 0.5625, + "step": 19069 + }, + { + "epoch": 2.39, + "grad_norm": 19.717710494995117, + "learning_rate": 4.04468058402711e-06, + "loss": 0.9955, + "step": 19070 + }, + { + "epoch": 2.39, + "grad_norm": 15.726923942565918, + "learning_rate": 4.043843868970422e-06, + "loss": 2.3147, + "step": 19071 + }, + { + "epoch": 2.39, + "grad_norm": 6.696619033813477, + "learning_rate": 4.043007153913735e-06, + "loss": 0.4699, + "step": 19072 + }, + { + "epoch": 2.39, + "grad_norm": 14.654072761535645, + "learning_rate": 4.042170438857048e-06, + "loss": 0.4272, + "step": 19073 + }, + { + "epoch": 2.39, + "grad_norm": 20.2203311920166, + "learning_rate": 4.04133372380036e-06, + "loss": 1.061, + "step": 19074 + }, + { + "epoch": 2.39, + "grad_norm": 21.050973892211914, + "learning_rate": 4.040497008743673e-06, + "loss": 0.7387, + "step": 19075 + }, + { + "epoch": 2.39, + "grad_norm": 13.224677085876465, + "learning_rate": 4.039660293686985e-06, + "loss": 0.5062, + "step": 19076 + }, + { + "epoch": 2.39, + "grad_norm": 10.98463249206543, + "learning_rate": 4.0388235786302975e-06, + "loss": 1.1087, + "step": 19077 + }, + { + "epoch": 2.39, + "grad_norm": 130.39227294921875, + "learning_rate": 4.03798686357361e-06, + "loss": 1.1727, + "step": 19078 + }, + { + "epoch": 2.39, + "grad_norm": 20.21027374267578, + "learning_rate": 4.037150148516923e-06, + "loss": 1.056, + "step": 19079 + }, + { + "epoch": 2.39, + "grad_norm": 9.529265403747559, + "learning_rate": 4.036313433460236e-06, + "loss": 0.851, + "step": 19080 + }, + { + "epoch": 2.39, + "grad_norm": 18.91885757446289, + "learning_rate": 4.035476718403548e-06, + "loss": 1.0852, + "step": 19081 + }, + { + "epoch": 2.39, + "grad_norm": 19.511430740356445, + "learning_rate": 4.034640003346861e-06, + "loss": 1.0616, + "step": 19082 + }, + { + "epoch": 2.39, + "grad_norm": 26.91684341430664, + "learning_rate": 4.033803288290173e-06, + "loss": 0.1865, + "step": 19083 + }, + { + "epoch": 2.4, + "grad_norm": 11.549873352050781, + "learning_rate": 4.032966573233485e-06, + "loss": 1.4849, + "step": 19084 + }, + { + "epoch": 2.4, + "grad_norm": 21.561246871948242, + "learning_rate": 4.032129858176798e-06, + "loss": 1.0529, + "step": 19085 + }, + { + "epoch": 2.4, + "grad_norm": 6.6802754402160645, + "learning_rate": 4.031293143120111e-06, + "loss": 0.7517, + "step": 19086 + }, + { + "epoch": 2.4, + "grad_norm": 30.852449417114258, + "learning_rate": 4.030456428063424e-06, + "loss": 0.6937, + "step": 19087 + }, + { + "epoch": 2.4, + "grad_norm": 65.0311050415039, + "learning_rate": 4.029619713006736e-06, + "loss": 1.7491, + "step": 19088 + }, + { + "epoch": 2.4, + "grad_norm": 21.973583221435547, + "learning_rate": 4.028782997950049e-06, + "loss": 1.6574, + "step": 19089 + }, + { + "epoch": 2.4, + "grad_norm": 10.833308219909668, + "learning_rate": 4.0279462828933605e-06, + "loss": 1.639, + "step": 19090 + }, + { + "epoch": 2.4, + "grad_norm": 15.514629364013672, + "learning_rate": 4.027109567836673e-06, + "loss": 0.9653, + "step": 19091 + }, + { + "epoch": 2.4, + "grad_norm": 30.378694534301758, + "learning_rate": 4.026272852779986e-06, + "loss": 1.764, + "step": 19092 + }, + { + "epoch": 2.4, + "grad_norm": 22.07337760925293, + "learning_rate": 4.025436137723299e-06, + "loss": 1.5077, + "step": 19093 + }, + { + "epoch": 2.4, + "grad_norm": 3.136129856109619, + "learning_rate": 4.024599422666611e-06, + "loss": 0.1062, + "step": 19094 + }, + { + "epoch": 2.4, + "grad_norm": 6.182592868804932, + "learning_rate": 4.023762707609924e-06, + "loss": 0.6325, + "step": 19095 + }, + { + "epoch": 2.4, + "grad_norm": 14.115240097045898, + "learning_rate": 4.0229259925532365e-06, + "loss": 1.8024, + "step": 19096 + }, + { + "epoch": 2.4, + "grad_norm": 44.72590255737305, + "learning_rate": 4.0220892774965485e-06, + "loss": 3.7641, + "step": 19097 + }, + { + "epoch": 2.4, + "grad_norm": 37.94907760620117, + "learning_rate": 4.021252562439861e-06, + "loss": 1.1817, + "step": 19098 + }, + { + "epoch": 2.4, + "grad_norm": 26.96363639831543, + "learning_rate": 4.020415847383174e-06, + "loss": 0.6482, + "step": 19099 + }, + { + "epoch": 2.4, + "grad_norm": 17.43169593811035, + "learning_rate": 4.019579132326487e-06, + "loss": 0.6672, + "step": 19100 + }, + { + "epoch": 2.4, + "grad_norm": 8.113611221313477, + "learning_rate": 4.018742417269799e-06, + "loss": 0.5586, + "step": 19101 + }, + { + "epoch": 2.4, + "grad_norm": 33.545021057128906, + "learning_rate": 4.017905702213112e-06, + "loss": 2.3937, + "step": 19102 + }, + { + "epoch": 2.4, + "grad_norm": 7.373061656951904, + "learning_rate": 4.017068987156424e-06, + "loss": 1.0229, + "step": 19103 + }, + { + "epoch": 2.4, + "grad_norm": 24.821794509887695, + "learning_rate": 4.016232272099736e-06, + "loss": 0.8089, + "step": 19104 + }, + { + "epoch": 2.4, + "grad_norm": 11.429865837097168, + "learning_rate": 4.015395557043049e-06, + "loss": 1.1606, + "step": 19105 + }, + { + "epoch": 2.4, + "grad_norm": 86.04908752441406, + "learning_rate": 4.014558841986362e-06, + "loss": 1.2803, + "step": 19106 + }, + { + "epoch": 2.4, + "grad_norm": 8.572955131530762, + "learning_rate": 4.013722126929675e-06, + "loss": 1.4158, + "step": 19107 + }, + { + "epoch": 2.4, + "grad_norm": 19.158706665039062, + "learning_rate": 4.012885411872987e-06, + "loss": 0.7274, + "step": 19108 + }, + { + "epoch": 2.4, + "grad_norm": 7.97062349319458, + "learning_rate": 4.0120486968162995e-06, + "loss": 2.0569, + "step": 19109 + }, + { + "epoch": 2.4, + "grad_norm": 22.070228576660156, + "learning_rate": 4.011211981759612e-06, + "loss": 0.5531, + "step": 19110 + }, + { + "epoch": 2.4, + "grad_norm": 11.128243446350098, + "learning_rate": 4.010375266702924e-06, + "loss": 0.5593, + "step": 19111 + }, + { + "epoch": 2.4, + "grad_norm": 9.367835998535156, + "learning_rate": 4.009538551646237e-06, + "loss": 0.3618, + "step": 19112 + }, + { + "epoch": 2.4, + "grad_norm": 23.092803955078125, + "learning_rate": 4.00870183658955e-06, + "loss": 0.8735, + "step": 19113 + }, + { + "epoch": 2.4, + "grad_norm": 7.3652849197387695, + "learning_rate": 4.007865121532863e-06, + "loss": 0.729, + "step": 19114 + }, + { + "epoch": 2.4, + "grad_norm": 5.7102227210998535, + "learning_rate": 4.007028406476175e-06, + "loss": 0.2235, + "step": 19115 + }, + { + "epoch": 2.4, + "grad_norm": 13.90251636505127, + "learning_rate": 4.0061916914194875e-06, + "loss": 1.2297, + "step": 19116 + }, + { + "epoch": 2.4, + "grad_norm": 36.02486801147461, + "learning_rate": 4.0053549763628e-06, + "loss": 1.7103, + "step": 19117 + }, + { + "epoch": 2.4, + "grad_norm": 44.666255950927734, + "learning_rate": 4.004518261306112e-06, + "loss": 1.6022, + "step": 19118 + }, + { + "epoch": 2.4, + "grad_norm": 29.21457862854004, + "learning_rate": 4.003681546249425e-06, + "loss": 1.1878, + "step": 19119 + }, + { + "epoch": 2.4, + "grad_norm": 13.307560920715332, + "learning_rate": 4.002844831192738e-06, + "loss": 0.7035, + "step": 19120 + }, + { + "epoch": 2.4, + "grad_norm": 17.413768768310547, + "learning_rate": 4.002008116136051e-06, + "loss": 0.7771, + "step": 19121 + }, + { + "epoch": 2.4, + "grad_norm": 4.2511372566223145, + "learning_rate": 4.0011714010793626e-06, + "loss": 0.1663, + "step": 19122 + }, + { + "epoch": 2.4, + "grad_norm": 10.610640525817871, + "learning_rate": 4.000334686022675e-06, + "loss": 0.7032, + "step": 19123 + }, + { + "epoch": 2.4, + "grad_norm": 8.91777515411377, + "learning_rate": 3.999497970965988e-06, + "loss": 2.1738, + "step": 19124 + }, + { + "epoch": 2.4, + "grad_norm": 17.62346076965332, + "learning_rate": 3.9986612559093e-06, + "loss": 0.7938, + "step": 19125 + }, + { + "epoch": 2.4, + "grad_norm": 23.286380767822266, + "learning_rate": 3.997824540852613e-06, + "loss": 1.5049, + "step": 19126 + }, + { + "epoch": 2.4, + "grad_norm": 21.39017105102539, + "learning_rate": 3.996987825795926e-06, + "loss": 1.7285, + "step": 19127 + }, + { + "epoch": 2.4, + "grad_norm": 21.869333267211914, + "learning_rate": 3.9961511107392385e-06, + "loss": 1.0293, + "step": 19128 + }, + { + "epoch": 2.4, + "grad_norm": 19.568134307861328, + "learning_rate": 3.9953143956825505e-06, + "loss": 1.342, + "step": 19129 + }, + { + "epoch": 2.4, + "grad_norm": 13.146456718444824, + "learning_rate": 3.994477680625863e-06, + "loss": 1.8469, + "step": 19130 + }, + { + "epoch": 2.4, + "grad_norm": 17.104389190673828, + "learning_rate": 3.993640965569176e-06, + "loss": 0.6416, + "step": 19131 + }, + { + "epoch": 2.4, + "grad_norm": 14.854141235351562, + "learning_rate": 3.992804250512488e-06, + "loss": 0.7681, + "step": 19132 + }, + { + "epoch": 2.4, + "grad_norm": 31.403024673461914, + "learning_rate": 3.991967535455801e-06, + "loss": 2.6604, + "step": 19133 + }, + { + "epoch": 2.4, + "grad_norm": 58.17258071899414, + "learning_rate": 3.991130820399114e-06, + "loss": 1.5447, + "step": 19134 + }, + { + "epoch": 2.4, + "grad_norm": 20.589649200439453, + "learning_rate": 3.990294105342426e-06, + "loss": 3.3073, + "step": 19135 + }, + { + "epoch": 2.4, + "grad_norm": 11.538389205932617, + "learning_rate": 3.989457390285738e-06, + "loss": 0.4872, + "step": 19136 + }, + { + "epoch": 2.4, + "grad_norm": 13.821202278137207, + "learning_rate": 3.988620675229051e-06, + "loss": 1.3287, + "step": 19137 + }, + { + "epoch": 2.4, + "grad_norm": 38.13163375854492, + "learning_rate": 3.987783960172363e-06, + "loss": 1.3356, + "step": 19138 + }, + { + "epoch": 2.4, + "grad_norm": 69.0101089477539, + "learning_rate": 3.986947245115676e-06, + "loss": 1.0536, + "step": 19139 + }, + { + "epoch": 2.4, + "grad_norm": 38.037227630615234, + "learning_rate": 3.986110530058989e-06, + "loss": 0.4617, + "step": 19140 + }, + { + "epoch": 2.4, + "grad_norm": 3.5604493618011475, + "learning_rate": 3.9852738150023016e-06, + "loss": 0.1397, + "step": 19141 + }, + { + "epoch": 2.4, + "grad_norm": 11.344226837158203, + "learning_rate": 3.9844370999456135e-06, + "loss": 0.3115, + "step": 19142 + }, + { + "epoch": 2.4, + "grad_norm": 22.58791732788086, + "learning_rate": 3.983600384888926e-06, + "loss": 1.9093, + "step": 19143 + }, + { + "epoch": 2.4, + "grad_norm": 14.093499183654785, + "learning_rate": 3.982763669832239e-06, + "loss": 0.4885, + "step": 19144 + }, + { + "epoch": 2.4, + "grad_norm": 12.28605842590332, + "learning_rate": 3.981926954775551e-06, + "loss": 1.5395, + "step": 19145 + }, + { + "epoch": 2.4, + "grad_norm": 200.0440673828125, + "learning_rate": 3.981090239718864e-06, + "loss": 1.5086, + "step": 19146 + }, + { + "epoch": 2.4, + "grad_norm": 15.651595115661621, + "learning_rate": 3.980253524662177e-06, + "loss": 0.5725, + "step": 19147 + }, + { + "epoch": 2.4, + "grad_norm": 99.0028076171875, + "learning_rate": 3.9794168096054895e-06, + "loss": 1.0011, + "step": 19148 + }, + { + "epoch": 2.4, + "grad_norm": 31.75554847717285, + "learning_rate": 3.9785800945488014e-06, + "loss": 0.994, + "step": 19149 + }, + { + "epoch": 2.4, + "grad_norm": 35.45192337036133, + "learning_rate": 3.977743379492114e-06, + "loss": 1.4886, + "step": 19150 + }, + { + "epoch": 2.4, + "grad_norm": 14.628107070922852, + "learning_rate": 3.976906664435427e-06, + "loss": 0.4394, + "step": 19151 + }, + { + "epoch": 2.4, + "grad_norm": 14.06891918182373, + "learning_rate": 3.976069949378739e-06, + "loss": 1.1679, + "step": 19152 + }, + { + "epoch": 2.4, + "grad_norm": 12.731145858764648, + "learning_rate": 3.975233234322052e-06, + "loss": 1.7926, + "step": 19153 + }, + { + "epoch": 2.4, + "grad_norm": 11.016291618347168, + "learning_rate": 3.974396519265365e-06, + "loss": 0.4143, + "step": 19154 + }, + { + "epoch": 2.4, + "grad_norm": 26.453344345092773, + "learning_rate": 3.973559804208677e-06, + "loss": 1.2922, + "step": 19155 + }, + { + "epoch": 2.4, + "grad_norm": 22.435813903808594, + "learning_rate": 3.972723089151989e-06, + "loss": 0.9505, + "step": 19156 + }, + { + "epoch": 2.4, + "grad_norm": 14.01541805267334, + "learning_rate": 3.971886374095302e-06, + "loss": 0.505, + "step": 19157 + }, + { + "epoch": 2.4, + "grad_norm": 18.743432998657227, + "learning_rate": 3.971049659038615e-06, + "loss": 1.2548, + "step": 19158 + }, + { + "epoch": 2.4, + "grad_norm": 23.11290740966797, + "learning_rate": 3.970212943981927e-06, + "loss": 1.0154, + "step": 19159 + }, + { + "epoch": 2.4, + "grad_norm": 24.722064971923828, + "learning_rate": 3.96937622892524e-06, + "loss": 0.6556, + "step": 19160 + }, + { + "epoch": 2.4, + "grad_norm": 10.541865348815918, + "learning_rate": 3.9685395138685525e-06, + "loss": 0.4639, + "step": 19161 + }, + { + "epoch": 2.4, + "grad_norm": 31.57651710510254, + "learning_rate": 3.967702798811865e-06, + "loss": 1.409, + "step": 19162 + }, + { + "epoch": 2.4, + "grad_norm": 6.214509963989258, + "learning_rate": 3.966866083755177e-06, + "loss": 0.5104, + "step": 19163 + }, + { + "epoch": 2.41, + "grad_norm": 11.101883888244629, + "learning_rate": 3.96602936869849e-06, + "loss": 0.7904, + "step": 19164 + }, + { + "epoch": 2.41, + "grad_norm": 22.653268814086914, + "learning_rate": 3.965192653641803e-06, + "loss": 2.0983, + "step": 19165 + }, + { + "epoch": 2.41, + "grad_norm": 20.678075790405273, + "learning_rate": 3.964355938585115e-06, + "loss": 1.9968, + "step": 19166 + }, + { + "epoch": 2.41, + "grad_norm": 80.93486785888672, + "learning_rate": 3.963519223528428e-06, + "loss": 0.8472, + "step": 19167 + }, + { + "epoch": 2.41, + "grad_norm": 9.310606956481934, + "learning_rate": 3.9626825084717404e-06, + "loss": 0.5267, + "step": 19168 + }, + { + "epoch": 2.41, + "grad_norm": 17.505313873291016, + "learning_rate": 3.961845793415052e-06, + "loss": 2.1095, + "step": 19169 + }, + { + "epoch": 2.41, + "grad_norm": 6.054166793823242, + "learning_rate": 3.961009078358365e-06, + "loss": 0.2008, + "step": 19170 + }, + { + "epoch": 2.41, + "grad_norm": 28.922883987426758, + "learning_rate": 3.960172363301678e-06, + "loss": 1.1252, + "step": 19171 + }, + { + "epoch": 2.41, + "grad_norm": 7.792020320892334, + "learning_rate": 3.959335648244991e-06, + "loss": 0.5446, + "step": 19172 + }, + { + "epoch": 2.41, + "grad_norm": 55.393497467041016, + "learning_rate": 3.958498933188303e-06, + "loss": 2.2708, + "step": 19173 + }, + { + "epoch": 2.41, + "grad_norm": 10.111420631408691, + "learning_rate": 3.9576622181316156e-06, + "loss": 1.2057, + "step": 19174 + }, + { + "epoch": 2.41, + "grad_norm": 17.73046112060547, + "learning_rate": 3.956825503074928e-06, + "loss": 1.1219, + "step": 19175 + }, + { + "epoch": 2.41, + "grad_norm": 8.470479011535645, + "learning_rate": 3.95598878801824e-06, + "loss": 0.6359, + "step": 19176 + }, + { + "epoch": 2.41, + "grad_norm": 3.160224199295044, + "learning_rate": 3.955152072961553e-06, + "loss": 0.1297, + "step": 19177 + }, + { + "epoch": 2.41, + "grad_norm": 7.111017227172852, + "learning_rate": 3.954315357904866e-06, + "loss": 0.3905, + "step": 19178 + }, + { + "epoch": 2.41, + "grad_norm": 15.66545295715332, + "learning_rate": 3.953478642848179e-06, + "loss": 0.825, + "step": 19179 + }, + { + "epoch": 2.41, + "grad_norm": 27.49559211730957, + "learning_rate": 3.952641927791491e-06, + "loss": 0.533, + "step": 19180 + }, + { + "epoch": 2.41, + "grad_norm": 22.51540756225586, + "learning_rate": 3.9518052127348035e-06, + "loss": 1.4591, + "step": 19181 + }, + { + "epoch": 2.41, + "grad_norm": 23.426937103271484, + "learning_rate": 3.950968497678116e-06, + "loss": 1.9108, + "step": 19182 + }, + { + "epoch": 2.41, + "grad_norm": 27.181591033935547, + "learning_rate": 3.950131782621428e-06, + "loss": 1.0922, + "step": 19183 + }, + { + "epoch": 2.41, + "grad_norm": 7.732165813446045, + "learning_rate": 3.949295067564741e-06, + "loss": 0.2493, + "step": 19184 + }, + { + "epoch": 2.41, + "grad_norm": 10.207642555236816, + "learning_rate": 3.948458352508054e-06, + "loss": 1.5212, + "step": 19185 + }, + { + "epoch": 2.41, + "grad_norm": 9.41203498840332, + "learning_rate": 3.947621637451367e-06, + "loss": 1.6123, + "step": 19186 + }, + { + "epoch": 2.41, + "grad_norm": 9.990777015686035, + "learning_rate": 3.946784922394679e-06, + "loss": 0.4817, + "step": 19187 + }, + { + "epoch": 2.41, + "grad_norm": 4.777212142944336, + "learning_rate": 3.945948207337991e-06, + "loss": 0.3308, + "step": 19188 + }, + { + "epoch": 2.41, + "grad_norm": 16.02861976623535, + "learning_rate": 3.945111492281304e-06, + "loss": 0.4803, + "step": 19189 + }, + { + "epoch": 2.41, + "grad_norm": 9.46855354309082, + "learning_rate": 3.944274777224616e-06, + "loss": 0.8299, + "step": 19190 + }, + { + "epoch": 2.41, + "grad_norm": 1.9646720886230469, + "learning_rate": 3.943438062167929e-06, + "loss": 0.073, + "step": 19191 + }, + { + "epoch": 2.41, + "grad_norm": 10.14421272277832, + "learning_rate": 3.942601347111242e-06, + "loss": 0.4138, + "step": 19192 + }, + { + "epoch": 2.41, + "grad_norm": 8.660815238952637, + "learning_rate": 3.9417646320545546e-06, + "loss": 0.7271, + "step": 19193 + }, + { + "epoch": 2.41, + "grad_norm": 18.654617309570312, + "learning_rate": 3.9409279169978665e-06, + "loss": 1.6773, + "step": 19194 + }, + { + "epoch": 2.41, + "grad_norm": 120.55852508544922, + "learning_rate": 3.940091201941179e-06, + "loss": 2.4435, + "step": 19195 + }, + { + "epoch": 2.41, + "grad_norm": 6.149465084075928, + "learning_rate": 3.939254486884492e-06, + "loss": 0.2222, + "step": 19196 + }, + { + "epoch": 2.41, + "grad_norm": 23.639225006103516, + "learning_rate": 3.938417771827804e-06, + "loss": 0.5304, + "step": 19197 + }, + { + "epoch": 2.41, + "grad_norm": 13.503562927246094, + "learning_rate": 3.937581056771117e-06, + "loss": 0.5163, + "step": 19198 + }, + { + "epoch": 2.41, + "grad_norm": 39.53706741333008, + "learning_rate": 3.93674434171443e-06, + "loss": 1.2687, + "step": 19199 + }, + { + "epoch": 2.41, + "grad_norm": 6.831623554229736, + "learning_rate": 3.9359076266577425e-06, + "loss": 1.3298, + "step": 19200 + }, + { + "epoch": 2.41, + "eval_loss": 0.07908567786216736, + "eval_runtime": 96.7748, + "eval_samples_per_second": 36.6, + "eval_steps_per_second": 36.6, + "step": 19200 + }, + { + "epoch": 2.41, + "grad_norm": 117.7660140991211, + "learning_rate": 3.9350709116010544e-06, + "loss": 1.8972, + "step": 19201 + }, + { + "epoch": 2.41, + "grad_norm": 17.90907859802246, + "learning_rate": 3.934234196544367e-06, + "loss": 0.7705, + "step": 19202 + }, + { + "epoch": 2.41, + "grad_norm": 11.033977508544922, + "learning_rate": 3.93339748148768e-06, + "loss": 0.4761, + "step": 19203 + }, + { + "epoch": 2.41, + "grad_norm": 6.191403865814209, + "learning_rate": 3.932560766430992e-06, + "loss": 0.3172, + "step": 19204 + }, + { + "epoch": 2.41, + "grad_norm": 18.340211868286133, + "learning_rate": 3.931724051374305e-06, + "loss": 0.963, + "step": 19205 + }, + { + "epoch": 2.41, + "grad_norm": 16.469741821289062, + "learning_rate": 3.930887336317618e-06, + "loss": 1.8976, + "step": 19206 + }, + { + "epoch": 2.41, + "grad_norm": 10.327061653137207, + "learning_rate": 3.93005062126093e-06, + "loss": 0.8535, + "step": 19207 + }, + { + "epoch": 2.41, + "grad_norm": 20.400331497192383, + "learning_rate": 3.929213906204242e-06, + "loss": 0.5511, + "step": 19208 + }, + { + "epoch": 2.41, + "grad_norm": 8.918400764465332, + "learning_rate": 3.928377191147555e-06, + "loss": 0.5542, + "step": 19209 + }, + { + "epoch": 2.41, + "grad_norm": 16.06703758239746, + "learning_rate": 3.927540476090867e-06, + "loss": 1.9891, + "step": 19210 + }, + { + "epoch": 2.41, + "grad_norm": 42.195884704589844, + "learning_rate": 3.92670376103418e-06, + "loss": 1.3129, + "step": 19211 + }, + { + "epoch": 2.41, + "grad_norm": 30.93027114868164, + "learning_rate": 3.925867045977493e-06, + "loss": 1.1214, + "step": 19212 + }, + { + "epoch": 2.41, + "grad_norm": 2.7529571056365967, + "learning_rate": 3.9250303309208055e-06, + "loss": 0.1369, + "step": 19213 + }, + { + "epoch": 2.41, + "grad_norm": 6.6867828369140625, + "learning_rate": 3.924193615864118e-06, + "loss": 0.6473, + "step": 19214 + }, + { + "epoch": 2.41, + "grad_norm": 7.075080394744873, + "learning_rate": 3.92335690080743e-06, + "loss": 0.2434, + "step": 19215 + }, + { + "epoch": 2.41, + "grad_norm": 18.304428100585938, + "learning_rate": 3.922520185750743e-06, + "loss": 1.6788, + "step": 19216 + }, + { + "epoch": 2.41, + "grad_norm": 7.171009540557861, + "learning_rate": 3.921683470694055e-06, + "loss": 0.171, + "step": 19217 + }, + { + "epoch": 2.41, + "grad_norm": 13.496152877807617, + "learning_rate": 3.920846755637368e-06, + "loss": 1.196, + "step": 19218 + }, + { + "epoch": 2.41, + "grad_norm": 14.460394859313965, + "learning_rate": 3.920010040580681e-06, + "loss": 1.0907, + "step": 19219 + }, + { + "epoch": 2.41, + "grad_norm": 16.681568145751953, + "learning_rate": 3.9191733255239934e-06, + "loss": 0.7428, + "step": 19220 + }, + { + "epoch": 2.41, + "grad_norm": 14.806492805480957, + "learning_rate": 3.918336610467306e-06, + "loss": 1.4748, + "step": 19221 + }, + { + "epoch": 2.41, + "grad_norm": 5.13444709777832, + "learning_rate": 3.917499895410618e-06, + "loss": 1.0966, + "step": 19222 + }, + { + "epoch": 2.41, + "grad_norm": 10.014902114868164, + "learning_rate": 3.916663180353931e-06, + "loss": 0.2904, + "step": 19223 + }, + { + "epoch": 2.41, + "grad_norm": 19.748510360717773, + "learning_rate": 3.915826465297243e-06, + "loss": 2.6818, + "step": 19224 + }, + { + "epoch": 2.41, + "grad_norm": 15.84622859954834, + "learning_rate": 3.914989750240556e-06, + "loss": 1.2931, + "step": 19225 + }, + { + "epoch": 2.41, + "grad_norm": 12.150506019592285, + "learning_rate": 3.9141530351838685e-06, + "loss": 0.8015, + "step": 19226 + }, + { + "epoch": 2.41, + "grad_norm": 11.955732345581055, + "learning_rate": 3.913316320127181e-06, + "loss": 0.9369, + "step": 19227 + }, + { + "epoch": 2.41, + "grad_norm": 12.21967887878418, + "learning_rate": 3.912479605070493e-06, + "loss": 0.6531, + "step": 19228 + }, + { + "epoch": 2.41, + "grad_norm": 24.59291648864746, + "learning_rate": 3.911642890013806e-06, + "loss": 1.1242, + "step": 19229 + }, + { + "epoch": 2.41, + "grad_norm": 33.7893180847168, + "learning_rate": 3.910806174957119e-06, + "loss": 1.9844, + "step": 19230 + }, + { + "epoch": 2.41, + "grad_norm": 32.203125, + "learning_rate": 3.909969459900431e-06, + "loss": 1.6182, + "step": 19231 + }, + { + "epoch": 2.41, + "grad_norm": 18.622575759887695, + "learning_rate": 3.909132744843744e-06, + "loss": 0.869, + "step": 19232 + }, + { + "epoch": 2.41, + "grad_norm": 14.749338150024414, + "learning_rate": 3.9082960297870565e-06, + "loss": 0.7788, + "step": 19233 + }, + { + "epoch": 2.41, + "grad_norm": 14.998071670532227, + "learning_rate": 3.907459314730369e-06, + "loss": 1.0434, + "step": 19234 + }, + { + "epoch": 2.41, + "grad_norm": 148.5703887939453, + "learning_rate": 3.906622599673681e-06, + "loss": 1.4984, + "step": 19235 + }, + { + "epoch": 2.41, + "grad_norm": 7.36978006362915, + "learning_rate": 3.905785884616994e-06, + "loss": 0.4504, + "step": 19236 + }, + { + "epoch": 2.41, + "grad_norm": 8.224552154541016, + "learning_rate": 3.904949169560307e-06, + "loss": 0.4879, + "step": 19237 + }, + { + "epoch": 2.41, + "grad_norm": 32.19786834716797, + "learning_rate": 3.904112454503619e-06, + "loss": 0.5529, + "step": 19238 + }, + { + "epoch": 2.41, + "grad_norm": 40.72687530517578, + "learning_rate": 3.903275739446932e-06, + "loss": 1.2672, + "step": 19239 + }, + { + "epoch": 2.41, + "grad_norm": 22.027299880981445, + "learning_rate": 3.902439024390244e-06, + "loss": 1.2204, + "step": 19240 + }, + { + "epoch": 2.41, + "grad_norm": 60.05638885498047, + "learning_rate": 3.901602309333557e-06, + "loss": 1.807, + "step": 19241 + }, + { + "epoch": 2.41, + "grad_norm": 18.618318557739258, + "learning_rate": 3.900765594276869e-06, + "loss": 1.3002, + "step": 19242 + }, + { + "epoch": 2.41, + "grad_norm": 11.675912857055664, + "learning_rate": 3.899928879220182e-06, + "loss": 0.534, + "step": 19243 + }, + { + "epoch": 2.42, + "grad_norm": 21.279949188232422, + "learning_rate": 3.899092164163495e-06, + "loss": 0.5592, + "step": 19244 + }, + { + "epoch": 2.42, + "grad_norm": 105.58536529541016, + "learning_rate": 3.898255449106807e-06, + "loss": 1.5623, + "step": 19245 + }, + { + "epoch": 2.42, + "grad_norm": 30.80086326599121, + "learning_rate": 3.8974187340501195e-06, + "loss": 1.1525, + "step": 19246 + }, + { + "epoch": 2.42, + "grad_norm": 6.376068115234375, + "learning_rate": 3.896582018993432e-06, + "loss": 1.0781, + "step": 19247 + }, + { + "epoch": 2.42, + "grad_norm": 11.469795227050781, + "learning_rate": 3.895745303936745e-06, + "loss": 0.9443, + "step": 19248 + }, + { + "epoch": 2.42, + "grad_norm": 11.311722755432129, + "learning_rate": 3.894908588880057e-06, + "loss": 1.2709, + "step": 19249 + }, + { + "epoch": 2.42, + "grad_norm": 8.450668334960938, + "learning_rate": 3.89407187382337e-06, + "loss": 1.0794, + "step": 19250 + }, + { + "epoch": 2.42, + "grad_norm": 33.71548080444336, + "learning_rate": 3.893235158766682e-06, + "loss": 1.5056, + "step": 19251 + }, + { + "epoch": 2.42, + "grad_norm": 115.70248413085938, + "learning_rate": 3.892398443709995e-06, + "loss": 0.8059, + "step": 19252 + }, + { + "epoch": 2.42, + "grad_norm": 13.588934898376465, + "learning_rate": 3.891561728653307e-06, + "loss": 2.1443, + "step": 19253 + }, + { + "epoch": 2.42, + "grad_norm": 20.4144344329834, + "learning_rate": 3.89072501359662e-06, + "loss": 2.0834, + "step": 19254 + }, + { + "epoch": 2.42, + "grad_norm": 13.07769775390625, + "learning_rate": 3.889888298539933e-06, + "loss": 0.9905, + "step": 19255 + }, + { + "epoch": 2.42, + "grad_norm": 59.85422134399414, + "learning_rate": 3.889051583483245e-06, + "loss": 1.6588, + "step": 19256 + }, + { + "epoch": 2.42, + "grad_norm": 17.695573806762695, + "learning_rate": 3.888214868426558e-06, + "loss": 1.8516, + "step": 19257 + }, + { + "epoch": 2.42, + "grad_norm": 3.559680461883545, + "learning_rate": 3.88737815336987e-06, + "loss": 1.1526, + "step": 19258 + }, + { + "epoch": 2.42, + "grad_norm": 11.505620956420898, + "learning_rate": 3.8865414383131825e-06, + "loss": 0.9919, + "step": 19259 + }, + { + "epoch": 2.42, + "grad_norm": 9.828024864196777, + "learning_rate": 3.885704723256495e-06, + "loss": 0.8659, + "step": 19260 + }, + { + "epoch": 2.42, + "grad_norm": 10.574488639831543, + "learning_rate": 3.884868008199808e-06, + "loss": 1.7149, + "step": 19261 + }, + { + "epoch": 2.42, + "grad_norm": 8.784329414367676, + "learning_rate": 3.884031293143121e-06, + "loss": 0.2339, + "step": 19262 + }, + { + "epoch": 2.42, + "grad_norm": 31.123836517333984, + "learning_rate": 3.883194578086433e-06, + "loss": 1.7855, + "step": 19263 + }, + { + "epoch": 2.42, + "grad_norm": 26.213031768798828, + "learning_rate": 3.882357863029746e-06, + "loss": 1.5882, + "step": 19264 + }, + { + "epoch": 2.42, + "grad_norm": 10.312332153320312, + "learning_rate": 3.881521147973058e-06, + "loss": 0.5542, + "step": 19265 + }, + { + "epoch": 2.42, + "grad_norm": 10.933004379272461, + "learning_rate": 3.8806844329163705e-06, + "loss": 0.526, + "step": 19266 + }, + { + "epoch": 2.42, + "grad_norm": 9.574731826782227, + "learning_rate": 3.879847717859683e-06, + "loss": 0.3757, + "step": 19267 + }, + { + "epoch": 2.42, + "grad_norm": 10.87021255493164, + "learning_rate": 3.879011002802996e-06, + "loss": 2.3607, + "step": 19268 + }, + { + "epoch": 2.42, + "grad_norm": 6.682435989379883, + "learning_rate": 3.878174287746309e-06, + "loss": 1.4867, + "step": 19269 + }, + { + "epoch": 2.42, + "grad_norm": 380.7222900390625, + "learning_rate": 3.877337572689621e-06, + "loss": 1.6735, + "step": 19270 + }, + { + "epoch": 2.42, + "grad_norm": 17.722379684448242, + "learning_rate": 3.876500857632934e-06, + "loss": 1.0548, + "step": 19271 + }, + { + "epoch": 2.42, + "grad_norm": 8.213561058044434, + "learning_rate": 3.8756641425762456e-06, + "loss": 0.6286, + "step": 19272 + }, + { + "epoch": 2.42, + "grad_norm": 10.587211608886719, + "learning_rate": 3.874827427519558e-06, + "loss": 0.4255, + "step": 19273 + }, + { + "epoch": 2.42, + "grad_norm": 21.14800262451172, + "learning_rate": 3.873990712462871e-06, + "loss": 0.9089, + "step": 19274 + }, + { + "epoch": 2.42, + "grad_norm": 26.88003921508789, + "learning_rate": 3.873153997406184e-06, + "loss": 2.877, + "step": 19275 + }, + { + "epoch": 2.42, + "grad_norm": 5.936546325683594, + "learning_rate": 3.872317282349497e-06, + "loss": 0.4939, + "step": 19276 + }, + { + "epoch": 2.42, + "grad_norm": 18.24165153503418, + "learning_rate": 3.871480567292809e-06, + "loss": 0.6357, + "step": 19277 + }, + { + "epoch": 2.42, + "grad_norm": 10.405789375305176, + "learning_rate": 3.8706438522361215e-06, + "loss": 1.3881, + "step": 19278 + }, + { + "epoch": 2.42, + "grad_norm": 8.698683738708496, + "learning_rate": 3.8698071371794335e-06, + "loss": 0.9599, + "step": 19279 + }, + { + "epoch": 2.42, + "grad_norm": 4.385855674743652, + "learning_rate": 3.868970422122746e-06, + "loss": 0.2265, + "step": 19280 + }, + { + "epoch": 2.42, + "grad_norm": 10.598146438598633, + "learning_rate": 3.868133707066059e-06, + "loss": 1.0209, + "step": 19281 + }, + { + "epoch": 2.42, + "grad_norm": 6.441844463348389, + "learning_rate": 3.867296992009372e-06, + "loss": 0.7301, + "step": 19282 + }, + { + "epoch": 2.42, + "grad_norm": 15.066563606262207, + "learning_rate": 3.866460276952685e-06, + "loss": 0.4305, + "step": 19283 + }, + { + "epoch": 2.42, + "grad_norm": 26.297943115234375, + "learning_rate": 3.865623561895997e-06, + "loss": 1.183, + "step": 19284 + }, + { + "epoch": 2.42, + "grad_norm": 2.300495147705078, + "learning_rate": 3.8647868468393095e-06, + "loss": 0.1023, + "step": 19285 + }, + { + "epoch": 2.42, + "grad_norm": 17.563278198242188, + "learning_rate": 3.863950131782621e-06, + "loss": 1.1672, + "step": 19286 + }, + { + "epoch": 2.42, + "grad_norm": 29.381669998168945, + "learning_rate": 3.863113416725934e-06, + "loss": 0.734, + "step": 19287 + }, + { + "epoch": 2.42, + "grad_norm": 42.8314094543457, + "learning_rate": 3.862276701669247e-06, + "loss": 1.2531, + "step": 19288 + }, + { + "epoch": 2.42, + "grad_norm": 11.0601167678833, + "learning_rate": 3.86143998661256e-06, + "loss": 0.5414, + "step": 19289 + }, + { + "epoch": 2.42, + "grad_norm": 39.98393249511719, + "learning_rate": 3.860603271555873e-06, + "loss": 1.9857, + "step": 19290 + }, + { + "epoch": 2.42, + "grad_norm": 157.44747924804688, + "learning_rate": 3.8597665564991846e-06, + "loss": 1.1885, + "step": 19291 + }, + { + "epoch": 2.42, + "grad_norm": 11.619778633117676, + "learning_rate": 3.8589298414424965e-06, + "loss": 1.5894, + "step": 19292 + }, + { + "epoch": 2.42, + "grad_norm": 14.429756164550781, + "learning_rate": 3.858093126385809e-06, + "loss": 0.9863, + "step": 19293 + }, + { + "epoch": 2.42, + "grad_norm": 22.345094680786133, + "learning_rate": 3.857256411329122e-06, + "loss": 1.3794, + "step": 19294 + }, + { + "epoch": 2.42, + "grad_norm": 5.763068199157715, + "learning_rate": 3.856419696272435e-06, + "loss": 1.5051, + "step": 19295 + }, + { + "epoch": 2.42, + "grad_norm": 29.449905395507812, + "learning_rate": 3.855582981215748e-06, + "loss": 1.2129, + "step": 19296 + }, + { + "epoch": 2.42, + "grad_norm": 16.893463134765625, + "learning_rate": 3.85474626615906e-06, + "loss": 0.7417, + "step": 19297 + }, + { + "epoch": 2.42, + "grad_norm": 5.6840009689331055, + "learning_rate": 3.8539095511023725e-06, + "loss": 0.2945, + "step": 19298 + }, + { + "epoch": 2.42, + "grad_norm": 55.94939041137695, + "learning_rate": 3.8530728360456844e-06, + "loss": 0.2093, + "step": 19299 + }, + { + "epoch": 2.42, + "grad_norm": 188.28958129882812, + "learning_rate": 3.852236120988997e-06, + "loss": 1.5405, + "step": 19300 + }, + { + "epoch": 2.42, + "grad_norm": 6.573770046234131, + "learning_rate": 3.85139940593231e-06, + "loss": 0.0694, + "step": 19301 + }, + { + "epoch": 2.42, + "grad_norm": 17.996570587158203, + "learning_rate": 3.850562690875623e-06, + "loss": 1.7903, + "step": 19302 + }, + { + "epoch": 2.42, + "grad_norm": 6.312846660614014, + "learning_rate": 3.849725975818936e-06, + "loss": 0.1564, + "step": 19303 + }, + { + "epoch": 2.42, + "grad_norm": 16.640140533447266, + "learning_rate": 3.848889260762248e-06, + "loss": 1.5704, + "step": 19304 + }, + { + "epoch": 2.42, + "grad_norm": 18.343538284301758, + "learning_rate": 3.84805254570556e-06, + "loss": 1.6799, + "step": 19305 + }, + { + "epoch": 2.42, + "grad_norm": 44.953529357910156, + "learning_rate": 3.847215830648872e-06, + "loss": 0.5269, + "step": 19306 + }, + { + "epoch": 2.42, + "grad_norm": 18.762069702148438, + "learning_rate": 3.846379115592185e-06, + "loss": 0.8328, + "step": 19307 + }, + { + "epoch": 2.42, + "grad_norm": 13.904192924499512, + "learning_rate": 3.845542400535498e-06, + "loss": 0.6432, + "step": 19308 + }, + { + "epoch": 2.42, + "grad_norm": 20.194324493408203, + "learning_rate": 3.844705685478811e-06, + "loss": 0.8736, + "step": 19309 + }, + { + "epoch": 2.42, + "grad_norm": 23.708715438842773, + "learning_rate": 3.8438689704221236e-06, + "loss": 0.9779, + "step": 19310 + }, + { + "epoch": 2.42, + "grad_norm": 10.490565299987793, + "learning_rate": 3.8430322553654355e-06, + "loss": 0.8575, + "step": 19311 + }, + { + "epoch": 2.42, + "grad_norm": 9.342899322509766, + "learning_rate": 3.842195540308748e-06, + "loss": 0.2516, + "step": 19312 + }, + { + "epoch": 2.42, + "grad_norm": 6.65676212310791, + "learning_rate": 3.84135882525206e-06, + "loss": 0.4363, + "step": 19313 + }, + { + "epoch": 2.42, + "grad_norm": 19.738954544067383, + "learning_rate": 3.840522110195373e-06, + "loss": 0.7261, + "step": 19314 + }, + { + "epoch": 2.42, + "grad_norm": 9.836162567138672, + "learning_rate": 3.839685395138686e-06, + "loss": 0.5298, + "step": 19315 + }, + { + "epoch": 2.42, + "grad_norm": 4.317717552185059, + "learning_rate": 3.838848680081999e-06, + "loss": 0.46, + "step": 19316 + }, + { + "epoch": 2.42, + "grad_norm": 12.451818466186523, + "learning_rate": 3.8380119650253115e-06, + "loss": 0.8643, + "step": 19317 + }, + { + "epoch": 2.42, + "grad_norm": 11.131498336791992, + "learning_rate": 3.8371752499686234e-06, + "loss": 0.6231, + "step": 19318 + }, + { + "epoch": 2.42, + "grad_norm": 6.2766523361206055, + "learning_rate": 3.836338534911936e-06, + "loss": 1.1419, + "step": 19319 + }, + { + "epoch": 2.42, + "grad_norm": 12.388383865356445, + "learning_rate": 3.835501819855248e-06, + "loss": 0.3583, + "step": 19320 + }, + { + "epoch": 2.42, + "grad_norm": 21.208301544189453, + "learning_rate": 3.834665104798561e-06, + "loss": 0.4861, + "step": 19321 + }, + { + "epoch": 2.42, + "grad_norm": 16.611286163330078, + "learning_rate": 3.833828389741874e-06, + "loss": 0.612, + "step": 19322 + }, + { + "epoch": 2.42, + "grad_norm": 11.50299072265625, + "learning_rate": 3.832991674685187e-06, + "loss": 1.377, + "step": 19323 + }, + { + "epoch": 2.43, + "grad_norm": 12.140826225280762, + "learning_rate": 3.832154959628499e-06, + "loss": 0.5706, + "step": 19324 + }, + { + "epoch": 2.43, + "grad_norm": 20.794418334960938, + "learning_rate": 3.831318244571811e-06, + "loss": 1.0795, + "step": 19325 + }, + { + "epoch": 2.43, + "grad_norm": 27.155277252197266, + "learning_rate": 3.830481529515124e-06, + "loss": 1.8459, + "step": 19326 + }, + { + "epoch": 2.43, + "grad_norm": 25.26287269592285, + "learning_rate": 3.829644814458436e-06, + "loss": 1.5525, + "step": 19327 + }, + { + "epoch": 2.43, + "grad_norm": 6.756760597229004, + "learning_rate": 3.828808099401749e-06, + "loss": 0.44, + "step": 19328 + }, + { + "epoch": 2.43, + "grad_norm": 10.38017463684082, + "learning_rate": 3.827971384345062e-06, + "loss": 0.3319, + "step": 19329 + }, + { + "epoch": 2.43, + "grad_norm": 16.2598934173584, + "learning_rate": 3.8271346692883745e-06, + "loss": 1.7218, + "step": 19330 + }, + { + "epoch": 2.43, + "grad_norm": 12.016488075256348, + "learning_rate": 3.826297954231687e-06, + "loss": 0.7483, + "step": 19331 + }, + { + "epoch": 2.43, + "grad_norm": 8.901429176330566, + "learning_rate": 3.825461239174999e-06, + "loss": 1.7516, + "step": 19332 + }, + { + "epoch": 2.43, + "grad_norm": 25.796611785888672, + "learning_rate": 3.824624524118311e-06, + "loss": 0.5186, + "step": 19333 + }, + { + "epoch": 2.43, + "grad_norm": 112.78844451904297, + "learning_rate": 3.823787809061624e-06, + "loss": 0.7428, + "step": 19334 + }, + { + "epoch": 2.43, + "grad_norm": 48.96705627441406, + "learning_rate": 3.822951094004937e-06, + "loss": 1.0306, + "step": 19335 + }, + { + "epoch": 2.43, + "grad_norm": 12.96463394165039, + "learning_rate": 3.82211437894825e-06, + "loss": 2.505, + "step": 19336 + }, + { + "epoch": 2.43, + "grad_norm": 25.676706314086914, + "learning_rate": 3.8212776638915624e-06, + "loss": 0.8836, + "step": 19337 + }, + { + "epoch": 2.43, + "grad_norm": 6.562462329864502, + "learning_rate": 3.820440948834874e-06, + "loss": 0.2133, + "step": 19338 + }, + { + "epoch": 2.43, + "grad_norm": 39.64939880371094, + "learning_rate": 3.819604233778187e-06, + "loss": 0.7905, + "step": 19339 + }, + { + "epoch": 2.43, + "grad_norm": 18.159381866455078, + "learning_rate": 3.818767518721499e-06, + "loss": 0.7596, + "step": 19340 + }, + { + "epoch": 2.43, + "grad_norm": 6.936798572540283, + "learning_rate": 3.817930803664812e-06, + "loss": 0.2851, + "step": 19341 + }, + { + "epoch": 2.43, + "grad_norm": 14.394283294677734, + "learning_rate": 3.817094088608125e-06, + "loss": 1.9691, + "step": 19342 + }, + { + "epoch": 2.43, + "grad_norm": 11.488725662231445, + "learning_rate": 3.8162573735514376e-06, + "loss": 1.0727, + "step": 19343 + }, + { + "epoch": 2.43, + "grad_norm": 6.755542755126953, + "learning_rate": 3.81542065849475e-06, + "loss": 0.238, + "step": 19344 + }, + { + "epoch": 2.43, + "grad_norm": 15.270042419433594, + "learning_rate": 3.8145839434380627e-06, + "loss": 0.4638, + "step": 19345 + }, + { + "epoch": 2.43, + "grad_norm": 22.63208770751953, + "learning_rate": 3.8137472283813747e-06, + "loss": 1.8979, + "step": 19346 + }, + { + "epoch": 2.43, + "grad_norm": 45.051815032958984, + "learning_rate": 3.8129105133246875e-06, + "loss": 1.3845, + "step": 19347 + }, + { + "epoch": 2.43, + "grad_norm": 1.2802832126617432, + "learning_rate": 3.812073798268e-06, + "loss": 0.0313, + "step": 19348 + }, + { + "epoch": 2.43, + "grad_norm": 17.48478126525879, + "learning_rate": 3.8112370832113127e-06, + "loss": 0.9503, + "step": 19349 + }, + { + "epoch": 2.43, + "grad_norm": 10.073248863220215, + "learning_rate": 3.8104003681546255e-06, + "loss": 0.4555, + "step": 19350 + }, + { + "epoch": 2.43, + "grad_norm": 30.915050506591797, + "learning_rate": 3.809563653097938e-06, + "loss": 1.0442, + "step": 19351 + }, + { + "epoch": 2.43, + "grad_norm": 10.166717529296875, + "learning_rate": 3.8087269380412507e-06, + "loss": 1.2188, + "step": 19352 + }, + { + "epoch": 2.43, + "grad_norm": 11.059699058532715, + "learning_rate": 3.8078902229845626e-06, + "loss": 0.0837, + "step": 19353 + }, + { + "epoch": 2.43, + "grad_norm": 21.718486785888672, + "learning_rate": 3.8070535079278754e-06, + "loss": 2.2156, + "step": 19354 + }, + { + "epoch": 2.43, + "grad_norm": 16.999832153320312, + "learning_rate": 3.806216792871188e-06, + "loss": 0.7584, + "step": 19355 + }, + { + "epoch": 2.43, + "grad_norm": 11.521601676940918, + "learning_rate": 3.8053800778145006e-06, + "loss": 1.0624, + "step": 19356 + }, + { + "epoch": 2.43, + "grad_norm": 16.258691787719727, + "learning_rate": 3.8045433627578134e-06, + "loss": 0.4942, + "step": 19357 + }, + { + "epoch": 2.43, + "grad_norm": 8.650497436523438, + "learning_rate": 3.8037066477011258e-06, + "loss": 1.1155, + "step": 19358 + }, + { + "epoch": 2.43, + "grad_norm": 10.786202430725098, + "learning_rate": 3.8028699326444386e-06, + "loss": 0.5909, + "step": 19359 + }, + { + "epoch": 2.43, + "grad_norm": 33.60354232788086, + "learning_rate": 3.8020332175877505e-06, + "loss": 0.3585, + "step": 19360 + }, + { + "epoch": 2.43, + "grad_norm": 6.411097526550293, + "learning_rate": 3.8011965025310633e-06, + "loss": 0.22, + "step": 19361 + }, + { + "epoch": 2.43, + "grad_norm": 9.783259391784668, + "learning_rate": 3.8003597874743757e-06, + "loss": 0.9092, + "step": 19362 + }, + { + "epoch": 2.43, + "grad_norm": 8.718094825744629, + "learning_rate": 3.7995230724176885e-06, + "loss": 0.5791, + "step": 19363 + }, + { + "epoch": 2.43, + "grad_norm": 24.100753784179688, + "learning_rate": 3.7986863573610013e-06, + "loss": 0.7816, + "step": 19364 + }, + { + "epoch": 2.43, + "grad_norm": 25.01043701171875, + "learning_rate": 3.7978496423043137e-06, + "loss": 2.6373, + "step": 19365 + }, + { + "epoch": 2.43, + "grad_norm": 14.57491683959961, + "learning_rate": 3.7970129272476265e-06, + "loss": 0.3222, + "step": 19366 + }, + { + "epoch": 2.43, + "grad_norm": 6.600212097167969, + "learning_rate": 3.7961762121909385e-06, + "loss": 0.1352, + "step": 19367 + }, + { + "epoch": 2.43, + "grad_norm": 14.061747550964355, + "learning_rate": 3.7953394971342513e-06, + "loss": 0.5259, + "step": 19368 + }, + { + "epoch": 2.43, + "grad_norm": 35.42488098144531, + "learning_rate": 3.7945027820775636e-06, + "loss": 1.9063, + "step": 19369 + }, + { + "epoch": 2.43, + "grad_norm": 17.88988494873047, + "learning_rate": 3.7936660670208764e-06, + "loss": 1.4857, + "step": 19370 + }, + { + "epoch": 2.43, + "grad_norm": 12.470402717590332, + "learning_rate": 3.792829351964189e-06, + "loss": 0.6173, + "step": 19371 + }, + { + "epoch": 2.43, + "grad_norm": 3.856550931930542, + "learning_rate": 3.7919926369075016e-06, + "loss": 0.0938, + "step": 19372 + }, + { + "epoch": 2.43, + "grad_norm": 12.61774730682373, + "learning_rate": 3.7911559218508144e-06, + "loss": 0.1798, + "step": 19373 + }, + { + "epoch": 2.43, + "grad_norm": 181.10760498046875, + "learning_rate": 3.7903192067941264e-06, + "loss": 2.1395, + "step": 19374 + }, + { + "epoch": 2.43, + "grad_norm": 44.0191650390625, + "learning_rate": 3.789482491737439e-06, + "loss": 1.591, + "step": 19375 + }, + { + "epoch": 2.43, + "grad_norm": 87.66629028320312, + "learning_rate": 3.7886457766807515e-06, + "loss": 0.7762, + "step": 19376 + }, + { + "epoch": 2.43, + "grad_norm": 11.412515640258789, + "learning_rate": 3.7878090616240644e-06, + "loss": 0.5234, + "step": 19377 + }, + { + "epoch": 2.43, + "grad_norm": 43.73072814941406, + "learning_rate": 3.7869723465673767e-06, + "loss": 2.9941, + "step": 19378 + }, + { + "epoch": 2.43, + "grad_norm": 12.963578224182129, + "learning_rate": 3.7861356315106895e-06, + "loss": 0.8023, + "step": 19379 + }, + { + "epoch": 2.43, + "grad_norm": 10.066496849060059, + "learning_rate": 3.7852989164540023e-06, + "loss": 1.0838, + "step": 19380 + }, + { + "epoch": 2.43, + "grad_norm": 2.953787088394165, + "learning_rate": 3.7844622013973143e-06, + "loss": 0.2104, + "step": 19381 + }, + { + "epoch": 2.43, + "grad_norm": 20.860986709594727, + "learning_rate": 3.7836254863406267e-06, + "loss": 1.8658, + "step": 19382 + }, + { + "epoch": 2.43, + "grad_norm": 17.35547637939453, + "learning_rate": 3.7827887712839395e-06, + "loss": 0.6201, + "step": 19383 + }, + { + "epoch": 2.43, + "grad_norm": 9.459033012390137, + "learning_rate": 3.7819520562272523e-06, + "loss": 0.2003, + "step": 19384 + }, + { + "epoch": 2.43, + "grad_norm": 15.11513614654541, + "learning_rate": 3.7811153411705646e-06, + "loss": 0.4576, + "step": 19385 + }, + { + "epoch": 2.43, + "grad_norm": 3.2749271392822266, + "learning_rate": 3.7802786261138775e-06, + "loss": 0.0915, + "step": 19386 + }, + { + "epoch": 2.43, + "grad_norm": 8.152831077575684, + "learning_rate": 3.7794419110571903e-06, + "loss": 0.7088, + "step": 19387 + }, + { + "epoch": 2.43, + "grad_norm": 7.579389572143555, + "learning_rate": 3.778605196000502e-06, + "loss": 0.7091, + "step": 19388 + }, + { + "epoch": 2.43, + "grad_norm": 10.391526222229004, + "learning_rate": 3.7777684809438146e-06, + "loss": 0.7642, + "step": 19389 + }, + { + "epoch": 2.43, + "grad_norm": 86.48945617675781, + "learning_rate": 3.7769317658871274e-06, + "loss": 1.9806, + "step": 19390 + }, + { + "epoch": 2.43, + "grad_norm": 10.321609497070312, + "learning_rate": 3.77609505083044e-06, + "loss": 0.9803, + "step": 19391 + }, + { + "epoch": 2.43, + "grad_norm": 8.198219299316406, + "learning_rate": 3.7752583357737526e-06, + "loss": 0.5157, + "step": 19392 + }, + { + "epoch": 2.43, + "grad_norm": 12.072665214538574, + "learning_rate": 3.7744216207170654e-06, + "loss": 1.122, + "step": 19393 + }, + { + "epoch": 2.43, + "grad_norm": 16.971315383911133, + "learning_rate": 3.7735849056603777e-06, + "loss": 0.9175, + "step": 19394 + }, + { + "epoch": 2.43, + "grad_norm": 6.218404769897461, + "learning_rate": 3.77274819060369e-06, + "loss": 0.4567, + "step": 19395 + }, + { + "epoch": 2.43, + "grad_norm": 11.903154373168945, + "learning_rate": 3.7719114755470025e-06, + "loss": 1.9367, + "step": 19396 + }, + { + "epoch": 2.43, + "grad_norm": 35.32255172729492, + "learning_rate": 3.7710747604903153e-06, + "loss": 0.7885, + "step": 19397 + }, + { + "epoch": 2.43, + "grad_norm": 10.045111656188965, + "learning_rate": 3.770238045433628e-06, + "loss": 0.3523, + "step": 19398 + }, + { + "epoch": 2.43, + "grad_norm": 6.714310169219971, + "learning_rate": 3.7694013303769405e-06, + "loss": 0.8446, + "step": 19399 + }, + { + "epoch": 2.43, + "grad_norm": 214.91531372070312, + "learning_rate": 3.7685646153202533e-06, + "loss": 1.7278, + "step": 19400 + }, + { + "epoch": 2.43, + "grad_norm": 12.572393417358398, + "learning_rate": 3.7677279002635652e-06, + "loss": 0.8203, + "step": 19401 + }, + { + "epoch": 2.43, + "grad_norm": 42.073081970214844, + "learning_rate": 3.766891185206878e-06, + "loss": 2.2245, + "step": 19402 + }, + { + "epoch": 2.44, + "grad_norm": 5.576661586761475, + "learning_rate": 3.7660544701501904e-06, + "loss": 0.2068, + "step": 19403 + }, + { + "epoch": 2.44, + "grad_norm": 9.408468246459961, + "learning_rate": 3.7652177550935032e-06, + "loss": 0.6677, + "step": 19404 + }, + { + "epoch": 2.44, + "grad_norm": 17.082490921020508, + "learning_rate": 3.764381040036816e-06, + "loss": 1.1447, + "step": 19405 + }, + { + "epoch": 2.44, + "grad_norm": 8.84753131866455, + "learning_rate": 3.7635443249801284e-06, + "loss": 0.7181, + "step": 19406 + }, + { + "epoch": 2.44, + "grad_norm": 12.232440948486328, + "learning_rate": 3.762707609923441e-06, + "loss": 0.8382, + "step": 19407 + }, + { + "epoch": 2.44, + "grad_norm": 29.958683013916016, + "learning_rate": 3.761870894866753e-06, + "loss": 1.5797, + "step": 19408 + }, + { + "epoch": 2.44, + "grad_norm": 26.029563903808594, + "learning_rate": 3.761034179810066e-06, + "loss": 2.1999, + "step": 19409 + }, + { + "epoch": 2.44, + "grad_norm": 14.655035972595215, + "learning_rate": 3.7601974647533783e-06, + "loss": 0.9913, + "step": 19410 + }, + { + "epoch": 2.44, + "grad_norm": 7.407000541687012, + "learning_rate": 3.759360749696691e-06, + "loss": 0.831, + "step": 19411 + }, + { + "epoch": 2.44, + "grad_norm": 11.39689826965332, + "learning_rate": 3.7585240346400035e-06, + "loss": 0.4357, + "step": 19412 + }, + { + "epoch": 2.44, + "grad_norm": 21.788785934448242, + "learning_rate": 3.7576873195833163e-06, + "loss": 0.5291, + "step": 19413 + }, + { + "epoch": 2.44, + "grad_norm": 8.13565731048584, + "learning_rate": 3.756850604526629e-06, + "loss": 0.3877, + "step": 19414 + }, + { + "epoch": 2.44, + "grad_norm": 6.812713623046875, + "learning_rate": 3.756013889469941e-06, + "loss": 1.2595, + "step": 19415 + }, + { + "epoch": 2.44, + "grad_norm": 6.118829727172852, + "learning_rate": 3.755177174413254e-06, + "loss": 0.4958, + "step": 19416 + }, + { + "epoch": 2.44, + "grad_norm": 5.148907661437988, + "learning_rate": 3.7543404593565663e-06, + "loss": 0.1913, + "step": 19417 + }, + { + "epoch": 2.44, + "grad_norm": 21.72593116760254, + "learning_rate": 3.753503744299879e-06, + "loss": 0.746, + "step": 19418 + }, + { + "epoch": 2.44, + "grad_norm": 14.517721176147461, + "learning_rate": 3.7526670292431914e-06, + "loss": 0.9438, + "step": 19419 + }, + { + "epoch": 2.44, + "grad_norm": 14.233525276184082, + "learning_rate": 3.7518303141865042e-06, + "loss": 0.743, + "step": 19420 + }, + { + "epoch": 2.44, + "grad_norm": 18.300806045532227, + "learning_rate": 3.750993599129817e-06, + "loss": 0.945, + "step": 19421 + }, + { + "epoch": 2.44, + "grad_norm": 11.133574485778809, + "learning_rate": 3.750156884073129e-06, + "loss": 1.0218, + "step": 19422 + }, + { + "epoch": 2.44, + "grad_norm": 11.053126335144043, + "learning_rate": 3.7493201690164414e-06, + "loss": 2.1753, + "step": 19423 + }, + { + "epoch": 2.44, + "grad_norm": 18.2052001953125, + "learning_rate": 3.748483453959754e-06, + "loss": 2.6961, + "step": 19424 + }, + { + "epoch": 2.44, + "grad_norm": 38.304786682128906, + "learning_rate": 3.747646738903067e-06, + "loss": 0.769, + "step": 19425 + }, + { + "epoch": 2.44, + "grad_norm": 11.873785972595215, + "learning_rate": 3.7468100238463794e-06, + "loss": 0.4924, + "step": 19426 + }, + { + "epoch": 2.44, + "grad_norm": 13.371991157531738, + "learning_rate": 3.745973308789692e-06, + "loss": 0.8206, + "step": 19427 + }, + { + "epoch": 2.44, + "grad_norm": 27.54091453552246, + "learning_rate": 3.745136593733005e-06, + "loss": 1.5238, + "step": 19428 + }, + { + "epoch": 2.44, + "grad_norm": 20.18380355834961, + "learning_rate": 3.744299878676317e-06, + "loss": 1.0788, + "step": 19429 + }, + { + "epoch": 2.44, + "grad_norm": 9.077404022216797, + "learning_rate": 3.7434631636196293e-06, + "loss": 0.922, + "step": 19430 + }, + { + "epoch": 2.44, + "grad_norm": 15.031689643859863, + "learning_rate": 3.742626448562942e-06, + "loss": 0.9573, + "step": 19431 + }, + { + "epoch": 2.44, + "grad_norm": 23.64993667602539, + "learning_rate": 3.741789733506255e-06, + "loss": 1.1587, + "step": 19432 + }, + { + "epoch": 2.44, + "grad_norm": 15.048856735229492, + "learning_rate": 3.7409530184495673e-06, + "loss": 0.7992, + "step": 19433 + }, + { + "epoch": 2.44, + "grad_norm": 50.3357048034668, + "learning_rate": 3.74011630339288e-06, + "loss": 2.5979, + "step": 19434 + }, + { + "epoch": 2.44, + "grad_norm": 0.9730583429336548, + "learning_rate": 3.7392795883361925e-06, + "loss": 0.0196, + "step": 19435 + }, + { + "epoch": 2.44, + "grad_norm": 11.490036010742188, + "learning_rate": 3.738442873279505e-06, + "loss": 1.5979, + "step": 19436 + }, + { + "epoch": 2.44, + "grad_norm": 69.22692108154297, + "learning_rate": 3.7376061582228172e-06, + "loss": 4.0118, + "step": 19437 + }, + { + "epoch": 2.44, + "grad_norm": 14.819534301757812, + "learning_rate": 3.73676944316613e-06, + "loss": 1.1017, + "step": 19438 + }, + { + "epoch": 2.44, + "grad_norm": 35.04603958129883, + "learning_rate": 3.735932728109443e-06, + "loss": 1.308, + "step": 19439 + }, + { + "epoch": 2.44, + "grad_norm": 11.587259292602539, + "learning_rate": 3.735096013052755e-06, + "loss": 0.4893, + "step": 19440 + }, + { + "epoch": 2.44, + "grad_norm": 21.826257705688477, + "learning_rate": 3.734259297996068e-06, + "loss": 2.334, + "step": 19441 + }, + { + "epoch": 2.44, + "grad_norm": 24.667236328125, + "learning_rate": 3.7334225829393804e-06, + "loss": 0.9229, + "step": 19442 + }, + { + "epoch": 2.44, + "grad_norm": 23.616796493530273, + "learning_rate": 3.7325858678826928e-06, + "loss": 1.2666, + "step": 19443 + }, + { + "epoch": 2.44, + "grad_norm": 36.222328186035156, + "learning_rate": 3.731749152826005e-06, + "loss": 1.8234, + "step": 19444 + }, + { + "epoch": 2.44, + "grad_norm": 12.682394027709961, + "learning_rate": 3.730912437769318e-06, + "loss": 0.3684, + "step": 19445 + }, + { + "epoch": 2.44, + "grad_norm": 115.44532012939453, + "learning_rate": 3.7300757227126303e-06, + "loss": 2.5413, + "step": 19446 + }, + { + "epoch": 2.44, + "grad_norm": 5.9555535316467285, + "learning_rate": 3.729239007655943e-06, + "loss": 0.2132, + "step": 19447 + }, + { + "epoch": 2.44, + "grad_norm": 3.5936670303344727, + "learning_rate": 3.728402292599256e-06, + "loss": 0.3806, + "step": 19448 + }, + { + "epoch": 2.44, + "grad_norm": 40.56935501098633, + "learning_rate": 3.7275655775425683e-06, + "loss": 0.8785, + "step": 19449 + }, + { + "epoch": 2.44, + "grad_norm": 23.914955139160156, + "learning_rate": 3.7267288624858807e-06, + "loss": 1.6017, + "step": 19450 + }, + { + "epoch": 2.44, + "grad_norm": 5.980844020843506, + "learning_rate": 3.725892147429193e-06, + "loss": 0.2545, + "step": 19451 + }, + { + "epoch": 2.44, + "grad_norm": 12.170584678649902, + "learning_rate": 3.725055432372506e-06, + "loss": 0.9213, + "step": 19452 + }, + { + "epoch": 2.44, + "grad_norm": 18.933670043945312, + "learning_rate": 3.7242187173158182e-06, + "loss": 1.3134, + "step": 19453 + }, + { + "epoch": 2.44, + "grad_norm": 32.968238830566406, + "learning_rate": 3.723382002259131e-06, + "loss": 0.8626, + "step": 19454 + }, + { + "epoch": 2.44, + "grad_norm": 6.560389995574951, + "learning_rate": 3.722545287202444e-06, + "loss": 0.5033, + "step": 19455 + }, + { + "epoch": 2.44, + "grad_norm": 9.095024108886719, + "learning_rate": 3.7217085721457562e-06, + "loss": 0.5097, + "step": 19456 + }, + { + "epoch": 2.44, + "grad_norm": 7.687363147735596, + "learning_rate": 3.7208718570890686e-06, + "loss": 0.3373, + "step": 19457 + }, + { + "epoch": 2.44, + "grad_norm": 22.17128562927246, + "learning_rate": 3.720035142032381e-06, + "loss": 0.6416, + "step": 19458 + }, + { + "epoch": 2.44, + "grad_norm": 10.849040985107422, + "learning_rate": 3.7191984269756938e-06, + "loss": 1.394, + "step": 19459 + }, + { + "epoch": 2.44, + "grad_norm": 3.1686325073242188, + "learning_rate": 3.718361711919006e-06, + "loss": 0.252, + "step": 19460 + }, + { + "epoch": 2.44, + "grad_norm": 94.78205108642578, + "learning_rate": 3.717524996862319e-06, + "loss": 2.0621, + "step": 19461 + }, + { + "epoch": 2.44, + "grad_norm": 13.970884323120117, + "learning_rate": 3.7166882818056318e-06, + "loss": 0.522, + "step": 19462 + }, + { + "epoch": 2.44, + "grad_norm": 10.70833683013916, + "learning_rate": 3.715851566748944e-06, + "loss": 1.0751, + "step": 19463 + }, + { + "epoch": 2.44, + "grad_norm": 113.82579040527344, + "learning_rate": 3.715014851692256e-06, + "loss": 1.6744, + "step": 19464 + }, + { + "epoch": 2.44, + "grad_norm": 11.373026847839355, + "learning_rate": 3.714178136635569e-06, + "loss": 0.5601, + "step": 19465 + }, + { + "epoch": 2.44, + "grad_norm": 17.753814697265625, + "learning_rate": 3.7133414215788817e-06, + "loss": 1.9597, + "step": 19466 + }, + { + "epoch": 2.44, + "grad_norm": 15.960187911987305, + "learning_rate": 3.712504706522194e-06, + "loss": 2.7553, + "step": 19467 + }, + { + "epoch": 2.44, + "grad_norm": 6.061026096343994, + "learning_rate": 3.711667991465507e-06, + "loss": 0.3813, + "step": 19468 + }, + { + "epoch": 2.44, + "grad_norm": 5.9911651611328125, + "learning_rate": 3.7108312764088197e-06, + "loss": 0.0639, + "step": 19469 + }, + { + "epoch": 2.44, + "grad_norm": 14.849716186523438, + "learning_rate": 3.709994561352132e-06, + "loss": 0.3714, + "step": 19470 + }, + { + "epoch": 2.44, + "grad_norm": 6.802692413330078, + "learning_rate": 3.709157846295444e-06, + "loss": 0.4296, + "step": 19471 + }, + { + "epoch": 2.44, + "grad_norm": 57.396915435791016, + "learning_rate": 3.708321131238757e-06, + "loss": 2.195, + "step": 19472 + }, + { + "epoch": 2.44, + "grad_norm": 12.485960006713867, + "learning_rate": 3.7074844161820696e-06, + "loss": 1.3668, + "step": 19473 + }, + { + "epoch": 2.44, + "grad_norm": 6.738912105560303, + "learning_rate": 3.706647701125382e-06, + "loss": 0.2507, + "step": 19474 + }, + { + "epoch": 2.44, + "grad_norm": 2.0292773246765137, + "learning_rate": 3.7058109860686948e-06, + "loss": 0.0491, + "step": 19475 + }, + { + "epoch": 2.44, + "grad_norm": 7.762231349945068, + "learning_rate": 3.704974271012007e-06, + "loss": 1.7823, + "step": 19476 + }, + { + "epoch": 2.44, + "grad_norm": 8.054952621459961, + "learning_rate": 3.70413755595532e-06, + "loss": 0.5199, + "step": 19477 + }, + { + "epoch": 2.44, + "grad_norm": 27.710302352905273, + "learning_rate": 3.703300840898632e-06, + "loss": 1.8586, + "step": 19478 + }, + { + "epoch": 2.44, + "grad_norm": 21.074888229370117, + "learning_rate": 3.7024641258419447e-06, + "loss": 0.7223, + "step": 19479 + }, + { + "epoch": 2.44, + "grad_norm": 48.69511032104492, + "learning_rate": 3.7016274107852575e-06, + "loss": 0.6589, + "step": 19480 + }, + { + "epoch": 2.44, + "grad_norm": 18.42178726196289, + "learning_rate": 3.70079069572857e-06, + "loss": 0.9482, + "step": 19481 + }, + { + "epoch": 2.44, + "grad_norm": 12.237410545349121, + "learning_rate": 3.6999539806718827e-06, + "loss": 0.443, + "step": 19482 + }, + { + "epoch": 2.45, + "grad_norm": 87.9030532836914, + "learning_rate": 3.699117265615195e-06, + "loss": 1.164, + "step": 19483 + }, + { + "epoch": 2.45, + "grad_norm": 10.423582077026367, + "learning_rate": 3.698280550558508e-06, + "loss": 1.4942, + "step": 19484 + }, + { + "epoch": 2.45, + "grad_norm": 15.563819885253906, + "learning_rate": 3.69744383550182e-06, + "loss": 0.3124, + "step": 19485 + }, + { + "epoch": 2.45, + "grad_norm": 26.41593360900879, + "learning_rate": 3.6966071204451326e-06, + "loss": 1.2273, + "step": 19486 + }, + { + "epoch": 2.45, + "grad_norm": 8.330730438232422, + "learning_rate": 3.695770405388445e-06, + "loss": 0.3219, + "step": 19487 + }, + { + "epoch": 2.45, + "grad_norm": 46.10406494140625, + "learning_rate": 3.694933690331758e-06, + "loss": 1.6226, + "step": 19488 + }, + { + "epoch": 2.45, + "grad_norm": 9.911351203918457, + "learning_rate": 3.6940969752750706e-06, + "loss": 1.0932, + "step": 19489 + }, + { + "epoch": 2.45, + "grad_norm": 7.939240455627441, + "learning_rate": 3.693260260218383e-06, + "loss": 0.6091, + "step": 19490 + }, + { + "epoch": 2.45, + "grad_norm": 14.360014915466309, + "learning_rate": 3.6924235451616954e-06, + "loss": 0.5849, + "step": 19491 + }, + { + "epoch": 2.45, + "grad_norm": 17.98020362854004, + "learning_rate": 3.6915868301050078e-06, + "loss": 0.8839, + "step": 19492 + }, + { + "epoch": 2.45, + "grad_norm": 20.01367950439453, + "learning_rate": 3.6907501150483206e-06, + "loss": 1.1177, + "step": 19493 + }, + { + "epoch": 2.45, + "grad_norm": 8.152234077453613, + "learning_rate": 3.689913399991633e-06, + "loss": 1.4944, + "step": 19494 + }, + { + "epoch": 2.45, + "grad_norm": 15.775879859924316, + "learning_rate": 3.6890766849349457e-06, + "loss": 0.8828, + "step": 19495 + }, + { + "epoch": 2.45, + "grad_norm": 18.101858139038086, + "learning_rate": 3.6882399698782585e-06, + "loss": 0.6848, + "step": 19496 + }, + { + "epoch": 2.45, + "grad_norm": 14.405749320983887, + "learning_rate": 3.687403254821571e-06, + "loss": 2.2475, + "step": 19497 + }, + { + "epoch": 2.45, + "grad_norm": 29.071508407592773, + "learning_rate": 3.6865665397648833e-06, + "loss": 1.3265, + "step": 19498 + }, + { + "epoch": 2.45, + "grad_norm": 15.056915283203125, + "learning_rate": 3.6857298247081957e-06, + "loss": 1.1738, + "step": 19499 + }, + { + "epoch": 2.45, + "grad_norm": 12.642854690551758, + "learning_rate": 3.6848931096515085e-06, + "loss": 1.3507, + "step": 19500 + }, + { + "epoch": 2.45, + "grad_norm": 9.297966003417969, + "learning_rate": 3.684056394594821e-06, + "loss": 0.3365, + "step": 19501 + }, + { + "epoch": 2.45, + "grad_norm": 5.268497467041016, + "learning_rate": 3.6832196795381337e-06, + "loss": 0.2009, + "step": 19502 + }, + { + "epoch": 2.45, + "grad_norm": 7.838582992553711, + "learning_rate": 3.6823829644814465e-06, + "loss": 0.1295, + "step": 19503 + }, + { + "epoch": 2.45, + "grad_norm": 9.778390884399414, + "learning_rate": 3.681546249424759e-06, + "loss": 1.5067, + "step": 19504 + }, + { + "epoch": 2.45, + "grad_norm": 3.048859119415283, + "learning_rate": 3.680709534368071e-06, + "loss": 0.1969, + "step": 19505 + }, + { + "epoch": 2.45, + "grad_norm": 7.797203063964844, + "learning_rate": 3.6798728193113836e-06, + "loss": 1.3604, + "step": 19506 + }, + { + "epoch": 2.45, + "grad_norm": 17.60069465637207, + "learning_rate": 3.6790361042546964e-06, + "loss": 1.4674, + "step": 19507 + }, + { + "epoch": 2.45, + "grad_norm": 15.349143981933594, + "learning_rate": 3.6781993891980088e-06, + "loss": 1.5963, + "step": 19508 + }, + { + "epoch": 2.45, + "grad_norm": 20.265392303466797, + "learning_rate": 3.6773626741413216e-06, + "loss": 1.4714, + "step": 19509 + }, + { + "epoch": 2.45, + "grad_norm": 37.8333740234375, + "learning_rate": 3.676525959084634e-06, + "loss": 1.4036, + "step": 19510 + }, + { + "epoch": 2.45, + "grad_norm": 17.156679153442383, + "learning_rate": 3.6756892440279468e-06, + "loss": 1.0847, + "step": 19511 + }, + { + "epoch": 2.45, + "grad_norm": 18.195537567138672, + "learning_rate": 3.6748525289712587e-06, + "loss": 0.7521, + "step": 19512 + }, + { + "epoch": 2.45, + "grad_norm": 16.99808120727539, + "learning_rate": 3.6740158139145715e-06, + "loss": 0.2767, + "step": 19513 + }, + { + "epoch": 2.45, + "grad_norm": 20.308456420898438, + "learning_rate": 3.6731790988578843e-06, + "loss": 1.6816, + "step": 19514 + }, + { + "epoch": 2.45, + "grad_norm": 15.886495590209961, + "learning_rate": 3.6723423838011967e-06, + "loss": 2.2394, + "step": 19515 + }, + { + "epoch": 2.45, + "grad_norm": 110.13824462890625, + "learning_rate": 3.6715056687445095e-06, + "loss": 2.045, + "step": 19516 + }, + { + "epoch": 2.45, + "grad_norm": 14.06793212890625, + "learning_rate": 3.670668953687822e-06, + "loss": 0.3872, + "step": 19517 + }, + { + "epoch": 2.45, + "grad_norm": 6.715120315551758, + "learning_rate": 3.6698322386311347e-06, + "loss": 1.452, + "step": 19518 + }, + { + "epoch": 2.45, + "grad_norm": 37.93868637084961, + "learning_rate": 3.6689955235744466e-06, + "loss": 2.0243, + "step": 19519 + }, + { + "epoch": 2.45, + "grad_norm": 21.736034393310547, + "learning_rate": 3.6681588085177594e-06, + "loss": 0.7348, + "step": 19520 + }, + { + "epoch": 2.45, + "grad_norm": 6.163998603820801, + "learning_rate": 3.6673220934610722e-06, + "loss": 0.1308, + "step": 19521 + }, + { + "epoch": 2.45, + "grad_norm": 12.94540023803711, + "learning_rate": 3.6664853784043846e-06, + "loss": 0.5496, + "step": 19522 + }, + { + "epoch": 2.45, + "grad_norm": 11.611268997192383, + "learning_rate": 3.6656486633476974e-06, + "loss": 1.0797, + "step": 19523 + }, + { + "epoch": 2.45, + "grad_norm": 14.129745483398438, + "learning_rate": 3.66481194829101e-06, + "loss": 1.3862, + "step": 19524 + }, + { + "epoch": 2.45, + "grad_norm": 46.87518310546875, + "learning_rate": 3.6639752332343226e-06, + "loss": 1.7547, + "step": 19525 + }, + { + "epoch": 2.45, + "grad_norm": 8.3310546875, + "learning_rate": 3.6631385181776345e-06, + "loss": 0.8219, + "step": 19526 + }, + { + "epoch": 2.45, + "grad_norm": 11.203372955322266, + "learning_rate": 3.6623018031209474e-06, + "loss": 0.7145, + "step": 19527 + }, + { + "epoch": 2.45, + "grad_norm": 27.463226318359375, + "learning_rate": 3.6614650880642597e-06, + "loss": 0.6897, + "step": 19528 + }, + { + "epoch": 2.45, + "grad_norm": 16.030445098876953, + "learning_rate": 3.6606283730075725e-06, + "loss": 1.0668, + "step": 19529 + }, + { + "epoch": 2.45, + "grad_norm": 14.853161811828613, + "learning_rate": 3.6597916579508853e-06, + "loss": 1.0309, + "step": 19530 + }, + { + "epoch": 2.45, + "grad_norm": 19.87004852294922, + "learning_rate": 3.6589549428941977e-06, + "loss": 0.8711, + "step": 19531 + }, + { + "epoch": 2.45, + "grad_norm": 15.398309707641602, + "learning_rate": 3.6581182278375105e-06, + "loss": 1.8404, + "step": 19532 + }, + { + "epoch": 2.45, + "grad_norm": 4.537481307983398, + "learning_rate": 3.6572815127808225e-06, + "loss": 0.5392, + "step": 19533 + }, + { + "epoch": 2.45, + "grad_norm": 6.660141468048096, + "learning_rate": 3.6564447977241353e-06, + "loss": 0.3912, + "step": 19534 + }, + { + "epoch": 2.45, + "grad_norm": 10.438698768615723, + "learning_rate": 3.6556080826674476e-06, + "loss": 0.4849, + "step": 19535 + }, + { + "epoch": 2.45, + "grad_norm": 3.368213653564453, + "learning_rate": 3.6547713676107605e-06, + "loss": 1.1956, + "step": 19536 + }, + { + "epoch": 2.45, + "grad_norm": 7.393680572509766, + "learning_rate": 3.6539346525540733e-06, + "loss": 1.0433, + "step": 19537 + }, + { + "epoch": 2.45, + "grad_norm": 11.357941627502441, + "learning_rate": 3.6530979374973856e-06, + "loss": 0.5289, + "step": 19538 + }, + { + "epoch": 2.45, + "grad_norm": 9.764481544494629, + "learning_rate": 3.6522612224406984e-06, + "loss": 0.3442, + "step": 19539 + }, + { + "epoch": 2.45, + "grad_norm": 9.39493465423584, + "learning_rate": 3.6514245073840104e-06, + "loss": 1.3427, + "step": 19540 + }, + { + "epoch": 2.45, + "grad_norm": 11.347414016723633, + "learning_rate": 3.650587792327323e-06, + "loss": 1.4416, + "step": 19541 + }, + { + "epoch": 2.45, + "grad_norm": 11.80206298828125, + "learning_rate": 3.6497510772706356e-06, + "loss": 0.6181, + "step": 19542 + }, + { + "epoch": 2.45, + "grad_norm": 23.547332763671875, + "learning_rate": 3.6489143622139484e-06, + "loss": 1.0659, + "step": 19543 + }, + { + "epoch": 2.45, + "grad_norm": 15.252655982971191, + "learning_rate": 3.648077647157261e-06, + "loss": 0.5836, + "step": 19544 + }, + { + "epoch": 2.45, + "grad_norm": 17.29207992553711, + "learning_rate": 3.6472409321005735e-06, + "loss": 0.8593, + "step": 19545 + }, + { + "epoch": 2.45, + "grad_norm": 25.271240234375, + "learning_rate": 3.6464042170438864e-06, + "loss": 2.1257, + "step": 19546 + }, + { + "epoch": 2.45, + "grad_norm": 6.511344909667969, + "learning_rate": 3.6455675019871983e-06, + "loss": 0.4393, + "step": 19547 + }, + { + "epoch": 2.45, + "grad_norm": 2.7553915977478027, + "learning_rate": 3.644730786930511e-06, + "loss": 1.0807, + "step": 19548 + }, + { + "epoch": 2.45, + "grad_norm": 66.39590454101562, + "learning_rate": 3.6438940718738235e-06, + "loss": 1.712, + "step": 19549 + }, + { + "epoch": 2.45, + "grad_norm": 25.083534240722656, + "learning_rate": 3.6430573568171363e-06, + "loss": 1.3258, + "step": 19550 + }, + { + "epoch": 2.45, + "grad_norm": 16.88987159729004, + "learning_rate": 3.6422206417604487e-06, + "loss": 1.274, + "step": 19551 + }, + { + "epoch": 2.45, + "grad_norm": 28.262577056884766, + "learning_rate": 3.6413839267037615e-06, + "loss": 1.6496, + "step": 19552 + }, + { + "epoch": 2.45, + "grad_norm": 69.0420150756836, + "learning_rate": 3.6405472116470743e-06, + "loss": 1.1903, + "step": 19553 + }, + { + "epoch": 2.45, + "grad_norm": 10.95007610321045, + "learning_rate": 3.6397104965903862e-06, + "loss": 0.1559, + "step": 19554 + }, + { + "epoch": 2.45, + "grad_norm": 12.338822364807129, + "learning_rate": 3.638873781533699e-06, + "loss": 0.8436, + "step": 19555 + }, + { + "epoch": 2.45, + "grad_norm": 9.753349304199219, + "learning_rate": 3.6380370664770114e-06, + "loss": 0.4897, + "step": 19556 + }, + { + "epoch": 2.45, + "grad_norm": 12.568263053894043, + "learning_rate": 3.637200351420324e-06, + "loss": 0.6605, + "step": 19557 + }, + { + "epoch": 2.45, + "grad_norm": 60.659671783447266, + "learning_rate": 3.6363636363636366e-06, + "loss": 2.3283, + "step": 19558 + }, + { + "epoch": 2.45, + "grad_norm": 16.86094093322754, + "learning_rate": 3.6355269213069494e-06, + "loss": 0.4849, + "step": 19559 + }, + { + "epoch": 2.45, + "grad_norm": 17.64162254333496, + "learning_rate": 3.634690206250262e-06, + "loss": 0.8689, + "step": 19560 + }, + { + "epoch": 2.45, + "grad_norm": 13.738029479980469, + "learning_rate": 3.633853491193574e-06, + "loss": 0.8648, + "step": 19561 + }, + { + "epoch": 2.45, + "grad_norm": 14.854199409484863, + "learning_rate": 3.633016776136887e-06, + "loss": 1.7406, + "step": 19562 + }, + { + "epoch": 2.46, + "grad_norm": 21.5056095123291, + "learning_rate": 3.6321800610801993e-06, + "loss": 0.4041, + "step": 19563 + }, + { + "epoch": 2.46, + "grad_norm": 19.813793182373047, + "learning_rate": 3.631343346023512e-06, + "loss": 1.316, + "step": 19564 + }, + { + "epoch": 2.46, + "grad_norm": 17.993206024169922, + "learning_rate": 3.6305066309668245e-06, + "loss": 0.6492, + "step": 19565 + }, + { + "epoch": 2.46, + "grad_norm": 8.353463172912598, + "learning_rate": 3.6296699159101373e-06, + "loss": 1.2699, + "step": 19566 + }, + { + "epoch": 2.46, + "grad_norm": 49.2107048034668, + "learning_rate": 3.62883320085345e-06, + "loss": 0.7511, + "step": 19567 + }, + { + "epoch": 2.46, + "grad_norm": 14.674556732177734, + "learning_rate": 3.627996485796762e-06, + "loss": 0.4794, + "step": 19568 + }, + { + "epoch": 2.46, + "grad_norm": 10.151453971862793, + "learning_rate": 3.6271597707400744e-06, + "loss": 0.3333, + "step": 19569 + }, + { + "epoch": 2.46, + "grad_norm": 11.01027774810791, + "learning_rate": 3.6263230556833872e-06, + "loss": 1.0966, + "step": 19570 + }, + { + "epoch": 2.46, + "grad_norm": 15.187713623046875, + "learning_rate": 3.6254863406267e-06, + "loss": 1.5509, + "step": 19571 + }, + { + "epoch": 2.46, + "grad_norm": 52.74294662475586, + "learning_rate": 3.6246496255700124e-06, + "loss": 1.6114, + "step": 19572 + }, + { + "epoch": 2.46, + "grad_norm": 2.404120683670044, + "learning_rate": 3.6238129105133252e-06, + "loss": 0.1087, + "step": 19573 + }, + { + "epoch": 2.46, + "grad_norm": 13.5791654586792, + "learning_rate": 3.6229761954566376e-06, + "loss": 0.5679, + "step": 19574 + }, + { + "epoch": 2.46, + "grad_norm": 9.572688102722168, + "learning_rate": 3.62213948039995e-06, + "loss": 0.4548, + "step": 19575 + }, + { + "epoch": 2.46, + "grad_norm": 26.158750534057617, + "learning_rate": 3.6213027653432624e-06, + "loss": 2.7927, + "step": 19576 + }, + { + "epoch": 2.46, + "grad_norm": 11.314776420593262, + "learning_rate": 3.620466050286575e-06, + "loss": 0.4775, + "step": 19577 + }, + { + "epoch": 2.46, + "grad_norm": 5.215376853942871, + "learning_rate": 3.619629335229888e-06, + "loss": 0.3213, + "step": 19578 + }, + { + "epoch": 2.46, + "grad_norm": 22.112407684326172, + "learning_rate": 3.6187926201732003e-06, + "loss": 1.6694, + "step": 19579 + }, + { + "epoch": 2.46, + "grad_norm": 18.88576889038086, + "learning_rate": 3.617955905116513e-06, + "loss": 0.802, + "step": 19580 + }, + { + "epoch": 2.46, + "grad_norm": 14.532073974609375, + "learning_rate": 3.617119190059825e-06, + "loss": 0.8773, + "step": 19581 + }, + { + "epoch": 2.46, + "grad_norm": 21.64769744873047, + "learning_rate": 3.616282475003138e-06, + "loss": 0.8309, + "step": 19582 + }, + { + "epoch": 2.46, + "grad_norm": 7.648311138153076, + "learning_rate": 3.6154457599464503e-06, + "loss": 0.7514, + "step": 19583 + }, + { + "epoch": 2.46, + "grad_norm": 18.131874084472656, + "learning_rate": 3.614609044889763e-06, + "loss": 0.5386, + "step": 19584 + }, + { + "epoch": 2.46, + "grad_norm": 78.26471710205078, + "learning_rate": 3.613772329833076e-06, + "loss": 1.9049, + "step": 19585 + }, + { + "epoch": 2.46, + "grad_norm": 19.853479385375977, + "learning_rate": 3.6129356147763883e-06, + "loss": 0.8589, + "step": 19586 + }, + { + "epoch": 2.46, + "grad_norm": 11.111654281616211, + "learning_rate": 3.612098899719701e-06, + "loss": 0.9744, + "step": 19587 + }, + { + "epoch": 2.46, + "grad_norm": 31.242660522460938, + "learning_rate": 3.611262184663013e-06, + "loss": 1.9299, + "step": 19588 + }, + { + "epoch": 2.46, + "grad_norm": 33.60970687866211, + "learning_rate": 3.610425469606326e-06, + "loss": 1.4131, + "step": 19589 + }, + { + "epoch": 2.46, + "grad_norm": 5.124098300933838, + "learning_rate": 3.609588754549638e-06, + "loss": 0.4458, + "step": 19590 + }, + { + "epoch": 2.46, + "grad_norm": 3.3736352920532227, + "learning_rate": 3.608752039492951e-06, + "loss": 0.1404, + "step": 19591 + }, + { + "epoch": 2.46, + "grad_norm": 114.87059783935547, + "learning_rate": 3.6079153244362634e-06, + "loss": 2.0552, + "step": 19592 + }, + { + "epoch": 2.46, + "grad_norm": 6.528724193572998, + "learning_rate": 3.607078609379576e-06, + "loss": 0.4047, + "step": 19593 + }, + { + "epoch": 2.46, + "grad_norm": 14.813525199890137, + "learning_rate": 3.606241894322889e-06, + "loss": 0.3514, + "step": 19594 + }, + { + "epoch": 2.46, + "grad_norm": 6.653222560882568, + "learning_rate": 3.605405179266201e-06, + "loss": 0.4972, + "step": 19595 + }, + { + "epoch": 2.46, + "grad_norm": 29.276601791381836, + "learning_rate": 3.6045684642095137e-06, + "loss": 1.7808, + "step": 19596 + }, + { + "epoch": 2.46, + "grad_norm": 13.689377784729004, + "learning_rate": 3.603731749152826e-06, + "loss": 0.602, + "step": 19597 + }, + { + "epoch": 2.46, + "grad_norm": 7.918407917022705, + "learning_rate": 3.602895034096139e-06, + "loss": 0.3228, + "step": 19598 + }, + { + "epoch": 2.46, + "grad_norm": 40.56988525390625, + "learning_rate": 3.6020583190394513e-06, + "loss": 1.5681, + "step": 19599 + }, + { + "epoch": 2.46, + "grad_norm": 9.635811805725098, + "learning_rate": 3.601221603982764e-06, + "loss": 0.2254, + "step": 19600 + }, + { + "epoch": 2.46, + "eval_loss": 0.08711016178131104, + "eval_runtime": 95.4209, + "eval_samples_per_second": 37.12, + "eval_steps_per_second": 37.12, + "step": 19600 + }, + { + "epoch": 2.46, + "grad_norm": 17.06913185119629, + "learning_rate": 3.600384888926077e-06, + "loss": 1.5398, + "step": 19601 + }, + { + "epoch": 2.46, + "grad_norm": 15.327908515930176, + "learning_rate": 3.599548173869389e-06, + "loss": 1.126, + "step": 19602 + }, + { + "epoch": 2.46, + "grad_norm": 7.690364837646484, + "learning_rate": 3.5987114588127017e-06, + "loss": 0.8513, + "step": 19603 + }, + { + "epoch": 2.46, + "grad_norm": 14.208024024963379, + "learning_rate": 3.597874743756014e-06, + "loss": 0.637, + "step": 19604 + }, + { + "epoch": 2.46, + "grad_norm": 27.198848724365234, + "learning_rate": 3.597038028699327e-06, + "loss": 0.8151, + "step": 19605 + }, + { + "epoch": 2.46, + "grad_norm": 8.483991622924805, + "learning_rate": 3.5962013136426392e-06, + "loss": 0.2585, + "step": 19606 + }, + { + "epoch": 2.46, + "grad_norm": 10.895722389221191, + "learning_rate": 3.595364598585952e-06, + "loss": 1.4322, + "step": 19607 + }, + { + "epoch": 2.46, + "grad_norm": 3.930285930633545, + "learning_rate": 3.594527883529265e-06, + "loss": 0.2244, + "step": 19608 + }, + { + "epoch": 2.46, + "grad_norm": 20.113201141357422, + "learning_rate": 3.5936911684725768e-06, + "loss": 3.2422, + "step": 19609 + }, + { + "epoch": 2.46, + "grad_norm": 3.7983405590057373, + "learning_rate": 3.592854453415889e-06, + "loss": 0.0766, + "step": 19610 + }, + { + "epoch": 2.46, + "grad_norm": 10.632842063903809, + "learning_rate": 3.592017738359202e-06, + "loss": 0.5352, + "step": 19611 + }, + { + "epoch": 2.46, + "grad_norm": 40.46181869506836, + "learning_rate": 3.5911810233025148e-06, + "loss": 0.6148, + "step": 19612 + }, + { + "epoch": 2.46, + "grad_norm": 14.521509170532227, + "learning_rate": 3.590344308245827e-06, + "loss": 0.8839, + "step": 19613 + }, + { + "epoch": 2.46, + "grad_norm": 11.604077339172363, + "learning_rate": 3.58950759318914e-06, + "loss": 0.8917, + "step": 19614 + }, + { + "epoch": 2.46, + "grad_norm": 14.674836158752441, + "learning_rate": 3.5886708781324523e-06, + "loss": 1.3426, + "step": 19615 + }, + { + "epoch": 2.46, + "grad_norm": 16.176982879638672, + "learning_rate": 3.5878341630757647e-06, + "loss": 0.6628, + "step": 19616 + }, + { + "epoch": 2.46, + "grad_norm": 14.715084075927734, + "learning_rate": 3.586997448019077e-06, + "loss": 1.1123, + "step": 19617 + }, + { + "epoch": 2.46, + "grad_norm": 31.576904296875, + "learning_rate": 3.58616073296239e-06, + "loss": 1.2923, + "step": 19618 + }, + { + "epoch": 2.46, + "grad_norm": 10.267995834350586, + "learning_rate": 3.5853240179057027e-06, + "loss": 0.6754, + "step": 19619 + }, + { + "epoch": 2.46, + "grad_norm": 15.240357398986816, + "learning_rate": 3.584487302849015e-06, + "loss": 1.5771, + "step": 19620 + }, + { + "epoch": 2.46, + "grad_norm": 111.63504028320312, + "learning_rate": 3.583650587792328e-06, + "loss": 2.1596, + "step": 19621 + }, + { + "epoch": 2.46, + "grad_norm": 11.189507484436035, + "learning_rate": 3.5828138727356402e-06, + "loss": 0.9857, + "step": 19622 + }, + { + "epoch": 2.46, + "grad_norm": 84.83074951171875, + "learning_rate": 3.5819771576789526e-06, + "loss": 1.9947, + "step": 19623 + }, + { + "epoch": 2.46, + "grad_norm": 4.97153377532959, + "learning_rate": 3.581140442622265e-06, + "loss": 0.1164, + "step": 19624 + }, + { + "epoch": 2.46, + "grad_norm": 8.341716766357422, + "learning_rate": 3.5803037275655778e-06, + "loss": 1.4288, + "step": 19625 + }, + { + "epoch": 2.46, + "grad_norm": 31.253576278686523, + "learning_rate": 3.5794670125088906e-06, + "loss": 2.1505, + "step": 19626 + }, + { + "epoch": 2.46, + "grad_norm": 26.299104690551758, + "learning_rate": 3.578630297452203e-06, + "loss": 1.4682, + "step": 19627 + }, + { + "epoch": 2.46, + "grad_norm": 23.523075103759766, + "learning_rate": 3.5777935823955158e-06, + "loss": 1.395, + "step": 19628 + }, + { + "epoch": 2.46, + "grad_norm": 14.520183563232422, + "learning_rate": 3.576956867338828e-06, + "loss": 0.8351, + "step": 19629 + }, + { + "epoch": 2.46, + "grad_norm": 11.813876152038574, + "learning_rate": 3.5761201522821405e-06, + "loss": 2.7818, + "step": 19630 + }, + { + "epoch": 2.46, + "grad_norm": 16.819231033325195, + "learning_rate": 3.575283437225453e-06, + "loss": 1.6929, + "step": 19631 + }, + { + "epoch": 2.46, + "grad_norm": 12.025693893432617, + "learning_rate": 3.5744467221687657e-06, + "loss": 0.6277, + "step": 19632 + }, + { + "epoch": 2.46, + "grad_norm": 119.55182647705078, + "learning_rate": 3.573610007112078e-06, + "loss": 0.5821, + "step": 19633 + }, + { + "epoch": 2.46, + "grad_norm": 43.22489547729492, + "learning_rate": 3.572773292055391e-06, + "loss": 0.9943, + "step": 19634 + }, + { + "epoch": 2.46, + "grad_norm": 170.36593627929688, + "learning_rate": 3.5719365769987037e-06, + "loss": 1.7828, + "step": 19635 + }, + { + "epoch": 2.46, + "grad_norm": 8.834037780761719, + "learning_rate": 3.571099861942016e-06, + "loss": 0.8836, + "step": 19636 + }, + { + "epoch": 2.46, + "grad_norm": 13.344812393188477, + "learning_rate": 3.5702631468853284e-06, + "loss": 2.1031, + "step": 19637 + }, + { + "epoch": 2.46, + "grad_norm": 8.85362434387207, + "learning_rate": 3.569426431828641e-06, + "loss": 0.9738, + "step": 19638 + }, + { + "epoch": 2.46, + "grad_norm": 15.561305046081543, + "learning_rate": 3.5685897167719536e-06, + "loss": 1.2766, + "step": 19639 + }, + { + "epoch": 2.46, + "grad_norm": 15.3659086227417, + "learning_rate": 3.567753001715266e-06, + "loss": 0.8837, + "step": 19640 + }, + { + "epoch": 2.46, + "grad_norm": 24.762359619140625, + "learning_rate": 3.566916286658579e-06, + "loss": 0.8039, + "step": 19641 + }, + { + "epoch": 2.47, + "grad_norm": 20.780590057373047, + "learning_rate": 3.5660795716018916e-06, + "loss": 0.9226, + "step": 19642 + }, + { + "epoch": 2.47, + "grad_norm": 13.365804672241211, + "learning_rate": 3.565242856545204e-06, + "loss": 0.4648, + "step": 19643 + }, + { + "epoch": 2.47, + "grad_norm": 9.629612922668457, + "learning_rate": 3.5644061414885164e-06, + "loss": 1.1486, + "step": 19644 + }, + { + "epoch": 2.47, + "grad_norm": 5.953568458557129, + "learning_rate": 3.5635694264318287e-06, + "loss": 0.3317, + "step": 19645 + }, + { + "epoch": 2.47, + "grad_norm": 5.891057968139648, + "learning_rate": 3.5627327113751415e-06, + "loss": 0.3262, + "step": 19646 + }, + { + "epoch": 2.47, + "grad_norm": 13.948858261108398, + "learning_rate": 3.561895996318454e-06, + "loss": 0.934, + "step": 19647 + }, + { + "epoch": 2.47, + "grad_norm": 14.469639778137207, + "learning_rate": 3.5610592812617667e-06, + "loss": 0.7603, + "step": 19648 + }, + { + "epoch": 2.47, + "grad_norm": 32.848106384277344, + "learning_rate": 3.5602225662050795e-06, + "loss": 0.9696, + "step": 19649 + }, + { + "epoch": 2.47, + "grad_norm": 1.0937618017196655, + "learning_rate": 3.559385851148392e-06, + "loss": 0.0276, + "step": 19650 + }, + { + "epoch": 2.47, + "grad_norm": 10.17780876159668, + "learning_rate": 3.558549136091704e-06, + "loss": 0.3886, + "step": 19651 + }, + { + "epoch": 2.47, + "grad_norm": 17.023239135742188, + "learning_rate": 3.5577124210350167e-06, + "loss": 0.7694, + "step": 19652 + }, + { + "epoch": 2.47, + "grad_norm": 14.045829772949219, + "learning_rate": 3.5568757059783295e-06, + "loss": 0.4005, + "step": 19653 + }, + { + "epoch": 2.47, + "grad_norm": 17.317874908447266, + "learning_rate": 3.556038990921642e-06, + "loss": 0.3614, + "step": 19654 + }, + { + "epoch": 2.47, + "grad_norm": 6.471225261688232, + "learning_rate": 3.5552022758649546e-06, + "loss": 0.5899, + "step": 19655 + }, + { + "epoch": 2.47, + "grad_norm": 18.64377784729004, + "learning_rate": 3.554365560808267e-06, + "loss": 0.8984, + "step": 19656 + }, + { + "epoch": 2.47, + "grad_norm": 55.732696533203125, + "learning_rate": 3.55352884575158e-06, + "loss": 1.6754, + "step": 19657 + }, + { + "epoch": 2.47, + "grad_norm": 15.808688163757324, + "learning_rate": 3.5526921306948918e-06, + "loss": 0.6719, + "step": 19658 + }, + { + "epoch": 2.47, + "grad_norm": 76.85340118408203, + "learning_rate": 3.5518554156382046e-06, + "loss": 2.71, + "step": 19659 + }, + { + "epoch": 2.47, + "grad_norm": 11.649335861206055, + "learning_rate": 3.5510187005815174e-06, + "loss": 0.6056, + "step": 19660 + }, + { + "epoch": 2.47, + "grad_norm": 14.526211738586426, + "learning_rate": 3.5501819855248298e-06, + "loss": 0.4754, + "step": 19661 + }, + { + "epoch": 2.47, + "grad_norm": 111.2694091796875, + "learning_rate": 3.5493452704681426e-06, + "loss": 1.8594, + "step": 19662 + }, + { + "epoch": 2.47, + "grad_norm": 23.605730056762695, + "learning_rate": 3.548508555411455e-06, + "loss": 0.8543, + "step": 19663 + }, + { + "epoch": 2.47, + "grad_norm": 45.71969223022461, + "learning_rate": 3.5476718403547677e-06, + "loss": 1.9528, + "step": 19664 + }, + { + "epoch": 2.47, + "grad_norm": 5.812906742095947, + "learning_rate": 3.5468351252980797e-06, + "loss": 0.0803, + "step": 19665 + }, + { + "epoch": 2.47, + "grad_norm": 24.693025588989258, + "learning_rate": 3.5459984102413925e-06, + "loss": 2.4052, + "step": 19666 + }, + { + "epoch": 2.47, + "grad_norm": 23.21428871154785, + "learning_rate": 3.5451616951847053e-06, + "loss": 0.9809, + "step": 19667 + }, + { + "epoch": 2.47, + "grad_norm": 14.185833930969238, + "learning_rate": 3.5443249801280177e-06, + "loss": 0.5601, + "step": 19668 + }, + { + "epoch": 2.47, + "grad_norm": 16.369680404663086, + "learning_rate": 3.5434882650713305e-06, + "loss": 2.8719, + "step": 19669 + }, + { + "epoch": 2.47, + "grad_norm": 10.543824195861816, + "learning_rate": 3.542651550014643e-06, + "loss": 0.1977, + "step": 19670 + }, + { + "epoch": 2.47, + "grad_norm": 15.344318389892578, + "learning_rate": 3.5418148349579552e-06, + "loss": 2.0361, + "step": 19671 + }, + { + "epoch": 2.47, + "grad_norm": 23.386581420898438, + "learning_rate": 3.5409781199012676e-06, + "loss": 1.4351, + "step": 19672 + }, + { + "epoch": 2.47, + "grad_norm": 14.04124927520752, + "learning_rate": 3.5401414048445804e-06, + "loss": 0.4966, + "step": 19673 + }, + { + "epoch": 2.47, + "grad_norm": 23.20058822631836, + "learning_rate": 3.539304689787893e-06, + "loss": 1.6201, + "step": 19674 + }, + { + "epoch": 2.47, + "grad_norm": 14.27526569366455, + "learning_rate": 3.5384679747312056e-06, + "loss": 0.8549, + "step": 19675 + }, + { + "epoch": 2.47, + "grad_norm": 17.937129974365234, + "learning_rate": 3.5376312596745184e-06, + "loss": 0.8855, + "step": 19676 + }, + { + "epoch": 2.47, + "grad_norm": 45.540687561035156, + "learning_rate": 3.5367945446178308e-06, + "loss": 2.1641, + "step": 19677 + }, + { + "epoch": 2.47, + "grad_norm": 5.103177070617676, + "learning_rate": 3.535957829561143e-06, + "loss": 0.3367, + "step": 19678 + }, + { + "epoch": 2.47, + "grad_norm": 22.65744972229004, + "learning_rate": 3.5351211145044555e-06, + "loss": 1.3482, + "step": 19679 + }, + { + "epoch": 2.47, + "grad_norm": 26.80577278137207, + "learning_rate": 3.5342843994477683e-06, + "loss": 0.8469, + "step": 19680 + }, + { + "epoch": 2.47, + "grad_norm": 4.383388519287109, + "learning_rate": 3.5334476843910807e-06, + "loss": 0.1368, + "step": 19681 + }, + { + "epoch": 2.47, + "grad_norm": 42.43025588989258, + "learning_rate": 3.5326109693343935e-06, + "loss": 1.5467, + "step": 19682 + }, + { + "epoch": 2.47, + "grad_norm": 106.4570083618164, + "learning_rate": 3.5317742542777063e-06, + "loss": 1.3021, + "step": 19683 + }, + { + "epoch": 2.47, + "grad_norm": 66.1055679321289, + "learning_rate": 3.5309375392210187e-06, + "loss": 1.7628, + "step": 19684 + }, + { + "epoch": 2.47, + "grad_norm": 17.448001861572266, + "learning_rate": 3.530100824164331e-06, + "loss": 1.1025, + "step": 19685 + }, + { + "epoch": 2.47, + "grad_norm": 11.699751853942871, + "learning_rate": 3.5292641091076435e-06, + "loss": 0.3397, + "step": 19686 + }, + { + "epoch": 2.47, + "grad_norm": 11.920134544372559, + "learning_rate": 3.5284273940509563e-06, + "loss": 0.5954, + "step": 19687 + }, + { + "epoch": 2.47, + "grad_norm": 39.143470764160156, + "learning_rate": 3.5275906789942686e-06, + "loss": 1.0038, + "step": 19688 + }, + { + "epoch": 2.47, + "grad_norm": 18.545639038085938, + "learning_rate": 3.5267539639375814e-06, + "loss": 2.4994, + "step": 19689 + }, + { + "epoch": 2.47, + "grad_norm": 64.48916625976562, + "learning_rate": 3.5259172488808942e-06, + "loss": 1.3953, + "step": 19690 + }, + { + "epoch": 2.47, + "grad_norm": 13.650774002075195, + "learning_rate": 3.5250805338242066e-06, + "loss": 0.3642, + "step": 19691 + }, + { + "epoch": 2.47, + "grad_norm": 48.111083984375, + "learning_rate": 3.5242438187675186e-06, + "loss": 1.8149, + "step": 19692 + }, + { + "epoch": 2.47, + "grad_norm": 3.1658313274383545, + "learning_rate": 3.5234071037108314e-06, + "loss": 0.075, + "step": 19693 + }, + { + "epoch": 2.47, + "grad_norm": 9.8385648727417, + "learning_rate": 3.522570388654144e-06, + "loss": 0.8082, + "step": 19694 + }, + { + "epoch": 2.47, + "grad_norm": 18.068117141723633, + "learning_rate": 3.5217336735974565e-06, + "loss": 2.5033, + "step": 19695 + }, + { + "epoch": 2.47, + "grad_norm": 23.014169692993164, + "learning_rate": 3.5208969585407694e-06, + "loss": 0.8867, + "step": 19696 + }, + { + "epoch": 2.47, + "grad_norm": 233.1300811767578, + "learning_rate": 3.5200602434840817e-06, + "loss": 1.2144, + "step": 19697 + }, + { + "epoch": 2.47, + "grad_norm": 17.034318923950195, + "learning_rate": 3.5192235284273945e-06, + "loss": 1.1768, + "step": 19698 + }, + { + "epoch": 2.47, + "grad_norm": 10.715003967285156, + "learning_rate": 3.5183868133707065e-06, + "loss": 0.2994, + "step": 19699 + }, + { + "epoch": 2.47, + "grad_norm": 18.686586380004883, + "learning_rate": 3.5175500983140193e-06, + "loss": 0.476, + "step": 19700 + }, + { + "epoch": 2.47, + "grad_norm": 17.066011428833008, + "learning_rate": 3.516713383257332e-06, + "loss": 1.4482, + "step": 19701 + }, + { + "epoch": 2.47, + "grad_norm": 14.509745597839355, + "learning_rate": 3.5158766682006445e-06, + "loss": 1.2477, + "step": 19702 + }, + { + "epoch": 2.47, + "grad_norm": 3.128937244415283, + "learning_rate": 3.5150399531439573e-06, + "loss": 0.0904, + "step": 19703 + }, + { + "epoch": 2.47, + "grad_norm": 10.236748695373535, + "learning_rate": 3.5142032380872696e-06, + "loss": 1.3713, + "step": 19704 + }, + { + "epoch": 2.47, + "grad_norm": 15.119993209838867, + "learning_rate": 3.5133665230305824e-06, + "loss": 1.3188, + "step": 19705 + }, + { + "epoch": 2.47, + "grad_norm": 12.767293930053711, + "learning_rate": 3.5125298079738944e-06, + "loss": 0.7497, + "step": 19706 + }, + { + "epoch": 2.47, + "grad_norm": 11.747312545776367, + "learning_rate": 3.511693092917207e-06, + "loss": 0.6895, + "step": 19707 + }, + { + "epoch": 2.47, + "grad_norm": 11.619932174682617, + "learning_rate": 3.51085637786052e-06, + "loss": 0.7908, + "step": 19708 + }, + { + "epoch": 2.47, + "grad_norm": 17.017072677612305, + "learning_rate": 3.5100196628038324e-06, + "loss": 1.3293, + "step": 19709 + }, + { + "epoch": 2.47, + "grad_norm": 13.582147598266602, + "learning_rate": 3.509182947747145e-06, + "loss": 1.2597, + "step": 19710 + }, + { + "epoch": 2.47, + "grad_norm": 6.918244361877441, + "learning_rate": 3.5083462326904576e-06, + "loss": 0.5073, + "step": 19711 + }, + { + "epoch": 2.47, + "grad_norm": 5.3982391357421875, + "learning_rate": 3.5075095176337704e-06, + "loss": 2.3984, + "step": 19712 + }, + { + "epoch": 2.47, + "grad_norm": 8.981369018554688, + "learning_rate": 3.5066728025770823e-06, + "loss": 0.6294, + "step": 19713 + }, + { + "epoch": 2.47, + "grad_norm": 3.048067569732666, + "learning_rate": 3.505836087520395e-06, + "loss": 0.1552, + "step": 19714 + }, + { + "epoch": 2.47, + "grad_norm": 16.41480255126953, + "learning_rate": 3.5049993724637075e-06, + "loss": 0.676, + "step": 19715 + }, + { + "epoch": 2.47, + "grad_norm": 10.085628509521484, + "learning_rate": 3.5041626574070203e-06, + "loss": 0.1062, + "step": 19716 + }, + { + "epoch": 2.47, + "grad_norm": 10.479913711547852, + "learning_rate": 3.503325942350333e-06, + "loss": 0.6488, + "step": 19717 + }, + { + "epoch": 2.47, + "grad_norm": 8.183189392089844, + "learning_rate": 3.5024892272936455e-06, + "loss": 0.3081, + "step": 19718 + }, + { + "epoch": 2.47, + "grad_norm": 26.590784072875977, + "learning_rate": 3.5016525122369583e-06, + "loss": 1.0394, + "step": 19719 + }, + { + "epoch": 2.47, + "grad_norm": 5.6834235191345215, + "learning_rate": 3.5008157971802702e-06, + "loss": 0.5966, + "step": 19720 + }, + { + "epoch": 2.47, + "grad_norm": 10.283235549926758, + "learning_rate": 3.499979082123583e-06, + "loss": 1.4802, + "step": 19721 + }, + { + "epoch": 2.48, + "grad_norm": 13.874239921569824, + "learning_rate": 3.4991423670668954e-06, + "loss": 1.2384, + "step": 19722 + }, + { + "epoch": 2.48, + "grad_norm": 14.061607360839844, + "learning_rate": 3.4983056520102082e-06, + "loss": 0.635, + "step": 19723 + }, + { + "epoch": 2.48, + "grad_norm": 5.137641906738281, + "learning_rate": 3.497468936953521e-06, + "loss": 0.8886, + "step": 19724 + }, + { + "epoch": 2.48, + "grad_norm": 16.545169830322266, + "learning_rate": 3.4966322218968334e-06, + "loss": 0.7775, + "step": 19725 + }, + { + "epoch": 2.48, + "grad_norm": 4.7449951171875, + "learning_rate": 3.495795506840146e-06, + "loss": 0.9171, + "step": 19726 + }, + { + "epoch": 2.48, + "grad_norm": 35.38679885864258, + "learning_rate": 3.494958791783458e-06, + "loss": 0.6957, + "step": 19727 + }, + { + "epoch": 2.48, + "grad_norm": 163.88958740234375, + "learning_rate": 3.494122076726771e-06, + "loss": 2.2629, + "step": 19728 + }, + { + "epoch": 2.48, + "grad_norm": 33.55429458618164, + "learning_rate": 3.4932853616700833e-06, + "loss": 1.0029, + "step": 19729 + }, + { + "epoch": 2.48, + "grad_norm": 17.60959815979004, + "learning_rate": 3.492448646613396e-06, + "loss": 0.6596, + "step": 19730 + }, + { + "epoch": 2.48, + "grad_norm": 172.82119750976562, + "learning_rate": 3.491611931556709e-06, + "loss": 2.0909, + "step": 19731 + }, + { + "epoch": 2.48, + "grad_norm": 71.0232162475586, + "learning_rate": 3.4907752165000213e-06, + "loss": 2.8295, + "step": 19732 + }, + { + "epoch": 2.48, + "grad_norm": 16.076486587524414, + "learning_rate": 3.489938501443334e-06, + "loss": 1.1559, + "step": 19733 + }, + { + "epoch": 2.48, + "grad_norm": 11.610815048217773, + "learning_rate": 3.489101786386646e-06, + "loss": 1.2087, + "step": 19734 + }, + { + "epoch": 2.48, + "grad_norm": 14.32686710357666, + "learning_rate": 3.488265071329959e-06, + "loss": 0.5971, + "step": 19735 + }, + { + "epoch": 2.48, + "grad_norm": 13.450050354003906, + "learning_rate": 3.4874283562732713e-06, + "loss": 0.6311, + "step": 19736 + }, + { + "epoch": 2.48, + "grad_norm": 21.91973304748535, + "learning_rate": 3.486591641216584e-06, + "loss": 0.7215, + "step": 19737 + }, + { + "epoch": 2.48, + "grad_norm": 2.609929084777832, + "learning_rate": 3.4857549261598964e-06, + "loss": 0.3477, + "step": 19738 + }, + { + "epoch": 2.48, + "grad_norm": 11.93908977508545, + "learning_rate": 3.4849182111032092e-06, + "loss": 0.4525, + "step": 19739 + }, + { + "epoch": 2.48, + "grad_norm": 8.637784004211426, + "learning_rate": 3.484081496046522e-06, + "loss": 0.5528, + "step": 19740 + }, + { + "epoch": 2.48, + "grad_norm": 9.774800300598145, + "learning_rate": 3.483244780989834e-06, + "loss": 0.6823, + "step": 19741 + }, + { + "epoch": 2.48, + "grad_norm": 16.305648803710938, + "learning_rate": 3.482408065933147e-06, + "loss": 2.189, + "step": 19742 + }, + { + "epoch": 2.48, + "grad_norm": 7.879077434539795, + "learning_rate": 3.481571350876459e-06, + "loss": 0.5587, + "step": 19743 + }, + { + "epoch": 2.48, + "grad_norm": 10.909440040588379, + "learning_rate": 3.480734635819772e-06, + "loss": 0.664, + "step": 19744 + }, + { + "epoch": 2.48, + "grad_norm": 10.326915740966797, + "learning_rate": 3.4798979207630844e-06, + "loss": 1.8753, + "step": 19745 + }, + { + "epoch": 2.48, + "grad_norm": 12.269172668457031, + "learning_rate": 3.479061205706397e-06, + "loss": 0.7914, + "step": 19746 + }, + { + "epoch": 2.48, + "grad_norm": 25.903446197509766, + "learning_rate": 3.47822449064971e-06, + "loss": 0.7501, + "step": 19747 + }, + { + "epoch": 2.48, + "grad_norm": 7.591154098510742, + "learning_rate": 3.477387775593022e-06, + "loss": 0.4238, + "step": 19748 + }, + { + "epoch": 2.48, + "grad_norm": 13.29786491394043, + "learning_rate": 3.4765510605363347e-06, + "loss": 0.5984, + "step": 19749 + }, + { + "epoch": 2.48, + "grad_norm": 4.9905242919921875, + "learning_rate": 3.475714345479647e-06, + "loss": 0.1857, + "step": 19750 + }, + { + "epoch": 2.48, + "grad_norm": 18.47494888305664, + "learning_rate": 3.47487763042296e-06, + "loss": 1.0523, + "step": 19751 + }, + { + "epoch": 2.48, + "grad_norm": 80.27205657958984, + "learning_rate": 3.4740409153662723e-06, + "loss": 2.1472, + "step": 19752 + }, + { + "epoch": 2.48, + "grad_norm": 19.939136505126953, + "learning_rate": 3.473204200309585e-06, + "loss": 0.6476, + "step": 19753 + }, + { + "epoch": 2.48, + "grad_norm": 15.560843467712402, + "learning_rate": 3.472367485252898e-06, + "loss": 0.425, + "step": 19754 + }, + { + "epoch": 2.48, + "grad_norm": 14.479058265686035, + "learning_rate": 3.47153077019621e-06, + "loss": 1.2733, + "step": 19755 + }, + { + "epoch": 2.48, + "grad_norm": 11.08471965789795, + "learning_rate": 3.4706940551395222e-06, + "loss": 0.4975, + "step": 19756 + }, + { + "epoch": 2.48, + "grad_norm": 18.777667999267578, + "learning_rate": 3.469857340082835e-06, + "loss": 0.1296, + "step": 19757 + }, + { + "epoch": 2.48, + "grad_norm": 44.677101135253906, + "learning_rate": 3.469020625026148e-06, + "loss": 0.9832, + "step": 19758 + }, + { + "epoch": 2.48, + "grad_norm": 5.612020492553711, + "learning_rate": 3.46818390996946e-06, + "loss": 0.4032, + "step": 19759 + }, + { + "epoch": 2.48, + "grad_norm": 24.597963333129883, + "learning_rate": 3.467347194912773e-06, + "loss": 2.2152, + "step": 19760 + }, + { + "epoch": 2.48, + "grad_norm": 11.08047866821289, + "learning_rate": 3.466510479856085e-06, + "loss": 1.2333, + "step": 19761 + }, + { + "epoch": 2.48, + "grad_norm": 6.552147388458252, + "learning_rate": 3.4656737647993978e-06, + "loss": 0.975, + "step": 19762 + }, + { + "epoch": 2.48, + "grad_norm": 10.885284423828125, + "learning_rate": 3.46483704974271e-06, + "loss": 0.6009, + "step": 19763 + }, + { + "epoch": 2.48, + "grad_norm": 8.18996810913086, + "learning_rate": 3.464000334686023e-06, + "loss": 0.4458, + "step": 19764 + }, + { + "epoch": 2.48, + "grad_norm": 15.223823547363281, + "learning_rate": 3.4631636196293357e-06, + "loss": 0.5961, + "step": 19765 + }, + { + "epoch": 2.48, + "grad_norm": 2.9410605430603027, + "learning_rate": 3.462326904572648e-06, + "loss": 0.2058, + "step": 19766 + }, + { + "epoch": 2.48, + "grad_norm": 62.11387252807617, + "learning_rate": 3.461490189515961e-06, + "loss": 1.2756, + "step": 19767 + }, + { + "epoch": 2.48, + "grad_norm": 23.682811737060547, + "learning_rate": 3.460653474459273e-06, + "loss": 1.4904, + "step": 19768 + }, + { + "epoch": 2.48, + "grad_norm": 19.57076072692871, + "learning_rate": 3.4598167594025857e-06, + "loss": 0.6619, + "step": 19769 + }, + { + "epoch": 2.48, + "grad_norm": 15.864212989807129, + "learning_rate": 3.458980044345898e-06, + "loss": 0.6604, + "step": 19770 + }, + { + "epoch": 2.48, + "grad_norm": 91.69075775146484, + "learning_rate": 3.458143329289211e-06, + "loss": 1.7616, + "step": 19771 + }, + { + "epoch": 2.48, + "grad_norm": 9.820259094238281, + "learning_rate": 3.4573066142325237e-06, + "loss": 0.5641, + "step": 19772 + }, + { + "epoch": 2.48, + "grad_norm": 31.864337921142578, + "learning_rate": 3.456469899175836e-06, + "loss": 0.957, + "step": 19773 + }, + { + "epoch": 2.48, + "grad_norm": 7.831910610198975, + "learning_rate": 3.455633184119149e-06, + "loss": 1.7723, + "step": 19774 + }, + { + "epoch": 2.48, + "grad_norm": 17.594562530517578, + "learning_rate": 3.4547964690624608e-06, + "loss": 0.9591, + "step": 19775 + }, + { + "epoch": 2.48, + "grad_norm": 8.951953887939453, + "learning_rate": 3.4539597540057736e-06, + "loss": 1.1267, + "step": 19776 + }, + { + "epoch": 2.48, + "grad_norm": 21.579511642456055, + "learning_rate": 3.453123038949086e-06, + "loss": 0.7413, + "step": 19777 + }, + { + "epoch": 2.48, + "grad_norm": 15.402226448059082, + "learning_rate": 3.4522863238923988e-06, + "loss": 0.555, + "step": 19778 + }, + { + "epoch": 2.48, + "grad_norm": 21.79979133605957, + "learning_rate": 3.451449608835711e-06, + "loss": 2.6856, + "step": 19779 + }, + { + "epoch": 2.48, + "grad_norm": 15.194982528686523, + "learning_rate": 3.450612893779024e-06, + "loss": 1.3308, + "step": 19780 + }, + { + "epoch": 2.48, + "grad_norm": 3.8116750717163086, + "learning_rate": 3.4497761787223368e-06, + "loss": 0.1193, + "step": 19781 + }, + { + "epoch": 2.48, + "grad_norm": 11.502127647399902, + "learning_rate": 3.4489394636656487e-06, + "loss": 0.5772, + "step": 19782 + }, + { + "epoch": 2.48, + "grad_norm": 27.45766258239746, + "learning_rate": 3.4481027486089615e-06, + "loss": 1.7305, + "step": 19783 + }, + { + "epoch": 2.48, + "grad_norm": 16.907424926757812, + "learning_rate": 3.447266033552274e-06, + "loss": 2.2742, + "step": 19784 + }, + { + "epoch": 2.48, + "grad_norm": 4.822242259979248, + "learning_rate": 3.4464293184955867e-06, + "loss": 0.5652, + "step": 19785 + }, + { + "epoch": 2.48, + "grad_norm": 20.32942771911621, + "learning_rate": 3.445592603438899e-06, + "loss": 0.4808, + "step": 19786 + }, + { + "epoch": 2.48, + "grad_norm": 8.529967308044434, + "learning_rate": 3.444755888382212e-06, + "loss": 0.1695, + "step": 19787 + }, + { + "epoch": 2.48, + "grad_norm": 16.412992477416992, + "learning_rate": 3.4439191733255247e-06, + "loss": 0.4608, + "step": 19788 + }, + { + "epoch": 2.48, + "grad_norm": 18.60361671447754, + "learning_rate": 3.4430824582688366e-06, + "loss": 1.1064, + "step": 19789 + }, + { + "epoch": 2.48, + "grad_norm": 18.82691192626953, + "learning_rate": 3.4422457432121494e-06, + "loss": 1.9421, + "step": 19790 + }, + { + "epoch": 2.48, + "grad_norm": 43.89506149291992, + "learning_rate": 3.441409028155462e-06, + "loss": 1.5197, + "step": 19791 + }, + { + "epoch": 2.48, + "grad_norm": 7.936665058135986, + "learning_rate": 3.4405723130987746e-06, + "loss": 0.7861, + "step": 19792 + }, + { + "epoch": 2.48, + "grad_norm": 18.61930274963379, + "learning_rate": 3.439735598042087e-06, + "loss": 1.1221, + "step": 19793 + }, + { + "epoch": 2.48, + "grad_norm": 5.85394811630249, + "learning_rate": 3.4388988829853998e-06, + "loss": 1.7637, + "step": 19794 + }, + { + "epoch": 2.48, + "grad_norm": 12.541993141174316, + "learning_rate": 3.4380621679287126e-06, + "loss": 0.9118, + "step": 19795 + }, + { + "epoch": 2.48, + "grad_norm": 27.030841827392578, + "learning_rate": 3.4372254528720245e-06, + "loss": 1.0295, + "step": 19796 + }, + { + "epoch": 2.48, + "grad_norm": 36.323028564453125, + "learning_rate": 3.436388737815337e-06, + "loss": 0.4287, + "step": 19797 + }, + { + "epoch": 2.48, + "grad_norm": 13.478815078735352, + "learning_rate": 3.4355520227586497e-06, + "loss": 0.7857, + "step": 19798 + }, + { + "epoch": 2.48, + "grad_norm": 4.058920860290527, + "learning_rate": 3.4347153077019625e-06, + "loss": 0.4471, + "step": 19799 + }, + { + "epoch": 2.48, + "grad_norm": 6.7582597732543945, + "learning_rate": 3.433878592645275e-06, + "loss": 0.1954, + "step": 19800 + }, + { + "epoch": 2.48, + "grad_norm": 37.303916931152344, + "learning_rate": 3.4330418775885877e-06, + "loss": 1.8653, + "step": 19801 + }, + { + "epoch": 2.49, + "grad_norm": 15.346481323242188, + "learning_rate": 3.4322051625319e-06, + "loss": 0.3085, + "step": 19802 + }, + { + "epoch": 2.49, + "grad_norm": 7.7009429931640625, + "learning_rate": 3.4313684474752125e-06, + "loss": 0.4781, + "step": 19803 + }, + { + "epoch": 2.49, + "grad_norm": 7.781106948852539, + "learning_rate": 3.430531732418525e-06, + "loss": 0.936, + "step": 19804 + }, + { + "epoch": 2.49, + "grad_norm": 21.661930084228516, + "learning_rate": 3.4296950173618376e-06, + "loss": 0.8793, + "step": 19805 + }, + { + "epoch": 2.49, + "grad_norm": 4.812716960906982, + "learning_rate": 3.4288583023051504e-06, + "loss": 0.518, + "step": 19806 + }, + { + "epoch": 2.49, + "grad_norm": 15.22900676727295, + "learning_rate": 3.428021587248463e-06, + "loss": 0.683, + "step": 19807 + }, + { + "epoch": 2.49, + "grad_norm": 61.01286697387695, + "learning_rate": 3.4271848721917756e-06, + "loss": 1.0509, + "step": 19808 + }, + { + "epoch": 2.49, + "grad_norm": 11.959772109985352, + "learning_rate": 3.426348157135088e-06, + "loss": 0.9398, + "step": 19809 + }, + { + "epoch": 2.49, + "grad_norm": 27.148923873901367, + "learning_rate": 3.4255114420784004e-06, + "loss": 1.8368, + "step": 19810 + }, + { + "epoch": 2.49, + "grad_norm": 9.010526657104492, + "learning_rate": 3.4246747270217128e-06, + "loss": 0.8156, + "step": 19811 + }, + { + "epoch": 2.49, + "grad_norm": 30.53544807434082, + "learning_rate": 3.4238380119650256e-06, + "loss": 0.9557, + "step": 19812 + }, + { + "epoch": 2.49, + "grad_norm": 24.31468391418457, + "learning_rate": 3.4230012969083384e-06, + "loss": 3.0352, + "step": 19813 + }, + { + "epoch": 2.49, + "grad_norm": 11.948583602905273, + "learning_rate": 3.4221645818516507e-06, + "loss": 0.784, + "step": 19814 + }, + { + "epoch": 2.49, + "grad_norm": 14.328639030456543, + "learning_rate": 3.4213278667949635e-06, + "loss": 0.307, + "step": 19815 + }, + { + "epoch": 2.49, + "grad_norm": 20.781282424926758, + "learning_rate": 3.420491151738276e-06, + "loss": 0.6423, + "step": 19816 + }, + { + "epoch": 2.49, + "grad_norm": 34.052772521972656, + "learning_rate": 3.4196544366815883e-06, + "loss": 2.9036, + "step": 19817 + }, + { + "epoch": 2.49, + "grad_norm": 10.930405616760254, + "learning_rate": 3.4188177216249007e-06, + "loss": 1.4673, + "step": 19818 + }, + { + "epoch": 2.49, + "grad_norm": 27.7197322845459, + "learning_rate": 3.4179810065682135e-06, + "loss": 0.7177, + "step": 19819 + }, + { + "epoch": 2.49, + "grad_norm": 3.597465753555298, + "learning_rate": 3.417144291511526e-06, + "loss": 0.1199, + "step": 19820 + }, + { + "epoch": 2.49, + "grad_norm": 18.610553741455078, + "learning_rate": 3.4163075764548387e-06, + "loss": 1.3404, + "step": 19821 + }, + { + "epoch": 2.49, + "grad_norm": 24.10991096496582, + "learning_rate": 3.4154708613981515e-06, + "loss": 1.0394, + "step": 19822 + }, + { + "epoch": 2.49, + "grad_norm": 7.0872931480407715, + "learning_rate": 3.414634146341464e-06, + "loss": 0.6687, + "step": 19823 + }, + { + "epoch": 2.49, + "grad_norm": 65.09789276123047, + "learning_rate": 3.4137974312847762e-06, + "loss": 3.3621, + "step": 19824 + }, + { + "epoch": 2.49, + "grad_norm": 28.820966720581055, + "learning_rate": 3.4129607162280886e-06, + "loss": 1.3284, + "step": 19825 + }, + { + "epoch": 2.49, + "grad_norm": 27.640705108642578, + "learning_rate": 3.4121240011714014e-06, + "loss": 1.0373, + "step": 19826 + }, + { + "epoch": 2.49, + "grad_norm": 21.718908309936523, + "learning_rate": 3.4112872861147138e-06, + "loss": 1.6755, + "step": 19827 + }, + { + "epoch": 2.49, + "grad_norm": 16.77260398864746, + "learning_rate": 3.4104505710580266e-06, + "loss": 0.7976, + "step": 19828 + }, + { + "epoch": 2.49, + "grad_norm": 13.465864181518555, + "learning_rate": 3.4096138560013394e-06, + "loss": 1.2967, + "step": 19829 + }, + { + "epoch": 2.49, + "grad_norm": 31.5263729095459, + "learning_rate": 3.4087771409446518e-06, + "loss": 1.9992, + "step": 19830 + }, + { + "epoch": 2.49, + "grad_norm": 10.719361305236816, + "learning_rate": 3.4079404258879637e-06, + "loss": 0.7871, + "step": 19831 + }, + { + "epoch": 2.49, + "grad_norm": 339.76422119140625, + "learning_rate": 3.4071037108312765e-06, + "loss": 0.8432, + "step": 19832 + }, + { + "epoch": 2.49, + "grad_norm": 15.445401191711426, + "learning_rate": 3.4062669957745893e-06, + "loss": 0.5135, + "step": 19833 + }, + { + "epoch": 2.49, + "grad_norm": 23.20089340209961, + "learning_rate": 3.4054302807179017e-06, + "loss": 1.6641, + "step": 19834 + }, + { + "epoch": 2.49, + "grad_norm": 10.486080169677734, + "learning_rate": 3.4045935656612145e-06, + "loss": 0.7401, + "step": 19835 + }, + { + "epoch": 2.49, + "grad_norm": 34.3616828918457, + "learning_rate": 3.4037568506045273e-06, + "loss": 1.6526, + "step": 19836 + }, + { + "epoch": 2.49, + "grad_norm": 10.02161693572998, + "learning_rate": 3.4029201355478397e-06, + "loss": 0.9273, + "step": 19837 + }, + { + "epoch": 2.49, + "grad_norm": 10.030625343322754, + "learning_rate": 3.4020834204911516e-06, + "loss": 0.452, + "step": 19838 + }, + { + "epoch": 2.49, + "grad_norm": 21.02669906616211, + "learning_rate": 3.4012467054344644e-06, + "loss": 0.3191, + "step": 19839 + }, + { + "epoch": 2.49, + "grad_norm": 6.957141876220703, + "learning_rate": 3.4004099903777772e-06, + "loss": 0.3309, + "step": 19840 + }, + { + "epoch": 2.49, + "grad_norm": 10.954840660095215, + "learning_rate": 3.3995732753210896e-06, + "loss": 0.6704, + "step": 19841 + }, + { + "epoch": 2.49, + "grad_norm": 9.489827156066895, + "learning_rate": 3.3987365602644024e-06, + "loss": 0.3872, + "step": 19842 + }, + { + "epoch": 2.49, + "grad_norm": 12.518203735351562, + "learning_rate": 3.397899845207715e-06, + "loss": 1.1024, + "step": 19843 + }, + { + "epoch": 2.49, + "grad_norm": 13.665872573852539, + "learning_rate": 3.3970631301510276e-06, + "loss": 0.9706, + "step": 19844 + }, + { + "epoch": 2.49, + "grad_norm": 10.394923210144043, + "learning_rate": 3.3962264150943395e-06, + "loss": 1.0683, + "step": 19845 + }, + { + "epoch": 2.49, + "grad_norm": 37.72017288208008, + "learning_rate": 3.3953897000376524e-06, + "loss": 2.3307, + "step": 19846 + }, + { + "epoch": 2.49, + "grad_norm": 27.23324966430664, + "learning_rate": 3.394552984980965e-06, + "loss": 0.8453, + "step": 19847 + }, + { + "epoch": 2.49, + "grad_norm": 9.878998756408691, + "learning_rate": 3.3937162699242775e-06, + "loss": 0.452, + "step": 19848 + }, + { + "epoch": 2.49, + "grad_norm": 37.411582946777344, + "learning_rate": 3.3928795548675903e-06, + "loss": 1.9002, + "step": 19849 + }, + { + "epoch": 2.49, + "grad_norm": 8.64536190032959, + "learning_rate": 3.3920428398109027e-06, + "loss": 0.7126, + "step": 19850 + }, + { + "epoch": 2.49, + "grad_norm": 32.167545318603516, + "learning_rate": 3.391206124754215e-06, + "loss": 2.47, + "step": 19851 + }, + { + "epoch": 2.49, + "grad_norm": 8.786133766174316, + "learning_rate": 3.3903694096975275e-06, + "loss": 0.5359, + "step": 19852 + }, + { + "epoch": 2.49, + "grad_norm": 11.467194557189941, + "learning_rate": 3.3895326946408403e-06, + "loss": 0.3455, + "step": 19853 + }, + { + "epoch": 2.49, + "grad_norm": 7.817058563232422, + "learning_rate": 3.388695979584153e-06, + "loss": 0.3058, + "step": 19854 + }, + { + "epoch": 2.49, + "grad_norm": 17.731460571289062, + "learning_rate": 3.3878592645274654e-06, + "loss": 0.9054, + "step": 19855 + }, + { + "epoch": 2.49, + "grad_norm": 26.260496139526367, + "learning_rate": 3.3870225494707783e-06, + "loss": 1.6367, + "step": 19856 + }, + { + "epoch": 2.49, + "grad_norm": 18.765504837036133, + "learning_rate": 3.3861858344140906e-06, + "loss": 0.3956, + "step": 19857 + }, + { + "epoch": 2.49, + "grad_norm": 8.115496635437012, + "learning_rate": 3.385349119357403e-06, + "loss": 0.9168, + "step": 19858 + }, + { + "epoch": 2.49, + "grad_norm": 6.180880546569824, + "learning_rate": 3.3845124043007154e-06, + "loss": 0.5112, + "step": 19859 + }, + { + "epoch": 2.49, + "grad_norm": 11.935192108154297, + "learning_rate": 3.383675689244028e-06, + "loss": 0.4514, + "step": 19860 + }, + { + "epoch": 2.49, + "grad_norm": 21.07114028930664, + "learning_rate": 3.3828389741873406e-06, + "loss": 1.3732, + "step": 19861 + }, + { + "epoch": 2.49, + "grad_norm": 33.54058837890625, + "learning_rate": 3.3820022591306534e-06, + "loss": 1.4688, + "step": 19862 + }, + { + "epoch": 2.49, + "grad_norm": 13.222314834594727, + "learning_rate": 3.381165544073966e-06, + "loss": 0.6227, + "step": 19863 + }, + { + "epoch": 2.49, + "grad_norm": 36.876625061035156, + "learning_rate": 3.3803288290172785e-06, + "loss": 2.2184, + "step": 19864 + }, + { + "epoch": 2.49, + "grad_norm": 7.127270221710205, + "learning_rate": 3.379492113960591e-06, + "loss": 0.4504, + "step": 19865 + }, + { + "epoch": 2.49, + "grad_norm": 17.18406867980957, + "learning_rate": 3.3786553989039033e-06, + "loss": 1.1996, + "step": 19866 + }, + { + "epoch": 2.49, + "grad_norm": 17.484281539916992, + "learning_rate": 3.377818683847216e-06, + "loss": 0.9784, + "step": 19867 + }, + { + "epoch": 2.49, + "grad_norm": 19.296897888183594, + "learning_rate": 3.3769819687905285e-06, + "loss": 1.0986, + "step": 19868 + }, + { + "epoch": 2.49, + "grad_norm": 341.8594970703125, + "learning_rate": 3.3761452537338413e-06, + "loss": 1.4915, + "step": 19869 + }, + { + "epoch": 2.49, + "grad_norm": 6.280468463897705, + "learning_rate": 3.375308538677154e-06, + "loss": 0.8342, + "step": 19870 + }, + { + "epoch": 2.49, + "grad_norm": 155.24281311035156, + "learning_rate": 3.3744718236204665e-06, + "loss": 0.5996, + "step": 19871 + }, + { + "epoch": 2.49, + "grad_norm": 20.076494216918945, + "learning_rate": 3.3736351085637784e-06, + "loss": 1.6459, + "step": 19872 + }, + { + "epoch": 2.49, + "grad_norm": 6.135019779205322, + "learning_rate": 3.3727983935070912e-06, + "loss": 0.4506, + "step": 19873 + }, + { + "epoch": 2.49, + "grad_norm": 12.224918365478516, + "learning_rate": 3.371961678450404e-06, + "loss": 0.916, + "step": 19874 + }, + { + "epoch": 2.49, + "grad_norm": 27.757692337036133, + "learning_rate": 3.3711249633937164e-06, + "loss": 1.8756, + "step": 19875 + }, + { + "epoch": 2.49, + "grad_norm": 10.30232048034668, + "learning_rate": 3.370288248337029e-06, + "loss": 0.4556, + "step": 19876 + }, + { + "epoch": 2.49, + "grad_norm": 11.233203887939453, + "learning_rate": 3.369451533280342e-06, + "loss": 1.4636, + "step": 19877 + }, + { + "epoch": 2.49, + "grad_norm": 5.675137519836426, + "learning_rate": 3.3686148182236544e-06, + "loss": 1.0059, + "step": 19878 + }, + { + "epoch": 2.49, + "grad_norm": 47.12982177734375, + "learning_rate": 3.3677781031669663e-06, + "loss": 1.6327, + "step": 19879 + }, + { + "epoch": 2.49, + "grad_norm": 197.06178283691406, + "learning_rate": 3.366941388110279e-06, + "loss": 1.3262, + "step": 19880 + }, + { + "epoch": 2.5, + "grad_norm": 20.33978271484375, + "learning_rate": 3.366104673053592e-06, + "loss": 1.2696, + "step": 19881 + }, + { + "epoch": 2.5, + "grad_norm": 15.062004089355469, + "learning_rate": 3.3652679579969043e-06, + "loss": 1.9937, + "step": 19882 + }, + { + "epoch": 2.5, + "grad_norm": 9.023582458496094, + "learning_rate": 3.364431242940217e-06, + "loss": 0.3869, + "step": 19883 + }, + { + "epoch": 2.5, + "grad_norm": 27.49932098388672, + "learning_rate": 3.3635945278835295e-06, + "loss": 1.3255, + "step": 19884 + }, + { + "epoch": 2.5, + "grad_norm": 6.58065128326416, + "learning_rate": 3.3627578128268423e-06, + "loss": 0.5256, + "step": 19885 + }, + { + "epoch": 2.5, + "grad_norm": 10.925614356994629, + "learning_rate": 3.3619210977701543e-06, + "loss": 0.7766, + "step": 19886 + }, + { + "epoch": 2.5, + "grad_norm": 7.92704963684082, + "learning_rate": 3.361084382713467e-06, + "loss": 0.3684, + "step": 19887 + }, + { + "epoch": 2.5, + "grad_norm": 7.970696926116943, + "learning_rate": 3.36024766765678e-06, + "loss": 1.202, + "step": 19888 + }, + { + "epoch": 2.5, + "grad_norm": 73.66069793701172, + "learning_rate": 3.3594109526000922e-06, + "loss": 1.3298, + "step": 19889 + }, + { + "epoch": 2.5, + "grad_norm": 19.252840042114258, + "learning_rate": 3.358574237543405e-06, + "loss": 1.8018, + "step": 19890 + }, + { + "epoch": 2.5, + "grad_norm": 12.6245756149292, + "learning_rate": 3.3577375224867174e-06, + "loss": 1.8099, + "step": 19891 + }, + { + "epoch": 2.5, + "grad_norm": 19.752887725830078, + "learning_rate": 3.3569008074300302e-06, + "loss": 0.9063, + "step": 19892 + }, + { + "epoch": 2.5, + "grad_norm": 47.04121398925781, + "learning_rate": 3.356064092373342e-06, + "loss": 1.9994, + "step": 19893 + }, + { + "epoch": 2.5, + "grad_norm": 20.247621536254883, + "learning_rate": 3.355227377316655e-06, + "loss": 2.1466, + "step": 19894 + }, + { + "epoch": 2.5, + "grad_norm": 5.817601203918457, + "learning_rate": 3.3543906622599674e-06, + "loss": 0.2317, + "step": 19895 + }, + { + "epoch": 2.5, + "grad_norm": 12.793375968933105, + "learning_rate": 3.35355394720328e-06, + "loss": 1.6354, + "step": 19896 + }, + { + "epoch": 2.5, + "grad_norm": 25.820037841796875, + "learning_rate": 3.352717232146593e-06, + "loss": 1.1934, + "step": 19897 + }, + { + "epoch": 2.5, + "grad_norm": 25.317333221435547, + "learning_rate": 3.3518805170899053e-06, + "loss": 1.4453, + "step": 19898 + }, + { + "epoch": 2.5, + "grad_norm": 12.54918384552002, + "learning_rate": 3.351043802033218e-06, + "loss": 1.9853, + "step": 19899 + }, + { + "epoch": 2.5, + "grad_norm": 10.937554359436035, + "learning_rate": 3.35020708697653e-06, + "loss": 0.5546, + "step": 19900 + }, + { + "epoch": 2.5, + "grad_norm": 11.002120018005371, + "learning_rate": 3.349370371919843e-06, + "loss": 0.548, + "step": 19901 + }, + { + "epoch": 2.5, + "grad_norm": 28.960004806518555, + "learning_rate": 3.3485336568631553e-06, + "loss": 1.6595, + "step": 19902 + }, + { + "epoch": 2.5, + "grad_norm": 8.909430503845215, + "learning_rate": 3.347696941806468e-06, + "loss": 0.4689, + "step": 19903 + }, + { + "epoch": 2.5, + "grad_norm": 7.743149757385254, + "learning_rate": 3.346860226749781e-06, + "loss": 0.4208, + "step": 19904 + }, + { + "epoch": 2.5, + "grad_norm": 48.501033782958984, + "learning_rate": 3.3460235116930933e-06, + "loss": 1.1391, + "step": 19905 + }, + { + "epoch": 2.5, + "grad_norm": 7.790011405944824, + "learning_rate": 3.345186796636406e-06, + "loss": 1.8365, + "step": 19906 + }, + { + "epoch": 2.5, + "grad_norm": 10.871684074401855, + "learning_rate": 3.344350081579718e-06, + "loss": 0.418, + "step": 19907 + }, + { + "epoch": 2.5, + "grad_norm": 42.4021110534668, + "learning_rate": 3.343513366523031e-06, + "loss": 0.9568, + "step": 19908 + }, + { + "epoch": 2.5, + "grad_norm": 8.166342735290527, + "learning_rate": 3.342676651466343e-06, + "loss": 0.4243, + "step": 19909 + }, + { + "epoch": 2.5, + "grad_norm": 27.085187911987305, + "learning_rate": 3.341839936409656e-06, + "loss": 1.7439, + "step": 19910 + }, + { + "epoch": 2.5, + "grad_norm": 11.258872032165527, + "learning_rate": 3.341003221352969e-06, + "loss": 1.4339, + "step": 19911 + }, + { + "epoch": 2.5, + "grad_norm": 13.332852363586426, + "learning_rate": 3.340166506296281e-06, + "loss": 0.8385, + "step": 19912 + }, + { + "epoch": 2.5, + "grad_norm": 3.6175317764282227, + "learning_rate": 3.339329791239594e-06, + "loss": 0.3455, + "step": 19913 + }, + { + "epoch": 2.5, + "grad_norm": 8.716652870178223, + "learning_rate": 3.338493076182906e-06, + "loss": 0.758, + "step": 19914 + }, + { + "epoch": 2.5, + "grad_norm": 40.15550231933594, + "learning_rate": 3.3376563611262187e-06, + "loss": 2.2447, + "step": 19915 + }, + { + "epoch": 2.5, + "grad_norm": 7.4539794921875, + "learning_rate": 3.336819646069531e-06, + "loss": 1.1303, + "step": 19916 + }, + { + "epoch": 2.5, + "grad_norm": 15.161958694458008, + "learning_rate": 3.335982931012844e-06, + "loss": 1.203, + "step": 19917 + }, + { + "epoch": 2.5, + "grad_norm": 2.728916645050049, + "learning_rate": 3.3351462159561567e-06, + "loss": 0.0629, + "step": 19918 + }, + { + "epoch": 2.5, + "grad_norm": 12.251533508300781, + "learning_rate": 3.334309500899469e-06, + "loss": 0.5393, + "step": 19919 + }, + { + "epoch": 2.5, + "grad_norm": 17.577667236328125, + "learning_rate": 3.333472785842782e-06, + "loss": 0.5843, + "step": 19920 + }, + { + "epoch": 2.5, + "grad_norm": 30.012460708618164, + "learning_rate": 3.332636070786094e-06, + "loss": 1.8977, + "step": 19921 + }, + { + "epoch": 2.5, + "grad_norm": 6.592525005340576, + "learning_rate": 3.3317993557294067e-06, + "loss": 0.1557, + "step": 19922 + }, + { + "epoch": 2.5, + "grad_norm": 9.875282287597656, + "learning_rate": 3.330962640672719e-06, + "loss": 0.7951, + "step": 19923 + }, + { + "epoch": 2.5, + "grad_norm": 17.681758880615234, + "learning_rate": 3.330125925616032e-06, + "loss": 0.4883, + "step": 19924 + }, + { + "epoch": 2.5, + "grad_norm": 3.01639723777771, + "learning_rate": 3.329289210559344e-06, + "loss": 0.111, + "step": 19925 + }, + { + "epoch": 2.5, + "grad_norm": 21.93477439880371, + "learning_rate": 3.328452495502657e-06, + "loss": 0.7063, + "step": 19926 + }, + { + "epoch": 2.5, + "grad_norm": 8.804841995239258, + "learning_rate": 3.32761578044597e-06, + "loss": 0.7265, + "step": 19927 + }, + { + "epoch": 2.5, + "grad_norm": 19.41964340209961, + "learning_rate": 3.3267790653892818e-06, + "loss": 1.9393, + "step": 19928 + }, + { + "epoch": 2.5, + "grad_norm": 7.241828918457031, + "learning_rate": 3.3259423503325946e-06, + "loss": 0.6794, + "step": 19929 + }, + { + "epoch": 2.5, + "grad_norm": 5.395716190338135, + "learning_rate": 3.325105635275907e-06, + "loss": 0.0924, + "step": 19930 + }, + { + "epoch": 2.5, + "grad_norm": 13.952709197998047, + "learning_rate": 3.3242689202192198e-06, + "loss": 1.1084, + "step": 19931 + }, + { + "epoch": 2.5, + "grad_norm": 9.51970100402832, + "learning_rate": 3.323432205162532e-06, + "loss": 0.6472, + "step": 19932 + }, + { + "epoch": 2.5, + "grad_norm": 26.60560417175293, + "learning_rate": 3.322595490105845e-06, + "loss": 1.0385, + "step": 19933 + }, + { + "epoch": 2.5, + "grad_norm": 22.788707733154297, + "learning_rate": 3.3217587750491577e-06, + "loss": 0.7261, + "step": 19934 + }, + { + "epoch": 2.5, + "grad_norm": 17.438146591186523, + "learning_rate": 3.3209220599924697e-06, + "loss": 1.7893, + "step": 19935 + }, + { + "epoch": 2.5, + "grad_norm": 3.608642578125, + "learning_rate": 3.320085344935782e-06, + "loss": 0.5657, + "step": 19936 + }, + { + "epoch": 2.5, + "grad_norm": 26.571809768676758, + "learning_rate": 3.319248629879095e-06, + "loss": 1.0257, + "step": 19937 + }, + { + "epoch": 2.5, + "grad_norm": 14.450227737426758, + "learning_rate": 3.3184119148224077e-06, + "loss": 1.7862, + "step": 19938 + }, + { + "epoch": 2.5, + "grad_norm": 17.081966400146484, + "learning_rate": 3.31757519976572e-06, + "loss": 1.4273, + "step": 19939 + }, + { + "epoch": 2.5, + "grad_norm": 9.602093696594238, + "learning_rate": 3.316738484709033e-06, + "loss": 0.7716, + "step": 19940 + }, + { + "epoch": 2.5, + "grad_norm": 15.299277305603027, + "learning_rate": 3.315901769652345e-06, + "loss": 0.7906, + "step": 19941 + }, + { + "epoch": 2.5, + "grad_norm": 2.3168649673461914, + "learning_rate": 3.3150650545956576e-06, + "loss": 0.1147, + "step": 19942 + }, + { + "epoch": 2.5, + "grad_norm": 10.020484924316406, + "learning_rate": 3.31422833953897e-06, + "loss": 0.5444, + "step": 19943 + }, + { + "epoch": 2.5, + "grad_norm": 8.818445205688477, + "learning_rate": 3.3133916244822828e-06, + "loss": 0.7655, + "step": 19944 + }, + { + "epoch": 2.5, + "grad_norm": 49.77157974243164, + "learning_rate": 3.3125549094255956e-06, + "loss": 2.2813, + "step": 19945 + }, + { + "epoch": 2.5, + "grad_norm": 18.44257164001465, + "learning_rate": 3.311718194368908e-06, + "loss": 0.9881, + "step": 19946 + }, + { + "epoch": 2.5, + "grad_norm": 75.80842590332031, + "learning_rate": 3.3108814793122208e-06, + "loss": 5.0154, + "step": 19947 + }, + { + "epoch": 2.5, + "grad_norm": 60.95927429199219, + "learning_rate": 3.3100447642555327e-06, + "loss": 0.6309, + "step": 19948 + }, + { + "epoch": 2.5, + "grad_norm": 17.889598846435547, + "learning_rate": 3.3092080491988455e-06, + "loss": 1.1538, + "step": 19949 + }, + { + "epoch": 2.5, + "grad_norm": 5.751458168029785, + "learning_rate": 3.308371334142158e-06, + "loss": 0.3345, + "step": 19950 + }, + { + "epoch": 2.5, + "grad_norm": 26.583431243896484, + "learning_rate": 3.3075346190854707e-06, + "loss": 0.6315, + "step": 19951 + }, + { + "epoch": 2.5, + "grad_norm": 20.479509353637695, + "learning_rate": 3.3066979040287835e-06, + "loss": 3.3615, + "step": 19952 + }, + { + "epoch": 2.5, + "grad_norm": 4.811371326446533, + "learning_rate": 3.305861188972096e-06, + "loss": 0.4642, + "step": 19953 + }, + { + "epoch": 2.5, + "grad_norm": 8.042828559875488, + "learning_rate": 3.3050244739154087e-06, + "loss": 0.9753, + "step": 19954 + }, + { + "epoch": 2.5, + "grad_norm": 14.338342666625977, + "learning_rate": 3.3041877588587206e-06, + "loss": 1.716, + "step": 19955 + }, + { + "epoch": 2.5, + "grad_norm": 9.702662467956543, + "learning_rate": 3.3033510438020334e-06, + "loss": 0.5687, + "step": 19956 + }, + { + "epoch": 2.5, + "grad_norm": 6.313868522644043, + "learning_rate": 3.302514328745346e-06, + "loss": 0.5918, + "step": 19957 + }, + { + "epoch": 2.5, + "grad_norm": 4.752679824829102, + "learning_rate": 3.3016776136886586e-06, + "loss": 0.6654, + "step": 19958 + }, + { + "epoch": 2.5, + "grad_norm": 68.26758575439453, + "learning_rate": 3.300840898631971e-06, + "loss": 1.3103, + "step": 19959 + }, + { + "epoch": 2.5, + "grad_norm": 37.147972106933594, + "learning_rate": 3.300004183575284e-06, + "loss": 1.3215, + "step": 19960 + }, + { + "epoch": 2.51, + "grad_norm": 4.803019046783447, + "learning_rate": 3.2991674685185966e-06, + "loss": 0.1563, + "step": 19961 + }, + { + "epoch": 2.51, + "grad_norm": 7.469912052154541, + "learning_rate": 3.2983307534619086e-06, + "loss": 0.104, + "step": 19962 + }, + { + "epoch": 2.51, + "grad_norm": 18.475568771362305, + "learning_rate": 3.2974940384052214e-06, + "loss": 0.9115, + "step": 19963 + }, + { + "epoch": 2.51, + "grad_norm": 7.950811386108398, + "learning_rate": 3.2966573233485337e-06, + "loss": 0.4655, + "step": 19964 + }, + { + "epoch": 2.51, + "grad_norm": 3.4812703132629395, + "learning_rate": 3.2958206082918465e-06, + "loss": 0.1384, + "step": 19965 + }, + { + "epoch": 2.51, + "grad_norm": 3.8312828540802, + "learning_rate": 3.294983893235159e-06, + "loss": 0.4218, + "step": 19966 + }, + { + "epoch": 2.51, + "grad_norm": 19.539031982421875, + "learning_rate": 3.2941471781784717e-06, + "loss": 0.4395, + "step": 19967 + }, + { + "epoch": 2.51, + "grad_norm": 12.390055656433105, + "learning_rate": 3.2933104631217845e-06, + "loss": 1.0971, + "step": 19968 + }, + { + "epoch": 2.51, + "grad_norm": 11.946459770202637, + "learning_rate": 3.2924737480650965e-06, + "loss": 0.2893, + "step": 19969 + }, + { + "epoch": 2.51, + "grad_norm": 5.550558567047119, + "learning_rate": 3.2916370330084093e-06, + "loss": 0.2716, + "step": 19970 + }, + { + "epoch": 2.51, + "grad_norm": 22.58316993713379, + "learning_rate": 3.2908003179517217e-06, + "loss": 1.5036, + "step": 19971 + }, + { + "epoch": 2.51, + "grad_norm": 39.19320297241211, + "learning_rate": 3.2899636028950345e-06, + "loss": 1.392, + "step": 19972 + }, + { + "epoch": 2.51, + "grad_norm": 11.933780670166016, + "learning_rate": 3.289126887838347e-06, + "loss": 0.567, + "step": 19973 + }, + { + "epoch": 2.51, + "grad_norm": 2.2914276123046875, + "learning_rate": 3.2882901727816596e-06, + "loss": 0.1324, + "step": 19974 + }, + { + "epoch": 2.51, + "grad_norm": 77.90569305419922, + "learning_rate": 3.2874534577249724e-06, + "loss": 1.653, + "step": 19975 + }, + { + "epoch": 2.51, + "grad_norm": 9.612706184387207, + "learning_rate": 3.2866167426682844e-06, + "loss": 0.4752, + "step": 19976 + }, + { + "epoch": 2.51, + "grad_norm": 16.720169067382812, + "learning_rate": 3.2857800276115968e-06, + "loss": 2.4077, + "step": 19977 + }, + { + "epoch": 2.51, + "grad_norm": 18.49120330810547, + "learning_rate": 3.2849433125549096e-06, + "loss": 0.6799, + "step": 19978 + }, + { + "epoch": 2.51, + "grad_norm": 22.447124481201172, + "learning_rate": 3.2841065974982224e-06, + "loss": 1.8789, + "step": 19979 + }, + { + "epoch": 2.51, + "grad_norm": 6.493262767791748, + "learning_rate": 3.2832698824415348e-06, + "loss": 0.4613, + "step": 19980 + }, + { + "epoch": 2.51, + "grad_norm": 57.3089599609375, + "learning_rate": 3.2824331673848476e-06, + "loss": 1.4242, + "step": 19981 + }, + { + "epoch": 2.51, + "grad_norm": 2.4070045948028564, + "learning_rate": 3.2815964523281604e-06, + "loss": 0.17, + "step": 19982 + }, + { + "epoch": 2.51, + "grad_norm": 10.130541801452637, + "learning_rate": 3.2807597372714723e-06, + "loss": 0.6856, + "step": 19983 + }, + { + "epoch": 2.51, + "grad_norm": 21.3435115814209, + "learning_rate": 3.2799230222147847e-06, + "loss": 1.7294, + "step": 19984 + }, + { + "epoch": 2.51, + "grad_norm": 16.479320526123047, + "learning_rate": 3.2790863071580975e-06, + "loss": 1.3781, + "step": 19985 + }, + { + "epoch": 2.51, + "grad_norm": 17.575529098510742, + "learning_rate": 3.2782495921014103e-06, + "loss": 1.0384, + "step": 19986 + }, + { + "epoch": 2.51, + "grad_norm": 48.52682113647461, + "learning_rate": 3.2774128770447227e-06, + "loss": 2.9254, + "step": 19987 + }, + { + "epoch": 2.51, + "grad_norm": 15.873946189880371, + "learning_rate": 3.2765761619880355e-06, + "loss": 1.6611, + "step": 19988 + }, + { + "epoch": 2.51, + "grad_norm": 15.20587158203125, + "learning_rate": 3.275739446931348e-06, + "loss": 0.3941, + "step": 19989 + }, + { + "epoch": 2.51, + "grad_norm": 7.7490363121032715, + "learning_rate": 3.2749027318746602e-06, + "loss": 0.4194, + "step": 19990 + }, + { + "epoch": 2.51, + "grad_norm": 18.263975143432617, + "learning_rate": 3.2740660168179726e-06, + "loss": 0.7866, + "step": 19991 + }, + { + "epoch": 2.51, + "grad_norm": 17.40163230895996, + "learning_rate": 3.2732293017612854e-06, + "loss": 1.5032, + "step": 19992 + }, + { + "epoch": 2.51, + "grad_norm": 21.275117874145508, + "learning_rate": 3.2723925867045982e-06, + "loss": 1.906, + "step": 19993 + }, + { + "epoch": 2.51, + "grad_norm": 12.57906723022461, + "learning_rate": 3.2715558716479106e-06, + "loss": 0.7331, + "step": 19994 + }, + { + "epoch": 2.51, + "grad_norm": 12.67908000946045, + "learning_rate": 3.2707191565912234e-06, + "loss": 0.4227, + "step": 19995 + }, + { + "epoch": 2.51, + "grad_norm": 10.568775177001953, + "learning_rate": 3.2698824415345358e-06, + "loss": 0.8041, + "step": 19996 + }, + { + "epoch": 2.51, + "grad_norm": 6.025681495666504, + "learning_rate": 3.269045726477848e-06, + "loss": 0.1539, + "step": 19997 + }, + { + "epoch": 2.51, + "grad_norm": 13.698433876037598, + "learning_rate": 3.2682090114211605e-06, + "loss": 1.3535, + "step": 19998 + }, + { + "epoch": 2.51, + "grad_norm": 2.734616279602051, + "learning_rate": 3.2673722963644733e-06, + "loss": 0.1233, + "step": 19999 + }, + { + "epoch": 2.51, + "grad_norm": 25.240158081054688, + "learning_rate": 3.2665355813077857e-06, + "loss": 1.041, + "step": 20000 + }, + { + "epoch": 2.51, + "eval_loss": 0.0808543786406517, + "eval_runtime": 95.0651, + "eval_samples_per_second": 37.259, + "eval_steps_per_second": 37.259, + "step": 20000 + }, + { + "epoch": 2.51, + "grad_norm": 39.454795837402344, + "learning_rate": 3.2656988662510985e-06, + "loss": 1.9426, + "step": 20001 + }, + { + "epoch": 2.51, + "grad_norm": 12.454354286193848, + "learning_rate": 3.2648621511944113e-06, + "loss": 0.2503, + "step": 20002 + }, + { + "epoch": 2.51, + "grad_norm": 10.543244361877441, + "learning_rate": 3.2640254361377237e-06, + "loss": 0.2657, + "step": 20003 + }, + { + "epoch": 2.51, + "grad_norm": 75.90203094482422, + "learning_rate": 3.263188721081036e-06, + "loss": 0.8773, + "step": 20004 + }, + { + "epoch": 2.51, + "grad_norm": 7.747319221496582, + "learning_rate": 3.2623520060243484e-06, + "loss": 0.5037, + "step": 20005 + }, + { + "epoch": 2.51, + "grad_norm": 8.718236923217773, + "learning_rate": 3.2615152909676613e-06, + "loss": 0.3986, + "step": 20006 + }, + { + "epoch": 2.51, + "grad_norm": 8.183634757995605, + "learning_rate": 3.2606785759109736e-06, + "loss": 1.8096, + "step": 20007 + }, + { + "epoch": 2.51, + "grad_norm": 27.24098014831543, + "learning_rate": 3.2598418608542864e-06, + "loss": 0.724, + "step": 20008 + }, + { + "epoch": 2.51, + "grad_norm": 10.727471351623535, + "learning_rate": 3.2590051457975992e-06, + "loss": 0.8958, + "step": 20009 + }, + { + "epoch": 2.51, + "grad_norm": 21.750059127807617, + "learning_rate": 3.2581684307409116e-06, + "loss": 2.5499, + "step": 20010 + }, + { + "epoch": 2.51, + "grad_norm": 5.738742351531982, + "learning_rate": 3.257331715684224e-06, + "loss": 1.2078, + "step": 20011 + }, + { + "epoch": 2.51, + "grad_norm": 30.94146728515625, + "learning_rate": 3.2564950006275364e-06, + "loss": 0.4637, + "step": 20012 + }, + { + "epoch": 2.51, + "grad_norm": 10.234822273254395, + "learning_rate": 3.255658285570849e-06, + "loss": 0.6016, + "step": 20013 + }, + { + "epoch": 2.51, + "grad_norm": 20.73149871826172, + "learning_rate": 3.2548215705141615e-06, + "loss": 1.7852, + "step": 20014 + }, + { + "epoch": 2.51, + "grad_norm": 2.1227056980133057, + "learning_rate": 3.2539848554574743e-06, + "loss": 0.1172, + "step": 20015 + }, + { + "epoch": 2.51, + "grad_norm": 26.35969352722168, + "learning_rate": 3.253148140400787e-06, + "loss": 0.9049, + "step": 20016 + }, + { + "epoch": 2.51, + "grad_norm": 10.881176948547363, + "learning_rate": 3.2523114253440995e-06, + "loss": 0.6887, + "step": 20017 + }, + { + "epoch": 2.51, + "grad_norm": 15.723998069763184, + "learning_rate": 3.2514747102874115e-06, + "loss": 0.8719, + "step": 20018 + }, + { + "epoch": 2.51, + "grad_norm": 14.749188423156738, + "learning_rate": 3.2506379952307243e-06, + "loss": 0.9361, + "step": 20019 + }, + { + "epoch": 2.51, + "grad_norm": 20.851654052734375, + "learning_rate": 3.249801280174037e-06, + "loss": 2.1951, + "step": 20020 + }, + { + "epoch": 2.51, + "grad_norm": 14.810416221618652, + "learning_rate": 3.2489645651173495e-06, + "loss": 0.7405, + "step": 20021 + }, + { + "epoch": 2.51, + "grad_norm": 11.574252128601074, + "learning_rate": 3.2481278500606623e-06, + "loss": 0.6396, + "step": 20022 + }, + { + "epoch": 2.51, + "grad_norm": 36.250186920166016, + "learning_rate": 3.247291135003975e-06, + "loss": 0.5705, + "step": 20023 + }, + { + "epoch": 2.51, + "grad_norm": 12.263786315917969, + "learning_rate": 3.2464544199472874e-06, + "loss": 0.3928, + "step": 20024 + }, + { + "epoch": 2.51, + "grad_norm": 25.44635009765625, + "learning_rate": 3.2456177048905994e-06, + "loss": 0.8051, + "step": 20025 + }, + { + "epoch": 2.51, + "grad_norm": 7.821044921875, + "learning_rate": 3.244780989833912e-06, + "loss": 0.604, + "step": 20026 + }, + { + "epoch": 2.51, + "grad_norm": 19.18406105041504, + "learning_rate": 3.243944274777225e-06, + "loss": 0.2634, + "step": 20027 + }, + { + "epoch": 2.51, + "grad_norm": 26.439741134643555, + "learning_rate": 3.2431075597205374e-06, + "loss": 1.8381, + "step": 20028 + }, + { + "epoch": 2.51, + "grad_norm": 11.98216724395752, + "learning_rate": 3.24227084466385e-06, + "loss": 1.3557, + "step": 20029 + }, + { + "epoch": 2.51, + "grad_norm": 11.501047134399414, + "learning_rate": 3.2414341296071626e-06, + "loss": 0.9794, + "step": 20030 + }, + { + "epoch": 2.51, + "grad_norm": 11.660356521606445, + "learning_rate": 3.240597414550475e-06, + "loss": 0.6197, + "step": 20031 + }, + { + "epoch": 2.51, + "grad_norm": 5.886253833770752, + "learning_rate": 3.2397606994937873e-06, + "loss": 0.3043, + "step": 20032 + }, + { + "epoch": 2.51, + "grad_norm": 40.572940826416016, + "learning_rate": 3.2389239844371e-06, + "loss": 0.3478, + "step": 20033 + }, + { + "epoch": 2.51, + "grad_norm": 10.604790687561035, + "learning_rate": 3.238087269380413e-06, + "loss": 0.3787, + "step": 20034 + }, + { + "epoch": 2.51, + "grad_norm": 18.762178421020508, + "learning_rate": 3.2372505543237253e-06, + "loss": 0.559, + "step": 20035 + }, + { + "epoch": 2.51, + "grad_norm": 3.1380579471588135, + "learning_rate": 3.236413839267038e-06, + "loss": 0.2146, + "step": 20036 + }, + { + "epoch": 2.51, + "grad_norm": 12.402771949768066, + "learning_rate": 3.2355771242103505e-06, + "loss": 1.3115, + "step": 20037 + }, + { + "epoch": 2.51, + "grad_norm": 16.927108764648438, + "learning_rate": 3.234740409153663e-06, + "loss": 0.6548, + "step": 20038 + }, + { + "epoch": 2.51, + "grad_norm": 24.82040786743164, + "learning_rate": 3.2339036940969752e-06, + "loss": 0.9741, + "step": 20039 + }, + { + "epoch": 2.51, + "grad_norm": 18.516328811645508, + "learning_rate": 3.233066979040288e-06, + "loss": 1.3821, + "step": 20040 + }, + { + "epoch": 2.52, + "grad_norm": 17.94833755493164, + "learning_rate": 3.2322302639836004e-06, + "loss": 0.525, + "step": 20041 + }, + { + "epoch": 2.52, + "grad_norm": 10.300509452819824, + "learning_rate": 3.2313935489269132e-06, + "loss": 1.9777, + "step": 20042 + }, + { + "epoch": 2.52, + "grad_norm": 25.85494613647461, + "learning_rate": 3.230556833870226e-06, + "loss": 1.0825, + "step": 20043 + }, + { + "epoch": 2.52, + "grad_norm": 16.04183578491211, + "learning_rate": 3.2297201188135384e-06, + "loss": 1.3378, + "step": 20044 + }, + { + "epoch": 2.52, + "grad_norm": 3.046987771987915, + "learning_rate": 3.2288834037568508e-06, + "loss": 0.0908, + "step": 20045 + }, + { + "epoch": 2.52, + "grad_norm": 17.787328720092773, + "learning_rate": 3.228046688700163e-06, + "loss": 1.0892, + "step": 20046 + }, + { + "epoch": 2.52, + "grad_norm": 114.71671295166016, + "learning_rate": 3.227209973643476e-06, + "loss": 1.1975, + "step": 20047 + }, + { + "epoch": 2.52, + "grad_norm": 48.55499267578125, + "learning_rate": 3.2263732585867883e-06, + "loss": 1.4782, + "step": 20048 + }, + { + "epoch": 2.52, + "grad_norm": 1.1245615482330322, + "learning_rate": 3.225536543530101e-06, + "loss": 0.0313, + "step": 20049 + }, + { + "epoch": 2.52, + "grad_norm": 20.053237915039062, + "learning_rate": 3.224699828473414e-06, + "loss": 0.3424, + "step": 20050 + }, + { + "epoch": 2.52, + "grad_norm": 9.567726135253906, + "learning_rate": 3.2238631134167263e-06, + "loss": 1.8609, + "step": 20051 + }, + { + "epoch": 2.52, + "grad_norm": 17.328189849853516, + "learning_rate": 3.2230263983600387e-06, + "loss": 0.6531, + "step": 20052 + }, + { + "epoch": 2.52, + "grad_norm": 13.401254653930664, + "learning_rate": 3.222189683303351e-06, + "loss": 1.1483, + "step": 20053 + }, + { + "epoch": 2.52, + "grad_norm": 16.87504005432129, + "learning_rate": 3.221352968246664e-06, + "loss": 0.5526, + "step": 20054 + }, + { + "epoch": 2.52, + "grad_norm": 15.760570526123047, + "learning_rate": 3.2205162531899763e-06, + "loss": 1.4285, + "step": 20055 + }, + { + "epoch": 2.52, + "grad_norm": 13.635117530822754, + "learning_rate": 3.219679538133289e-06, + "loss": 0.4915, + "step": 20056 + }, + { + "epoch": 2.52, + "grad_norm": 33.641998291015625, + "learning_rate": 3.218842823076602e-06, + "loss": 0.8777, + "step": 20057 + }, + { + "epoch": 2.52, + "grad_norm": 16.73700523376465, + "learning_rate": 3.2180061080199142e-06, + "loss": 0.5806, + "step": 20058 + }, + { + "epoch": 2.52, + "grad_norm": 11.086918830871582, + "learning_rate": 3.217169392963226e-06, + "loss": 0.8107, + "step": 20059 + }, + { + "epoch": 2.52, + "grad_norm": 11.779153823852539, + "learning_rate": 3.216332677906539e-06, + "loss": 1.6249, + "step": 20060 + }, + { + "epoch": 2.52, + "grad_norm": 12.519206047058105, + "learning_rate": 3.215495962849852e-06, + "loss": 1.0151, + "step": 20061 + }, + { + "epoch": 2.52, + "grad_norm": 9.511662483215332, + "learning_rate": 3.214659247793164e-06, + "loss": 1.6965, + "step": 20062 + }, + { + "epoch": 2.52, + "grad_norm": 9.904976844787598, + "learning_rate": 3.213822532736477e-06, + "loss": 0.5901, + "step": 20063 + }, + { + "epoch": 2.52, + "grad_norm": 8.143558502197266, + "learning_rate": 3.2129858176797894e-06, + "loss": 0.58, + "step": 20064 + }, + { + "epoch": 2.52, + "grad_norm": 2.6461517810821533, + "learning_rate": 3.212149102623102e-06, + "loss": 0.0457, + "step": 20065 + }, + { + "epoch": 2.52, + "grad_norm": 13.534331321716309, + "learning_rate": 3.211312387566414e-06, + "loss": 1.4003, + "step": 20066 + }, + { + "epoch": 2.52, + "grad_norm": 11.298065185546875, + "learning_rate": 3.210475672509727e-06, + "loss": 0.6139, + "step": 20067 + }, + { + "epoch": 2.52, + "grad_norm": 16.307167053222656, + "learning_rate": 3.2096389574530397e-06, + "loss": 1.1053, + "step": 20068 + }, + { + "epoch": 2.52, + "grad_norm": 15.244307518005371, + "learning_rate": 3.208802242396352e-06, + "loss": 0.5344, + "step": 20069 + }, + { + "epoch": 2.52, + "grad_norm": 26.728551864624023, + "learning_rate": 3.207965527339665e-06, + "loss": 1.2105, + "step": 20070 + }, + { + "epoch": 2.52, + "grad_norm": 22.271337509155273, + "learning_rate": 3.2071288122829773e-06, + "loss": 1.1998, + "step": 20071 + }, + { + "epoch": 2.52, + "grad_norm": 11.722222328186035, + "learning_rate": 3.20629209722629e-06, + "loss": 0.5946, + "step": 20072 + }, + { + "epoch": 2.52, + "grad_norm": 29.611454010009766, + "learning_rate": 3.205455382169602e-06, + "loss": 0.3169, + "step": 20073 + }, + { + "epoch": 2.52, + "grad_norm": 1529.944091796875, + "learning_rate": 3.204618667112915e-06, + "loss": 0.6748, + "step": 20074 + }, + { + "epoch": 2.52, + "grad_norm": 10.68509292602539, + "learning_rate": 3.2037819520562276e-06, + "loss": 0.7615, + "step": 20075 + }, + { + "epoch": 2.52, + "grad_norm": 18.093368530273438, + "learning_rate": 3.20294523699954e-06, + "loss": 2.1586, + "step": 20076 + }, + { + "epoch": 2.52, + "grad_norm": 5.211366176605225, + "learning_rate": 3.202108521942853e-06, + "loss": 0.2439, + "step": 20077 + }, + { + "epoch": 2.52, + "grad_norm": 25.321666717529297, + "learning_rate": 3.201271806886165e-06, + "loss": 1.5209, + "step": 20078 + }, + { + "epoch": 2.52, + "grad_norm": 71.18042755126953, + "learning_rate": 3.200435091829478e-06, + "loss": 0.9226, + "step": 20079 + }, + { + "epoch": 2.52, + "grad_norm": 8.386185646057129, + "learning_rate": 3.19959837677279e-06, + "loss": 0.2261, + "step": 20080 + }, + { + "epoch": 2.52, + "grad_norm": 17.505979537963867, + "learning_rate": 3.1987616617161028e-06, + "loss": 1.5554, + "step": 20081 + }, + { + "epoch": 2.52, + "grad_norm": 143.35447692871094, + "learning_rate": 3.197924946659415e-06, + "loss": 0.3874, + "step": 20082 + }, + { + "epoch": 2.52, + "grad_norm": 9.683552742004395, + "learning_rate": 3.197088231602728e-06, + "loss": 1.7246, + "step": 20083 + }, + { + "epoch": 2.52, + "grad_norm": 58.537803649902344, + "learning_rate": 3.1962515165460407e-06, + "loss": 0.4977, + "step": 20084 + }, + { + "epoch": 2.52, + "grad_norm": 14.881050109863281, + "learning_rate": 3.195414801489353e-06, + "loss": 0.4981, + "step": 20085 + }, + { + "epoch": 2.52, + "grad_norm": 37.98305130004883, + "learning_rate": 3.194578086432666e-06, + "loss": 2.2355, + "step": 20086 + }, + { + "epoch": 2.52, + "grad_norm": 12.213129043579102, + "learning_rate": 3.193741371375978e-06, + "loss": 0.4452, + "step": 20087 + }, + { + "epoch": 2.52, + "grad_norm": 11.402565002441406, + "learning_rate": 3.1929046563192907e-06, + "loss": 0.6451, + "step": 20088 + }, + { + "epoch": 2.52, + "grad_norm": 29.905174255371094, + "learning_rate": 3.192067941262603e-06, + "loss": 1.5578, + "step": 20089 + }, + { + "epoch": 2.52, + "grad_norm": 11.379555702209473, + "learning_rate": 3.191231226205916e-06, + "loss": 0.7022, + "step": 20090 + }, + { + "epoch": 2.52, + "grad_norm": 15.473186492919922, + "learning_rate": 3.1903945111492287e-06, + "loss": 1.0074, + "step": 20091 + }, + { + "epoch": 2.52, + "grad_norm": 9.624855041503906, + "learning_rate": 3.189557796092541e-06, + "loss": 0.9283, + "step": 20092 + }, + { + "epoch": 2.52, + "grad_norm": 24.56440544128418, + "learning_rate": 3.188721081035854e-06, + "loss": 1.3809, + "step": 20093 + }, + { + "epoch": 2.52, + "grad_norm": 135.54013061523438, + "learning_rate": 3.1878843659791658e-06, + "loss": 0.7548, + "step": 20094 + }, + { + "epoch": 2.52, + "grad_norm": 15.073548316955566, + "learning_rate": 3.1870476509224786e-06, + "loss": 0.3575, + "step": 20095 + }, + { + "epoch": 2.52, + "grad_norm": 15.966330528259277, + "learning_rate": 3.186210935865791e-06, + "loss": 0.6908, + "step": 20096 + }, + { + "epoch": 2.52, + "grad_norm": 10.339390754699707, + "learning_rate": 3.1853742208091038e-06, + "loss": 0.6126, + "step": 20097 + }, + { + "epoch": 2.52, + "grad_norm": 6.085317611694336, + "learning_rate": 3.1845375057524166e-06, + "loss": 1.5773, + "step": 20098 + }, + { + "epoch": 2.52, + "grad_norm": 27.03826141357422, + "learning_rate": 3.183700790695729e-06, + "loss": 1.9262, + "step": 20099 + }, + { + "epoch": 2.52, + "grad_norm": 10.994552612304688, + "learning_rate": 3.1828640756390417e-06, + "loss": 1.7028, + "step": 20100 + }, + { + "epoch": 2.52, + "grad_norm": 5.914076805114746, + "learning_rate": 3.1820273605823537e-06, + "loss": 1.0003, + "step": 20101 + }, + { + "epoch": 2.52, + "grad_norm": 4.8910298347473145, + "learning_rate": 3.1811906455256665e-06, + "loss": 1.4367, + "step": 20102 + }, + { + "epoch": 2.52, + "grad_norm": 14.262541770935059, + "learning_rate": 3.180353930468979e-06, + "loss": 0.1744, + "step": 20103 + }, + { + "epoch": 2.52, + "grad_norm": 18.70867919921875, + "learning_rate": 3.1795172154122917e-06, + "loss": 1.6976, + "step": 20104 + }, + { + "epoch": 2.52, + "grad_norm": 11.734964370727539, + "learning_rate": 3.178680500355604e-06, + "loss": 0.6124, + "step": 20105 + }, + { + "epoch": 2.52, + "grad_norm": 3.6546733379364014, + "learning_rate": 3.177843785298917e-06, + "loss": 1.3167, + "step": 20106 + }, + { + "epoch": 2.52, + "grad_norm": 21.559032440185547, + "learning_rate": 3.1770070702422297e-06, + "loss": 0.9629, + "step": 20107 + }, + { + "epoch": 2.52, + "grad_norm": 22.326847076416016, + "learning_rate": 3.1761703551855416e-06, + "loss": 1.1904, + "step": 20108 + }, + { + "epoch": 2.52, + "grad_norm": 26.877647399902344, + "learning_rate": 3.1753336401288544e-06, + "loss": 1.3934, + "step": 20109 + }, + { + "epoch": 2.52, + "grad_norm": 13.50160026550293, + "learning_rate": 3.174496925072167e-06, + "loss": 1.0344, + "step": 20110 + }, + { + "epoch": 2.52, + "grad_norm": 63.82146072387695, + "learning_rate": 3.1736602100154796e-06, + "loss": 1.1811, + "step": 20111 + }, + { + "epoch": 2.52, + "grad_norm": 10.990103721618652, + "learning_rate": 3.172823494958792e-06, + "loss": 0.4297, + "step": 20112 + }, + { + "epoch": 2.52, + "grad_norm": 10.466145515441895, + "learning_rate": 3.1719867799021048e-06, + "loss": 0.2588, + "step": 20113 + }, + { + "epoch": 2.52, + "grad_norm": 100.87720489501953, + "learning_rate": 3.1711500648454167e-06, + "loss": 1.0303, + "step": 20114 + }, + { + "epoch": 2.52, + "grad_norm": 10.929286003112793, + "learning_rate": 3.1703133497887295e-06, + "loss": 0.6933, + "step": 20115 + }, + { + "epoch": 2.52, + "grad_norm": 348.97802734375, + "learning_rate": 3.1694766347320423e-06, + "loss": 1.9972, + "step": 20116 + }, + { + "epoch": 2.52, + "grad_norm": 15.033949851989746, + "learning_rate": 3.1686399196753547e-06, + "loss": 0.5968, + "step": 20117 + }, + { + "epoch": 2.52, + "grad_norm": 13.286337852478027, + "learning_rate": 3.1678032046186675e-06, + "loss": 1.5688, + "step": 20118 + }, + { + "epoch": 2.52, + "grad_norm": 83.24957275390625, + "learning_rate": 3.16696648956198e-06, + "loss": 2.0411, + "step": 20119 + }, + { + "epoch": 2.53, + "grad_norm": 23.767091751098633, + "learning_rate": 3.1661297745052927e-06, + "loss": 1.3859, + "step": 20120 + }, + { + "epoch": 2.53, + "grad_norm": 7.808661460876465, + "learning_rate": 3.1652930594486047e-06, + "loss": 0.5942, + "step": 20121 + }, + { + "epoch": 2.53, + "grad_norm": 26.495922088623047, + "learning_rate": 3.1644563443919175e-06, + "loss": 1.1367, + "step": 20122 + }, + { + "epoch": 2.53, + "grad_norm": 18.31427764892578, + "learning_rate": 3.16361962933523e-06, + "loss": 1.3548, + "step": 20123 + }, + { + "epoch": 2.53, + "grad_norm": 15.174376487731934, + "learning_rate": 3.1627829142785426e-06, + "loss": 0.6136, + "step": 20124 + }, + { + "epoch": 2.53, + "grad_norm": 4.846739768981934, + "learning_rate": 3.1619461992218554e-06, + "loss": 0.0403, + "step": 20125 + }, + { + "epoch": 2.53, + "grad_norm": 21.373008728027344, + "learning_rate": 3.161109484165168e-06, + "loss": 1.0197, + "step": 20126 + }, + { + "epoch": 2.53, + "grad_norm": 42.04866409301758, + "learning_rate": 3.1602727691084806e-06, + "loss": 0.5158, + "step": 20127 + }, + { + "epoch": 2.53, + "grad_norm": 30.856718063354492, + "learning_rate": 3.1594360540517926e-06, + "loss": 3.1075, + "step": 20128 + }, + { + "epoch": 2.53, + "grad_norm": 9.922351837158203, + "learning_rate": 3.1585993389951054e-06, + "loss": 1.4879, + "step": 20129 + }, + { + "epoch": 2.53, + "grad_norm": 19.042163848876953, + "learning_rate": 3.1577626239384178e-06, + "loss": 1.5319, + "step": 20130 + }, + { + "epoch": 2.53, + "grad_norm": 26.65334701538086, + "learning_rate": 3.1569259088817306e-06, + "loss": 0.5933, + "step": 20131 + }, + { + "epoch": 2.53, + "grad_norm": 20.031673431396484, + "learning_rate": 3.1560891938250434e-06, + "loss": 0.939, + "step": 20132 + }, + { + "epoch": 2.53, + "grad_norm": 9.657144546508789, + "learning_rate": 3.1552524787683557e-06, + "loss": 0.474, + "step": 20133 + }, + { + "epoch": 2.53, + "grad_norm": 9.074506759643555, + "learning_rate": 3.1544157637116685e-06, + "loss": 1.8527, + "step": 20134 + }, + { + "epoch": 2.53, + "grad_norm": 18.564651489257812, + "learning_rate": 3.1535790486549805e-06, + "loss": 0.4606, + "step": 20135 + }, + { + "epoch": 2.53, + "grad_norm": 25.953868865966797, + "learning_rate": 3.1527423335982933e-06, + "loss": 0.716, + "step": 20136 + }, + { + "epoch": 2.53, + "grad_norm": 18.119686126708984, + "learning_rate": 3.1519056185416057e-06, + "loss": 1.8624, + "step": 20137 + }, + { + "epoch": 2.53, + "grad_norm": 13.626513481140137, + "learning_rate": 3.1510689034849185e-06, + "loss": 1.674, + "step": 20138 + }, + { + "epoch": 2.53, + "grad_norm": 6.355057716369629, + "learning_rate": 3.1502321884282313e-06, + "loss": 0.4331, + "step": 20139 + }, + { + "epoch": 2.53, + "grad_norm": 16.89761734008789, + "learning_rate": 3.1493954733715437e-06, + "loss": 1.8812, + "step": 20140 + }, + { + "epoch": 2.53, + "grad_norm": 32.7131233215332, + "learning_rate": 3.1485587583148565e-06, + "loss": 1.5138, + "step": 20141 + }, + { + "epoch": 2.53, + "grad_norm": 20.496959686279297, + "learning_rate": 3.1477220432581684e-06, + "loss": 0.4932, + "step": 20142 + }, + { + "epoch": 2.53, + "grad_norm": 38.02460861206055, + "learning_rate": 3.1468853282014812e-06, + "loss": 0.3606, + "step": 20143 + }, + { + "epoch": 2.53, + "grad_norm": 20.9803409576416, + "learning_rate": 3.1460486131447936e-06, + "loss": 0.8481, + "step": 20144 + }, + { + "epoch": 2.53, + "grad_norm": 7.6487627029418945, + "learning_rate": 3.1452118980881064e-06, + "loss": 0.1384, + "step": 20145 + }, + { + "epoch": 2.53, + "grad_norm": 25.933940887451172, + "learning_rate": 3.1443751830314188e-06, + "loss": 0.9629, + "step": 20146 + }, + { + "epoch": 2.53, + "grad_norm": 138.8772735595703, + "learning_rate": 3.1435384679747316e-06, + "loss": 1.5815, + "step": 20147 + }, + { + "epoch": 2.53, + "grad_norm": 2.3213183879852295, + "learning_rate": 3.1427017529180444e-06, + "loss": 0.0642, + "step": 20148 + }, + { + "epoch": 2.53, + "grad_norm": 12.18281364440918, + "learning_rate": 3.1418650378613563e-06, + "loss": 1.4909, + "step": 20149 + }, + { + "epoch": 2.53, + "grad_norm": 11.219976425170898, + "learning_rate": 3.141028322804669e-06, + "loss": 0.8455, + "step": 20150 + }, + { + "epoch": 2.53, + "grad_norm": 19.861003875732422, + "learning_rate": 3.1401916077479815e-06, + "loss": 1.4017, + "step": 20151 + }, + { + "epoch": 2.53, + "grad_norm": 24.716703414916992, + "learning_rate": 3.1393548926912943e-06, + "loss": 2.7867, + "step": 20152 + }, + { + "epoch": 2.53, + "grad_norm": 74.31487274169922, + "learning_rate": 3.1385181776346067e-06, + "loss": 0.9074, + "step": 20153 + }, + { + "epoch": 2.53, + "grad_norm": 15.867815971374512, + "learning_rate": 3.1376814625779195e-06, + "loss": 1.1159, + "step": 20154 + }, + { + "epoch": 2.53, + "grad_norm": 18.009172439575195, + "learning_rate": 3.1368447475212323e-06, + "loss": 0.5687, + "step": 20155 + }, + { + "epoch": 2.53, + "grad_norm": 15.600923538208008, + "learning_rate": 3.1360080324645443e-06, + "loss": 0.3674, + "step": 20156 + }, + { + "epoch": 2.53, + "grad_norm": 559.9266967773438, + "learning_rate": 3.135171317407857e-06, + "loss": 2.0866, + "step": 20157 + }, + { + "epoch": 2.53, + "grad_norm": 11.303703308105469, + "learning_rate": 3.1343346023511694e-06, + "loss": 0.85, + "step": 20158 + }, + { + "epoch": 2.53, + "grad_norm": 17.95802879333496, + "learning_rate": 3.1334978872944822e-06, + "loss": 0.9572, + "step": 20159 + }, + { + "epoch": 2.53, + "grad_norm": 14.622774124145508, + "learning_rate": 3.1326611722377946e-06, + "loss": 1.3898, + "step": 20160 + }, + { + "epoch": 2.53, + "grad_norm": 9.754227638244629, + "learning_rate": 3.1318244571811074e-06, + "loss": 0.821, + "step": 20161 + }, + { + "epoch": 2.53, + "grad_norm": 6.329394817352295, + "learning_rate": 3.1309877421244202e-06, + "loss": 0.5835, + "step": 20162 + }, + { + "epoch": 2.53, + "grad_norm": 472.1472473144531, + "learning_rate": 3.130151027067732e-06, + "loss": 2.2185, + "step": 20163 + }, + { + "epoch": 2.53, + "grad_norm": 8.972721099853516, + "learning_rate": 3.1293143120110445e-06, + "loss": 0.298, + "step": 20164 + }, + { + "epoch": 2.53, + "grad_norm": 9.203935623168945, + "learning_rate": 3.1284775969543573e-06, + "loss": 1.1663, + "step": 20165 + }, + { + "epoch": 2.53, + "grad_norm": 28.42084312438965, + "learning_rate": 3.12764088189767e-06, + "loss": 0.9144, + "step": 20166 + }, + { + "epoch": 2.53, + "grad_norm": 239.5991668701172, + "learning_rate": 3.1268041668409825e-06, + "loss": 1.6324, + "step": 20167 + }, + { + "epoch": 2.53, + "grad_norm": 41.38919448852539, + "learning_rate": 3.1259674517842953e-06, + "loss": 1.3735, + "step": 20168 + }, + { + "epoch": 2.53, + "grad_norm": 7.04861307144165, + "learning_rate": 3.1251307367276077e-06, + "loss": 0.3261, + "step": 20169 + }, + { + "epoch": 2.53, + "grad_norm": 7.5475077629089355, + "learning_rate": 3.12429402167092e-06, + "loss": 0.3679, + "step": 20170 + }, + { + "epoch": 2.53, + "grad_norm": 25.39781951904297, + "learning_rate": 3.1234573066142325e-06, + "loss": 0.6875, + "step": 20171 + }, + { + "epoch": 2.53, + "grad_norm": 21.164522171020508, + "learning_rate": 3.1226205915575453e-06, + "loss": 1.5491, + "step": 20172 + }, + { + "epoch": 2.53, + "grad_norm": 5.918303966522217, + "learning_rate": 3.121783876500858e-06, + "loss": 0.3879, + "step": 20173 + }, + { + "epoch": 2.53, + "grad_norm": 10.785502433776855, + "learning_rate": 3.1209471614441704e-06, + "loss": 1.1789, + "step": 20174 + }, + { + "epoch": 2.53, + "grad_norm": 5.143613338470459, + "learning_rate": 3.1201104463874832e-06, + "loss": 0.6366, + "step": 20175 + }, + { + "epoch": 2.53, + "grad_norm": 8.28675365447998, + "learning_rate": 3.1192737313307956e-06, + "loss": 0.4371, + "step": 20176 + }, + { + "epoch": 2.53, + "grad_norm": 37.711341857910156, + "learning_rate": 3.118437016274108e-06, + "loss": 0.2428, + "step": 20177 + }, + { + "epoch": 2.53, + "grad_norm": 38.26509475708008, + "learning_rate": 3.1176003012174204e-06, + "loss": 2.5128, + "step": 20178 + }, + { + "epoch": 2.53, + "grad_norm": 39.81582260131836, + "learning_rate": 3.116763586160733e-06, + "loss": 1.7895, + "step": 20179 + }, + { + "epoch": 2.53, + "grad_norm": 10.409308433532715, + "learning_rate": 3.115926871104046e-06, + "loss": 1.2166, + "step": 20180 + }, + { + "epoch": 2.53, + "grad_norm": 11.536985397338867, + "learning_rate": 3.1150901560473584e-06, + "loss": 1.8168, + "step": 20181 + }, + { + "epoch": 2.53, + "grad_norm": 17.858381271362305, + "learning_rate": 3.114253440990671e-06, + "loss": 0.383, + "step": 20182 + }, + { + "epoch": 2.53, + "grad_norm": 36.268795013427734, + "learning_rate": 3.1134167259339835e-06, + "loss": 2.7144, + "step": 20183 + }, + { + "epoch": 2.53, + "grad_norm": 6.797853946685791, + "learning_rate": 3.112580010877296e-06, + "loss": 0.3438, + "step": 20184 + }, + { + "epoch": 2.53, + "grad_norm": 4.46155309677124, + "learning_rate": 3.1117432958206083e-06, + "loss": 0.1619, + "step": 20185 + }, + { + "epoch": 2.53, + "grad_norm": 17.920578002929688, + "learning_rate": 3.110906580763921e-06, + "loss": 1.1489, + "step": 20186 + }, + { + "epoch": 2.53, + "grad_norm": 3.990722179412842, + "learning_rate": 3.1100698657072335e-06, + "loss": 0.0987, + "step": 20187 + }, + { + "epoch": 2.53, + "grad_norm": 29.773059844970703, + "learning_rate": 3.1092331506505463e-06, + "loss": 1.033, + "step": 20188 + }, + { + "epoch": 2.53, + "grad_norm": 17.676555633544922, + "learning_rate": 3.108396435593859e-06, + "loss": 1.1681, + "step": 20189 + }, + { + "epoch": 2.53, + "grad_norm": 16.256336212158203, + "learning_rate": 3.1075597205371715e-06, + "loss": 2.0528, + "step": 20190 + }, + { + "epoch": 2.53, + "grad_norm": 5.727039337158203, + "learning_rate": 3.106723005480484e-06, + "loss": 0.435, + "step": 20191 + }, + { + "epoch": 2.53, + "grad_norm": 7.141747951507568, + "learning_rate": 3.1058862904237962e-06, + "loss": 0.1768, + "step": 20192 + }, + { + "epoch": 2.53, + "grad_norm": 18.85256576538086, + "learning_rate": 3.105049575367109e-06, + "loss": 0.7793, + "step": 20193 + }, + { + "epoch": 2.53, + "grad_norm": 35.84891128540039, + "learning_rate": 3.1042128603104214e-06, + "loss": 1.0192, + "step": 20194 + }, + { + "epoch": 2.53, + "grad_norm": 25.55327033996582, + "learning_rate": 3.103376145253734e-06, + "loss": 2.2958, + "step": 20195 + }, + { + "epoch": 2.53, + "grad_norm": 9.817205429077148, + "learning_rate": 3.102539430197047e-06, + "loss": 0.2239, + "step": 20196 + }, + { + "epoch": 2.53, + "grad_norm": 46.79560470581055, + "learning_rate": 3.1017027151403594e-06, + "loss": 0.8706, + "step": 20197 + }, + { + "epoch": 2.53, + "grad_norm": 17.943552017211914, + "learning_rate": 3.1008660000836718e-06, + "loss": 1.6181, + "step": 20198 + }, + { + "epoch": 2.53, + "grad_norm": 81.96236419677734, + "learning_rate": 3.100029285026984e-06, + "loss": 2.0724, + "step": 20199 + }, + { + "epoch": 2.54, + "grad_norm": 16.38736915588379, + "learning_rate": 3.099192569970297e-06, + "loss": 1.3343, + "step": 20200 + }, + { + "epoch": 2.54, + "grad_norm": 9.381766319274902, + "learning_rate": 3.0983558549136093e-06, + "loss": 0.3703, + "step": 20201 + }, + { + "epoch": 2.54, + "grad_norm": 16.660558700561523, + "learning_rate": 3.097519139856922e-06, + "loss": 0.7836, + "step": 20202 + }, + { + "epoch": 2.54, + "grad_norm": 18.434297561645508, + "learning_rate": 3.096682424800235e-06, + "loss": 1.3671, + "step": 20203 + }, + { + "epoch": 2.54, + "grad_norm": 6.595559597015381, + "learning_rate": 3.095845709743547e-06, + "loss": 1.5472, + "step": 20204 + }, + { + "epoch": 2.54, + "grad_norm": 1.3526664972305298, + "learning_rate": 3.0950089946868593e-06, + "loss": 0.0462, + "step": 20205 + }, + { + "epoch": 2.54, + "grad_norm": 16.824804306030273, + "learning_rate": 3.094172279630172e-06, + "loss": 0.5207, + "step": 20206 + }, + { + "epoch": 2.54, + "grad_norm": 18.006778717041016, + "learning_rate": 3.093335564573485e-06, + "loss": 0.8913, + "step": 20207 + }, + { + "epoch": 2.54, + "grad_norm": 26.328611373901367, + "learning_rate": 3.0924988495167972e-06, + "loss": 1.2545, + "step": 20208 + }, + { + "epoch": 2.54, + "grad_norm": 25.5814266204834, + "learning_rate": 3.09166213446011e-06, + "loss": 0.9104, + "step": 20209 + }, + { + "epoch": 2.54, + "grad_norm": 61.892154693603516, + "learning_rate": 3.0908254194034224e-06, + "loss": 1.1347, + "step": 20210 + }, + { + "epoch": 2.54, + "grad_norm": 15.218506813049316, + "learning_rate": 3.089988704346735e-06, + "loss": 1.982, + "step": 20211 + }, + { + "epoch": 2.54, + "grad_norm": 16.15838623046875, + "learning_rate": 3.089151989290047e-06, + "loss": 1.277, + "step": 20212 + }, + { + "epoch": 2.54, + "grad_norm": 25.595287322998047, + "learning_rate": 3.08831527423336e-06, + "loss": 0.8428, + "step": 20213 + }, + { + "epoch": 2.54, + "grad_norm": 5.472421646118164, + "learning_rate": 3.0874785591766728e-06, + "loss": 0.4231, + "step": 20214 + }, + { + "epoch": 2.54, + "grad_norm": 1.6701686382293701, + "learning_rate": 3.086641844119985e-06, + "loss": 0.0995, + "step": 20215 + }, + { + "epoch": 2.54, + "grad_norm": 30.975154876708984, + "learning_rate": 3.085805129063298e-06, + "loss": 3.2722, + "step": 20216 + }, + { + "epoch": 2.54, + "grad_norm": 18.317169189453125, + "learning_rate": 3.0849684140066103e-06, + "loss": 0.5636, + "step": 20217 + }, + { + "epoch": 2.54, + "grad_norm": 5.2889814376831055, + "learning_rate": 3.0841316989499227e-06, + "loss": 0.2052, + "step": 20218 + }, + { + "epoch": 2.54, + "grad_norm": 14.267701148986816, + "learning_rate": 3.083294983893235e-06, + "loss": 2.1028, + "step": 20219 + }, + { + "epoch": 2.54, + "grad_norm": 9.833152770996094, + "learning_rate": 3.082458268836548e-06, + "loss": 0.8909, + "step": 20220 + }, + { + "epoch": 2.54, + "grad_norm": 18.517662048339844, + "learning_rate": 3.0816215537798607e-06, + "loss": 1.3313, + "step": 20221 + }, + { + "epoch": 2.54, + "grad_norm": 16.407983779907227, + "learning_rate": 3.080784838723173e-06, + "loss": 0.6524, + "step": 20222 + }, + { + "epoch": 2.54, + "grad_norm": 16.671092987060547, + "learning_rate": 3.079948123666486e-06, + "loss": 1.1518, + "step": 20223 + }, + { + "epoch": 2.54, + "grad_norm": 7.368433952331543, + "learning_rate": 3.0791114086097983e-06, + "loss": 0.5794, + "step": 20224 + }, + { + "epoch": 2.54, + "grad_norm": 16.057140350341797, + "learning_rate": 3.0782746935531106e-06, + "loss": 1.0874, + "step": 20225 + }, + { + "epoch": 2.54, + "grad_norm": 52.69207763671875, + "learning_rate": 3.077437978496423e-06, + "loss": 2.3154, + "step": 20226 + }, + { + "epoch": 2.54, + "grad_norm": 15.292400360107422, + "learning_rate": 3.076601263439736e-06, + "loss": 1.1101, + "step": 20227 + }, + { + "epoch": 2.54, + "grad_norm": 8.293540954589844, + "learning_rate": 3.075764548383048e-06, + "loss": 3.1448, + "step": 20228 + }, + { + "epoch": 2.54, + "grad_norm": 0.3503389358520508, + "learning_rate": 3.074927833326361e-06, + "loss": 0.0068, + "step": 20229 + }, + { + "epoch": 2.54, + "grad_norm": 13.330592155456543, + "learning_rate": 3.074091118269674e-06, + "loss": 0.7966, + "step": 20230 + }, + { + "epoch": 2.54, + "grad_norm": 13.415767669677734, + "learning_rate": 3.073254403212986e-06, + "loss": 1.176, + "step": 20231 + }, + { + "epoch": 2.54, + "grad_norm": 5.649020195007324, + "learning_rate": 3.0724176881562986e-06, + "loss": 1.315, + "step": 20232 + }, + { + "epoch": 2.54, + "grad_norm": 10.661545753479004, + "learning_rate": 3.071580973099611e-06, + "loss": 0.5251, + "step": 20233 + }, + { + "epoch": 2.54, + "grad_norm": 7.858932018280029, + "learning_rate": 3.0707442580429237e-06, + "loss": 0.8182, + "step": 20234 + }, + { + "epoch": 2.54, + "grad_norm": 9.638903617858887, + "learning_rate": 3.069907542986236e-06, + "loss": 0.2427, + "step": 20235 + }, + { + "epoch": 2.54, + "grad_norm": 12.211078643798828, + "learning_rate": 3.069070827929549e-06, + "loss": 0.9441, + "step": 20236 + }, + { + "epoch": 2.54, + "grad_norm": 8.404287338256836, + "learning_rate": 3.0682341128728617e-06, + "loss": 0.3233, + "step": 20237 + }, + { + "epoch": 2.54, + "grad_norm": 29.669578552246094, + "learning_rate": 3.067397397816174e-06, + "loss": 0.7231, + "step": 20238 + }, + { + "epoch": 2.54, + "grad_norm": 19.04340171813965, + "learning_rate": 3.0665606827594865e-06, + "loss": 0.6144, + "step": 20239 + }, + { + "epoch": 2.54, + "grad_norm": 4.203525066375732, + "learning_rate": 3.065723967702799e-06, + "loss": 1.0986, + "step": 20240 + }, + { + "epoch": 2.54, + "grad_norm": 8.037447929382324, + "learning_rate": 3.0648872526461117e-06, + "loss": 0.6277, + "step": 20241 + }, + { + "epoch": 2.54, + "grad_norm": 36.33158493041992, + "learning_rate": 3.064050537589424e-06, + "loss": 1.2463, + "step": 20242 + }, + { + "epoch": 2.54, + "grad_norm": 8.591270446777344, + "learning_rate": 3.063213822532737e-06, + "loss": 0.8348, + "step": 20243 + }, + { + "epoch": 2.54, + "grad_norm": 15.740802764892578, + "learning_rate": 3.0623771074760496e-06, + "loss": 0.4979, + "step": 20244 + }, + { + "epoch": 2.54, + "grad_norm": 14.099591255187988, + "learning_rate": 3.061540392419362e-06, + "loss": 1.0288, + "step": 20245 + }, + { + "epoch": 2.54, + "grad_norm": 16.36825942993164, + "learning_rate": 3.060703677362674e-06, + "loss": 0.52, + "step": 20246 + }, + { + "epoch": 2.54, + "grad_norm": 9.811111450195312, + "learning_rate": 3.0598669623059868e-06, + "loss": 0.2784, + "step": 20247 + }, + { + "epoch": 2.54, + "grad_norm": 17.12701988220215, + "learning_rate": 3.0590302472492996e-06, + "loss": 1.1396, + "step": 20248 + }, + { + "epoch": 2.54, + "grad_norm": 11.216033935546875, + "learning_rate": 3.058193532192612e-06, + "loss": 0.8465, + "step": 20249 + }, + { + "epoch": 2.54, + "grad_norm": 2.6195974349975586, + "learning_rate": 3.0573568171359247e-06, + "loss": 0.108, + "step": 20250 + }, + { + "epoch": 2.54, + "grad_norm": 3.237502098083496, + "learning_rate": 3.056520102079237e-06, + "loss": 0.2449, + "step": 20251 + }, + { + "epoch": 2.54, + "grad_norm": 27.812013626098633, + "learning_rate": 3.05568338702255e-06, + "loss": 0.866, + "step": 20252 + }, + { + "epoch": 2.54, + "grad_norm": 5.755734920501709, + "learning_rate": 3.054846671965862e-06, + "loss": 0.3416, + "step": 20253 + }, + { + "epoch": 2.54, + "grad_norm": 6.678841590881348, + "learning_rate": 3.0540099569091747e-06, + "loss": 0.5921, + "step": 20254 + }, + { + "epoch": 2.54, + "grad_norm": 11.03495979309082, + "learning_rate": 3.0531732418524875e-06, + "loss": 1.387, + "step": 20255 + }, + { + "epoch": 2.54, + "grad_norm": 24.642271041870117, + "learning_rate": 3.0523365267958e-06, + "loss": 1.3909, + "step": 20256 + }, + { + "epoch": 2.54, + "grad_norm": 8.620196342468262, + "learning_rate": 3.0514998117391127e-06, + "loss": 0.7151, + "step": 20257 + }, + { + "epoch": 2.54, + "grad_norm": 18.334409713745117, + "learning_rate": 3.050663096682425e-06, + "loss": 0.344, + "step": 20258 + }, + { + "epoch": 2.54, + "grad_norm": 2.8685402870178223, + "learning_rate": 3.049826381625738e-06, + "loss": 0.0499, + "step": 20259 + }, + { + "epoch": 2.54, + "grad_norm": 7.711960792541504, + "learning_rate": 3.04898966656905e-06, + "loss": 1.4663, + "step": 20260 + }, + { + "epoch": 2.54, + "grad_norm": 127.82076263427734, + "learning_rate": 3.0481529515123626e-06, + "loss": 1.5027, + "step": 20261 + }, + { + "epoch": 2.54, + "grad_norm": 16.69835662841797, + "learning_rate": 3.0473162364556754e-06, + "loss": 0.9731, + "step": 20262 + }, + { + "epoch": 2.54, + "grad_norm": 46.56307601928711, + "learning_rate": 3.0464795213989878e-06, + "loss": 1.4718, + "step": 20263 + }, + { + "epoch": 2.54, + "grad_norm": 1.6854890584945679, + "learning_rate": 3.0456428063423006e-06, + "loss": 0.0537, + "step": 20264 + }, + { + "epoch": 2.54, + "grad_norm": 7.84578275680542, + "learning_rate": 3.044806091285613e-06, + "loss": 1.0442, + "step": 20265 + }, + { + "epoch": 2.54, + "grad_norm": 64.02957153320312, + "learning_rate": 3.0439693762289258e-06, + "loss": 0.9431, + "step": 20266 + }, + { + "epoch": 2.54, + "grad_norm": 6.88743257522583, + "learning_rate": 3.0431326611722377e-06, + "loss": 0.9714, + "step": 20267 + }, + { + "epoch": 2.54, + "grad_norm": 8.80877685546875, + "learning_rate": 3.0422959461155505e-06, + "loss": 0.5565, + "step": 20268 + }, + { + "epoch": 2.54, + "grad_norm": 105.63014221191406, + "learning_rate": 3.041459231058863e-06, + "loss": 0.6282, + "step": 20269 + }, + { + "epoch": 2.54, + "grad_norm": 60.46650695800781, + "learning_rate": 3.0406225160021757e-06, + "loss": 2.6536, + "step": 20270 + }, + { + "epoch": 2.54, + "grad_norm": 19.91202735900879, + "learning_rate": 3.0397858009454885e-06, + "loss": 1.1612, + "step": 20271 + }, + { + "epoch": 2.54, + "grad_norm": 15.705401420593262, + "learning_rate": 3.038949085888801e-06, + "loss": 1.0115, + "step": 20272 + }, + { + "epoch": 2.54, + "grad_norm": 12.167220115661621, + "learning_rate": 3.0381123708321137e-06, + "loss": 1.4634, + "step": 20273 + }, + { + "epoch": 2.54, + "grad_norm": 85.41754150390625, + "learning_rate": 3.0372756557754256e-06, + "loss": 3.3383, + "step": 20274 + }, + { + "epoch": 2.54, + "grad_norm": 5.647180557250977, + "learning_rate": 3.0364389407187384e-06, + "loss": 0.2814, + "step": 20275 + }, + { + "epoch": 2.54, + "grad_norm": 8.535072326660156, + "learning_rate": 3.035602225662051e-06, + "loss": 1.3482, + "step": 20276 + }, + { + "epoch": 2.54, + "grad_norm": 4.6375298500061035, + "learning_rate": 3.0347655106053636e-06, + "loss": 0.1065, + "step": 20277 + }, + { + "epoch": 2.54, + "grad_norm": 44.29484558105469, + "learning_rate": 3.0339287955486764e-06, + "loss": 1.8078, + "step": 20278 + }, + { + "epoch": 2.54, + "grad_norm": 15.028776168823242, + "learning_rate": 3.033092080491989e-06, + "loss": 0.9637, + "step": 20279 + }, + { + "epoch": 2.55, + "grad_norm": 34.271339416503906, + "learning_rate": 3.0322553654353016e-06, + "loss": 1.7199, + "step": 20280 + }, + { + "epoch": 2.55, + "grad_norm": 25.27798080444336, + "learning_rate": 3.0314186503786136e-06, + "loss": 0.869, + "step": 20281 + }, + { + "epoch": 2.55, + "grad_norm": 429.1591796875, + "learning_rate": 3.0305819353219264e-06, + "loss": 2.6978, + "step": 20282 + }, + { + "epoch": 2.55, + "grad_norm": 9.644042015075684, + "learning_rate": 3.0297452202652387e-06, + "loss": 0.7662, + "step": 20283 + }, + { + "epoch": 2.55, + "grad_norm": 17.65026092529297, + "learning_rate": 3.0289085052085515e-06, + "loss": 1.4505, + "step": 20284 + }, + { + "epoch": 2.55, + "grad_norm": 40.25577926635742, + "learning_rate": 3.0280717901518643e-06, + "loss": 1.4754, + "step": 20285 + }, + { + "epoch": 2.55, + "grad_norm": 11.539678573608398, + "learning_rate": 3.0272350750951767e-06, + "loss": 0.1681, + "step": 20286 + }, + { + "epoch": 2.55, + "grad_norm": 3.209381580352783, + "learning_rate": 3.0263983600384895e-06, + "loss": 0.2759, + "step": 20287 + }, + { + "epoch": 2.55, + "grad_norm": 5.28467321395874, + "learning_rate": 3.0255616449818015e-06, + "loss": 0.4294, + "step": 20288 + }, + { + "epoch": 2.55, + "grad_norm": 10.976964950561523, + "learning_rate": 3.0247249299251143e-06, + "loss": 0.4842, + "step": 20289 + }, + { + "epoch": 2.55, + "grad_norm": 8.156913757324219, + "learning_rate": 3.0238882148684267e-06, + "loss": 1.174, + "step": 20290 + }, + { + "epoch": 2.55, + "grad_norm": 18.176769256591797, + "learning_rate": 3.0230514998117395e-06, + "loss": 0.8032, + "step": 20291 + }, + { + "epoch": 2.55, + "grad_norm": 13.518056869506836, + "learning_rate": 3.022214784755052e-06, + "loss": 0.6218, + "step": 20292 + }, + { + "epoch": 2.55, + "grad_norm": 8.877962112426758, + "learning_rate": 3.0213780696983646e-06, + "loss": 0.9831, + "step": 20293 + }, + { + "epoch": 2.55, + "grad_norm": 16.532529830932617, + "learning_rate": 3.0205413546416766e-06, + "loss": 1.6324, + "step": 20294 + }, + { + "epoch": 2.55, + "grad_norm": 14.755925178527832, + "learning_rate": 3.0197046395849894e-06, + "loss": 1.2597, + "step": 20295 + }, + { + "epoch": 2.55, + "grad_norm": 13.560070037841797, + "learning_rate": 3.018867924528302e-06, + "loss": 0.7244, + "step": 20296 + }, + { + "epoch": 2.55, + "grad_norm": 15.837381362915039, + "learning_rate": 3.0180312094716146e-06, + "loss": 2.0255, + "step": 20297 + }, + { + "epoch": 2.55, + "grad_norm": 2.487107515335083, + "learning_rate": 3.0171944944149274e-06, + "loss": 0.2261, + "step": 20298 + }, + { + "epoch": 2.55, + "grad_norm": 11.393579483032227, + "learning_rate": 3.0163577793582398e-06, + "loss": 0.5304, + "step": 20299 + }, + { + "epoch": 2.55, + "grad_norm": 19.697843551635742, + "learning_rate": 3.0155210643015526e-06, + "loss": 0.6604, + "step": 20300 + }, + { + "epoch": 2.55, + "grad_norm": 434.9746398925781, + "learning_rate": 3.0146843492448645e-06, + "loss": 1.8302, + "step": 20301 + }, + { + "epoch": 2.55, + "grad_norm": 29.384225845336914, + "learning_rate": 3.0138476341881773e-06, + "loss": 1.663, + "step": 20302 + }, + { + "epoch": 2.55, + "grad_norm": 24.34198760986328, + "learning_rate": 3.01301091913149e-06, + "loss": 1.1187, + "step": 20303 + }, + { + "epoch": 2.55, + "grad_norm": 9.198288917541504, + "learning_rate": 3.0121742040748025e-06, + "loss": 0.5688, + "step": 20304 + }, + { + "epoch": 2.55, + "grad_norm": 6.318452835083008, + "learning_rate": 3.0113374890181153e-06, + "loss": 1.6171, + "step": 20305 + }, + { + "epoch": 2.55, + "grad_norm": 32.8675651550293, + "learning_rate": 3.0105007739614277e-06, + "loss": 2.4725, + "step": 20306 + }, + { + "epoch": 2.55, + "grad_norm": 223.1664276123047, + "learning_rate": 3.0096640589047405e-06, + "loss": 2.5462, + "step": 20307 + }, + { + "epoch": 2.55, + "grad_norm": 10.688501358032227, + "learning_rate": 3.0088273438480524e-06, + "loss": 0.4247, + "step": 20308 + }, + { + "epoch": 2.55, + "grad_norm": 23.226240158081055, + "learning_rate": 3.0079906287913652e-06, + "loss": 0.6099, + "step": 20309 + }, + { + "epoch": 2.55, + "grad_norm": 3.763749599456787, + "learning_rate": 3.0071539137346776e-06, + "loss": 0.1512, + "step": 20310 + }, + { + "epoch": 2.55, + "grad_norm": 14.509944915771484, + "learning_rate": 3.0063171986779904e-06, + "loss": 1.2319, + "step": 20311 + }, + { + "epoch": 2.55, + "grad_norm": 12.403829574584961, + "learning_rate": 3.0054804836213032e-06, + "loss": 0.5058, + "step": 20312 + }, + { + "epoch": 2.55, + "grad_norm": 25.466928482055664, + "learning_rate": 3.0046437685646156e-06, + "loss": 1.4126, + "step": 20313 + }, + { + "epoch": 2.55, + "grad_norm": 17.499757766723633, + "learning_rate": 3.0038070535079284e-06, + "loss": 1.2541, + "step": 20314 + }, + { + "epoch": 2.55, + "grad_norm": 10.804409980773926, + "learning_rate": 3.0029703384512403e-06, + "loss": 0.4218, + "step": 20315 + }, + { + "epoch": 2.55, + "grad_norm": 13.253870964050293, + "learning_rate": 3.002133623394553e-06, + "loss": 0.518, + "step": 20316 + }, + { + "epoch": 2.55, + "grad_norm": 5.310703754425049, + "learning_rate": 3.0012969083378655e-06, + "loss": 0.187, + "step": 20317 + }, + { + "epoch": 2.55, + "grad_norm": 25.528017044067383, + "learning_rate": 3.0004601932811783e-06, + "loss": 0.6514, + "step": 20318 + }, + { + "epoch": 2.55, + "grad_norm": 12.111616134643555, + "learning_rate": 2.999623478224491e-06, + "loss": 0.8347, + "step": 20319 + }, + { + "epoch": 2.55, + "grad_norm": 20.67764663696289, + "learning_rate": 2.9987867631678035e-06, + "loss": 0.5942, + "step": 20320 + }, + { + "epoch": 2.55, + "grad_norm": 5.750455379486084, + "learning_rate": 2.9979500481111163e-06, + "loss": 0.1571, + "step": 20321 + }, + { + "epoch": 2.55, + "grad_norm": 8.119806289672852, + "learning_rate": 2.9971133330544283e-06, + "loss": 0.1028, + "step": 20322 + }, + { + "epoch": 2.55, + "grad_norm": 0.8741098046302795, + "learning_rate": 2.996276617997741e-06, + "loss": 0.0391, + "step": 20323 + }, + { + "epoch": 2.55, + "grad_norm": 15.446863174438477, + "learning_rate": 2.9954399029410534e-06, + "loss": 1.6925, + "step": 20324 + }, + { + "epoch": 2.55, + "grad_norm": 12.320019721984863, + "learning_rate": 2.9946031878843662e-06, + "loss": 0.6637, + "step": 20325 + }, + { + "epoch": 2.55, + "grad_norm": 19.610437393188477, + "learning_rate": 2.993766472827679e-06, + "loss": 1.6855, + "step": 20326 + }, + { + "epoch": 2.55, + "grad_norm": 21.2055606842041, + "learning_rate": 2.9929297577709914e-06, + "loss": 1.7123, + "step": 20327 + }, + { + "epoch": 2.55, + "grad_norm": 6.5165486335754395, + "learning_rate": 2.9920930427143042e-06, + "loss": 0.1991, + "step": 20328 + }, + { + "epoch": 2.55, + "grad_norm": 29.296640396118164, + "learning_rate": 2.991256327657616e-06, + "loss": 0.3101, + "step": 20329 + }, + { + "epoch": 2.55, + "grad_norm": 41.530357360839844, + "learning_rate": 2.990419612600929e-06, + "loss": 2.4167, + "step": 20330 + }, + { + "epoch": 2.55, + "grad_norm": 6.716545104980469, + "learning_rate": 2.9895828975442414e-06, + "loss": 0.5641, + "step": 20331 + }, + { + "epoch": 2.55, + "grad_norm": 57.292720794677734, + "learning_rate": 2.988746182487554e-06, + "loss": 0.7099, + "step": 20332 + }, + { + "epoch": 2.55, + "grad_norm": 57.8223991394043, + "learning_rate": 2.9879094674308665e-06, + "loss": 2.4506, + "step": 20333 + }, + { + "epoch": 2.55, + "grad_norm": 18.16729736328125, + "learning_rate": 2.9870727523741793e-06, + "loss": 0.7348, + "step": 20334 + }, + { + "epoch": 2.55, + "grad_norm": 7.2328925132751465, + "learning_rate": 2.986236037317492e-06, + "loss": 2.323, + "step": 20335 + }, + { + "epoch": 2.55, + "grad_norm": 9.983031272888184, + "learning_rate": 2.985399322260804e-06, + "loss": 0.6804, + "step": 20336 + }, + { + "epoch": 2.55, + "grad_norm": 165.5161590576172, + "learning_rate": 2.984562607204117e-06, + "loss": 1.3491, + "step": 20337 + }, + { + "epoch": 2.55, + "grad_norm": 10.617986679077148, + "learning_rate": 2.9837258921474293e-06, + "loss": 0.2747, + "step": 20338 + }, + { + "epoch": 2.55, + "grad_norm": 11.300695419311523, + "learning_rate": 2.982889177090742e-06, + "loss": 0.497, + "step": 20339 + }, + { + "epoch": 2.55, + "grad_norm": 9.961305618286133, + "learning_rate": 2.9820524620340545e-06, + "loss": 0.6737, + "step": 20340 + }, + { + "epoch": 2.55, + "grad_norm": 21.09966278076172, + "learning_rate": 2.9812157469773673e-06, + "loss": 0.6899, + "step": 20341 + }, + { + "epoch": 2.55, + "grad_norm": 12.62860107421875, + "learning_rate": 2.98037903192068e-06, + "loss": 0.5219, + "step": 20342 + }, + { + "epoch": 2.55, + "grad_norm": 18.826616287231445, + "learning_rate": 2.979542316863992e-06, + "loss": 0.3157, + "step": 20343 + }, + { + "epoch": 2.55, + "grad_norm": 26.843746185302734, + "learning_rate": 2.978705601807305e-06, + "loss": 0.947, + "step": 20344 + }, + { + "epoch": 2.55, + "grad_norm": 12.240778923034668, + "learning_rate": 2.977868886750617e-06, + "loss": 0.5789, + "step": 20345 + }, + { + "epoch": 2.55, + "grad_norm": 18.644920349121094, + "learning_rate": 2.97703217169393e-06, + "loss": 1.1404, + "step": 20346 + }, + { + "epoch": 2.55, + "grad_norm": 20.328208923339844, + "learning_rate": 2.9761954566372424e-06, + "loss": 1.1604, + "step": 20347 + }, + { + "epoch": 2.55, + "grad_norm": 31.434791564941406, + "learning_rate": 2.975358741580555e-06, + "loss": 1.3904, + "step": 20348 + }, + { + "epoch": 2.55, + "grad_norm": 5.116194725036621, + "learning_rate": 2.974522026523868e-06, + "loss": 0.3696, + "step": 20349 + }, + { + "epoch": 2.55, + "grad_norm": 14.46107292175293, + "learning_rate": 2.97368531146718e-06, + "loss": 2.9684, + "step": 20350 + }, + { + "epoch": 2.55, + "grad_norm": 46.29600143432617, + "learning_rate": 2.9728485964104923e-06, + "loss": 1.2229, + "step": 20351 + }, + { + "epoch": 2.55, + "grad_norm": 7.156410217285156, + "learning_rate": 2.972011881353805e-06, + "loss": 0.507, + "step": 20352 + }, + { + "epoch": 2.55, + "grad_norm": 13.746549606323242, + "learning_rate": 2.971175166297118e-06, + "loss": 1.8664, + "step": 20353 + }, + { + "epoch": 2.55, + "grad_norm": 71.00946807861328, + "learning_rate": 2.9703384512404303e-06, + "loss": 0.9309, + "step": 20354 + }, + { + "epoch": 2.55, + "grad_norm": 5.014029502868652, + "learning_rate": 2.969501736183743e-06, + "loss": 0.4989, + "step": 20355 + }, + { + "epoch": 2.55, + "grad_norm": 11.859457015991211, + "learning_rate": 2.9686650211270555e-06, + "loss": 0.4248, + "step": 20356 + }, + { + "epoch": 2.55, + "grad_norm": 4.653107166290283, + "learning_rate": 2.967828306070368e-06, + "loss": 0.306, + "step": 20357 + }, + { + "epoch": 2.55, + "grad_norm": 20.030885696411133, + "learning_rate": 2.9669915910136802e-06, + "loss": 1.5919, + "step": 20358 + }, + { + "epoch": 2.56, + "grad_norm": 7.168249607086182, + "learning_rate": 2.966154875956993e-06, + "loss": 0.2919, + "step": 20359 + }, + { + "epoch": 2.56, + "grad_norm": 3.518740653991699, + "learning_rate": 2.965318160900306e-06, + "loss": 1.2664, + "step": 20360 + }, + { + "epoch": 2.56, + "grad_norm": 19.980792999267578, + "learning_rate": 2.9644814458436182e-06, + "loss": 1.2647, + "step": 20361 + }, + { + "epoch": 2.56, + "grad_norm": 6.87418270111084, + "learning_rate": 2.963644730786931e-06, + "loss": 0.4924, + "step": 20362 + }, + { + "epoch": 2.56, + "grad_norm": 13.117971420288086, + "learning_rate": 2.9628080157302434e-06, + "loss": 0.4402, + "step": 20363 + }, + { + "epoch": 2.56, + "grad_norm": 155.3637237548828, + "learning_rate": 2.9619713006735558e-06, + "loss": 2.9843, + "step": 20364 + }, + { + "epoch": 2.56, + "grad_norm": 5.862758159637451, + "learning_rate": 2.961134585616868e-06, + "loss": 0.2754, + "step": 20365 + }, + { + "epoch": 2.56, + "grad_norm": 17.832685470581055, + "learning_rate": 2.960297870560181e-06, + "loss": 1.018, + "step": 20366 + }, + { + "epoch": 2.56, + "grad_norm": 14.653788566589355, + "learning_rate": 2.9594611555034938e-06, + "loss": 0.7364, + "step": 20367 + }, + { + "epoch": 2.56, + "grad_norm": 16.753034591674805, + "learning_rate": 2.958624440446806e-06, + "loss": 1.3807, + "step": 20368 + }, + { + "epoch": 2.56, + "grad_norm": 5.317260265350342, + "learning_rate": 2.957787725390119e-06, + "loss": 0.3076, + "step": 20369 + }, + { + "epoch": 2.56, + "grad_norm": 19.307132720947266, + "learning_rate": 2.9569510103334313e-06, + "loss": 0.7773, + "step": 20370 + }, + { + "epoch": 2.56, + "grad_norm": 6.9656662940979, + "learning_rate": 2.9561142952767437e-06, + "loss": 0.4826, + "step": 20371 + }, + { + "epoch": 2.56, + "grad_norm": 7.019276142120361, + "learning_rate": 2.955277580220056e-06, + "loss": 0.3119, + "step": 20372 + }, + { + "epoch": 2.56, + "grad_norm": 3.526972770690918, + "learning_rate": 2.954440865163369e-06, + "loss": 0.3393, + "step": 20373 + }, + { + "epoch": 2.56, + "grad_norm": 11.408474922180176, + "learning_rate": 2.9536041501066813e-06, + "loss": 2.8712, + "step": 20374 + }, + { + "epoch": 2.56, + "grad_norm": 14.502580642700195, + "learning_rate": 2.952767435049994e-06, + "loss": 0.6733, + "step": 20375 + }, + { + "epoch": 2.56, + "grad_norm": 20.133806228637695, + "learning_rate": 2.951930719993307e-06, + "loss": 1.2049, + "step": 20376 + }, + { + "epoch": 2.56, + "grad_norm": 25.461997985839844, + "learning_rate": 2.9510940049366192e-06, + "loss": 0.4696, + "step": 20377 + }, + { + "epoch": 2.56, + "grad_norm": 10.181278228759766, + "learning_rate": 2.9502572898799316e-06, + "loss": 1.8776, + "step": 20378 + }, + { + "epoch": 2.56, + "grad_norm": 15.376132011413574, + "learning_rate": 2.949420574823244e-06, + "loss": 1.0615, + "step": 20379 + }, + { + "epoch": 2.56, + "grad_norm": 25.411516189575195, + "learning_rate": 2.948583859766557e-06, + "loss": 1.2451, + "step": 20380 + }, + { + "epoch": 2.56, + "grad_norm": 107.49919891357422, + "learning_rate": 2.947747144709869e-06, + "loss": 1.8063, + "step": 20381 + }, + { + "epoch": 2.56, + "grad_norm": 12.071283340454102, + "learning_rate": 2.946910429653182e-06, + "loss": 0.4915, + "step": 20382 + }, + { + "epoch": 2.56, + "grad_norm": 22.164794921875, + "learning_rate": 2.9460737145964948e-06, + "loss": 2.6652, + "step": 20383 + }, + { + "epoch": 2.56, + "grad_norm": 26.348634719848633, + "learning_rate": 2.9452369995398067e-06, + "loss": 1.0793, + "step": 20384 + }, + { + "epoch": 2.56, + "grad_norm": 9.477949142456055, + "learning_rate": 2.944400284483119e-06, + "loss": 0.4536, + "step": 20385 + }, + { + "epoch": 2.56, + "grad_norm": 29.26583480834961, + "learning_rate": 2.943563569426432e-06, + "loss": 1.5501, + "step": 20386 + }, + { + "epoch": 2.56, + "grad_norm": 25.907440185546875, + "learning_rate": 2.9427268543697447e-06, + "loss": 1.4007, + "step": 20387 + }, + { + "epoch": 2.56, + "grad_norm": 30.64686393737793, + "learning_rate": 2.941890139313057e-06, + "loss": 1.2803, + "step": 20388 + }, + { + "epoch": 2.56, + "grad_norm": 37.722618103027344, + "learning_rate": 2.94105342425637e-06, + "loss": 0.8912, + "step": 20389 + }, + { + "epoch": 2.56, + "grad_norm": 6.256129264831543, + "learning_rate": 2.9402167091996827e-06, + "loss": 0.0959, + "step": 20390 + }, + { + "epoch": 2.56, + "grad_norm": 15.23316764831543, + "learning_rate": 2.9393799941429947e-06, + "loss": 0.642, + "step": 20391 + }, + { + "epoch": 2.56, + "grad_norm": 18.358585357666016, + "learning_rate": 2.938543279086307e-06, + "loss": 0.6939, + "step": 20392 + }, + { + "epoch": 2.56, + "grad_norm": 20.84736442565918, + "learning_rate": 2.93770656402962e-06, + "loss": 1.0631, + "step": 20393 + }, + { + "epoch": 2.56, + "grad_norm": 3.9894471168518066, + "learning_rate": 2.9368698489729326e-06, + "loss": 0.2495, + "step": 20394 + }, + { + "epoch": 2.56, + "grad_norm": 20.384599685668945, + "learning_rate": 2.936033133916245e-06, + "loss": 2.1595, + "step": 20395 + }, + { + "epoch": 2.56, + "grad_norm": 5.945254325866699, + "learning_rate": 2.935196418859558e-06, + "loss": 0.4665, + "step": 20396 + }, + { + "epoch": 2.56, + "grad_norm": 6.275689601898193, + "learning_rate": 2.93435970380287e-06, + "loss": 0.6667, + "step": 20397 + }, + { + "epoch": 2.56, + "grad_norm": 12.31562614440918, + "learning_rate": 2.9335229887461826e-06, + "loss": 0.3503, + "step": 20398 + }, + { + "epoch": 2.56, + "grad_norm": 4.085023880004883, + "learning_rate": 2.932686273689495e-06, + "loss": 0.1873, + "step": 20399 + }, + { + "epoch": 2.56, + "grad_norm": 47.73721694946289, + "learning_rate": 2.9318495586328077e-06, + "loss": 1.5451, + "step": 20400 + }, + { + "epoch": 2.56, + "eval_loss": 0.08377895504236221, + "eval_runtime": 95.4086, + "eval_samples_per_second": 37.125, + "eval_steps_per_second": 37.125, + "step": 20400 + }, + { + "epoch": 2.56, + "grad_norm": 11.437736511230469, + "learning_rate": 2.9310128435761206e-06, + "loss": 0.221, + "step": 20401 + }, + { + "epoch": 2.56, + "grad_norm": 3.5437731742858887, + "learning_rate": 2.930176128519433e-06, + "loss": 0.152, + "step": 20402 + }, + { + "epoch": 2.56, + "grad_norm": 106.44786071777344, + "learning_rate": 2.9293394134627457e-06, + "loss": 0.878, + "step": 20403 + }, + { + "epoch": 2.56, + "grad_norm": 8.05051326751709, + "learning_rate": 2.928502698406058e-06, + "loss": 1.2269, + "step": 20404 + }, + { + "epoch": 2.56, + "grad_norm": 13.124874114990234, + "learning_rate": 2.9276659833493705e-06, + "loss": 0.6975, + "step": 20405 + }, + { + "epoch": 2.56, + "grad_norm": 17.221309661865234, + "learning_rate": 2.926829268292683e-06, + "loss": 0.6167, + "step": 20406 + }, + { + "epoch": 2.56, + "grad_norm": 13.0005464553833, + "learning_rate": 2.9259925532359957e-06, + "loss": 0.4355, + "step": 20407 + }, + { + "epoch": 2.56, + "grad_norm": 16.78082275390625, + "learning_rate": 2.9251558381793085e-06, + "loss": 0.7117, + "step": 20408 + }, + { + "epoch": 2.56, + "grad_norm": 26.075725555419922, + "learning_rate": 2.924319123122621e-06, + "loss": 1.4162, + "step": 20409 + }, + { + "epoch": 2.56, + "grad_norm": 25.144702911376953, + "learning_rate": 2.9234824080659337e-06, + "loss": 1.8183, + "step": 20410 + }, + { + "epoch": 2.56, + "grad_norm": 7.116442680358887, + "learning_rate": 2.922645693009246e-06, + "loss": 1.6661, + "step": 20411 + }, + { + "epoch": 2.56, + "grad_norm": 16.31544303894043, + "learning_rate": 2.9218089779525584e-06, + "loss": 1.7183, + "step": 20412 + }, + { + "epoch": 2.56, + "grad_norm": 14.379830360412598, + "learning_rate": 2.9209722628958708e-06, + "loss": 0.426, + "step": 20413 + }, + { + "epoch": 2.56, + "grad_norm": 6.947141647338867, + "learning_rate": 2.9201355478391836e-06, + "loss": 0.7284, + "step": 20414 + }, + { + "epoch": 2.56, + "grad_norm": 8.293611526489258, + "learning_rate": 2.919298832782496e-06, + "loss": 0.9619, + "step": 20415 + }, + { + "epoch": 2.56, + "grad_norm": 8.01059627532959, + "learning_rate": 2.9184621177258088e-06, + "loss": 0.3516, + "step": 20416 + }, + { + "epoch": 2.56, + "grad_norm": 22.62220573425293, + "learning_rate": 2.9176254026691216e-06, + "loss": 1.1697, + "step": 20417 + }, + { + "epoch": 2.56, + "grad_norm": 20.267784118652344, + "learning_rate": 2.916788687612434e-06, + "loss": 1.1011, + "step": 20418 + }, + { + "epoch": 2.56, + "grad_norm": 4.148131370544434, + "learning_rate": 2.9159519725557463e-06, + "loss": 0.3629, + "step": 20419 + }, + { + "epoch": 2.56, + "grad_norm": 38.091129302978516, + "learning_rate": 2.9151152574990587e-06, + "loss": 0.6832, + "step": 20420 + }, + { + "epoch": 2.56, + "grad_norm": 34.675601959228516, + "learning_rate": 2.9142785424423715e-06, + "loss": 1.2377, + "step": 20421 + }, + { + "epoch": 2.56, + "grad_norm": 30.98348045349121, + "learning_rate": 2.913441827385684e-06, + "loss": 1.6618, + "step": 20422 + }, + { + "epoch": 2.56, + "grad_norm": 13.842818260192871, + "learning_rate": 2.9126051123289967e-06, + "loss": 0.7118, + "step": 20423 + }, + { + "epoch": 2.56, + "grad_norm": 9.606385231018066, + "learning_rate": 2.9117683972723095e-06, + "loss": 0.9371, + "step": 20424 + }, + { + "epoch": 2.56, + "grad_norm": 29.835241317749023, + "learning_rate": 2.910931682215622e-06, + "loss": 0.8631, + "step": 20425 + }, + { + "epoch": 2.56, + "grad_norm": 23.59840965270996, + "learning_rate": 2.910094967158934e-06, + "loss": 1.1475, + "step": 20426 + }, + { + "epoch": 2.56, + "grad_norm": 22.209251403808594, + "learning_rate": 2.9092582521022466e-06, + "loss": 0.854, + "step": 20427 + }, + { + "epoch": 2.56, + "grad_norm": 15.59139347076416, + "learning_rate": 2.9084215370455594e-06, + "loss": 0.8305, + "step": 20428 + }, + { + "epoch": 2.56, + "grad_norm": 57.197265625, + "learning_rate": 2.907584821988872e-06, + "loss": 1.7459, + "step": 20429 + }, + { + "epoch": 2.56, + "grad_norm": 35.6335563659668, + "learning_rate": 2.9067481069321846e-06, + "loss": 2.1248, + "step": 20430 + }, + { + "epoch": 2.56, + "grad_norm": 19.992061614990234, + "learning_rate": 2.9059113918754974e-06, + "loss": 1.3376, + "step": 20431 + }, + { + "epoch": 2.56, + "grad_norm": 17.135761260986328, + "learning_rate": 2.9050746768188098e-06, + "loss": 0.8492, + "step": 20432 + }, + { + "epoch": 2.56, + "grad_norm": 14.199894905090332, + "learning_rate": 2.9042379617621217e-06, + "loss": 0.7577, + "step": 20433 + }, + { + "epoch": 2.56, + "grad_norm": 9.9739351272583, + "learning_rate": 2.9034012467054345e-06, + "loss": 0.4578, + "step": 20434 + }, + { + "epoch": 2.56, + "grad_norm": 19.584999084472656, + "learning_rate": 2.9025645316487473e-06, + "loss": 1.0605, + "step": 20435 + }, + { + "epoch": 2.56, + "grad_norm": 2.4521515369415283, + "learning_rate": 2.9017278165920597e-06, + "loss": 0.1298, + "step": 20436 + }, + { + "epoch": 2.56, + "grad_norm": 1.8404103517532349, + "learning_rate": 2.9008911015353725e-06, + "loss": 0.0307, + "step": 20437 + }, + { + "epoch": 2.56, + "grad_norm": 5.019158363342285, + "learning_rate": 2.900054386478685e-06, + "loss": 0.3227, + "step": 20438 + }, + { + "epoch": 2.57, + "grad_norm": 188.9952392578125, + "learning_rate": 2.8992176714219977e-06, + "loss": 1.2798, + "step": 20439 + }, + { + "epoch": 2.57, + "grad_norm": 9.685225486755371, + "learning_rate": 2.8983809563653097e-06, + "loss": 2.7265, + "step": 20440 + }, + { + "epoch": 2.57, + "grad_norm": 8.216483116149902, + "learning_rate": 2.8975442413086225e-06, + "loss": 0.3263, + "step": 20441 + }, + { + "epoch": 2.57, + "grad_norm": 25.34651756286621, + "learning_rate": 2.8967075262519353e-06, + "loss": 0.7347, + "step": 20442 + }, + { + "epoch": 2.57, + "grad_norm": 15.343186378479004, + "learning_rate": 2.8958708111952476e-06, + "loss": 0.7736, + "step": 20443 + }, + { + "epoch": 2.57, + "grad_norm": 10.678083419799805, + "learning_rate": 2.8950340961385604e-06, + "loss": 0.7355, + "step": 20444 + }, + { + "epoch": 2.57, + "grad_norm": 13.670865058898926, + "learning_rate": 2.894197381081873e-06, + "loss": 1.5327, + "step": 20445 + }, + { + "epoch": 2.57, + "grad_norm": 13.043383598327637, + "learning_rate": 2.8933606660251856e-06, + "loss": 0.8371, + "step": 20446 + }, + { + "epoch": 2.57, + "grad_norm": 15.65070629119873, + "learning_rate": 2.8925239509684976e-06, + "loss": 0.7348, + "step": 20447 + }, + { + "epoch": 2.57, + "grad_norm": 5.623363971710205, + "learning_rate": 2.8916872359118104e-06, + "loss": 0.5187, + "step": 20448 + }, + { + "epoch": 2.57, + "grad_norm": 13.084575653076172, + "learning_rate": 2.8908505208551228e-06, + "loss": 1.3617, + "step": 20449 + }, + { + "epoch": 2.57, + "grad_norm": 5.675920486450195, + "learning_rate": 2.8900138057984356e-06, + "loss": 0.2809, + "step": 20450 + }, + { + "epoch": 2.57, + "grad_norm": 15.445809364318848, + "learning_rate": 2.8891770907417484e-06, + "loss": 0.7837, + "step": 20451 + }, + { + "epoch": 2.57, + "grad_norm": 9.706493377685547, + "learning_rate": 2.8883403756850607e-06, + "loss": 2.1096, + "step": 20452 + }, + { + "epoch": 2.57, + "grad_norm": 12.382505416870117, + "learning_rate": 2.8875036606283735e-06, + "loss": 0.9842, + "step": 20453 + }, + { + "epoch": 2.57, + "grad_norm": 17.63967514038086, + "learning_rate": 2.8866669455716855e-06, + "loss": 0.5586, + "step": 20454 + }, + { + "epoch": 2.57, + "grad_norm": 6.769794464111328, + "learning_rate": 2.8858302305149983e-06, + "loss": 0.3738, + "step": 20455 + }, + { + "epoch": 2.57, + "grad_norm": 156.72103881835938, + "learning_rate": 2.8849935154583107e-06, + "loss": 2.5701, + "step": 20456 + }, + { + "epoch": 2.57, + "grad_norm": 7.69674015045166, + "learning_rate": 2.8841568004016235e-06, + "loss": 0.42, + "step": 20457 + }, + { + "epoch": 2.57, + "grad_norm": 6.638584613800049, + "learning_rate": 2.8833200853449363e-06, + "loss": 0.1432, + "step": 20458 + }, + { + "epoch": 2.57, + "grad_norm": 12.168359756469727, + "learning_rate": 2.8824833702882487e-06, + "loss": 0.616, + "step": 20459 + }, + { + "epoch": 2.57, + "grad_norm": 11.028854370117188, + "learning_rate": 2.8816466552315615e-06, + "loss": 0.3877, + "step": 20460 + }, + { + "epoch": 2.57, + "grad_norm": 16.257770538330078, + "learning_rate": 2.8808099401748734e-06, + "loss": 1.3306, + "step": 20461 + }, + { + "epoch": 2.57, + "grad_norm": 13.83847427368164, + "learning_rate": 2.8799732251181862e-06, + "loss": 1.0759, + "step": 20462 + }, + { + "epoch": 2.57, + "grad_norm": 7.097414970397949, + "learning_rate": 2.8791365100614986e-06, + "loss": 0.4918, + "step": 20463 + }, + { + "epoch": 2.57, + "grad_norm": 15.4981107711792, + "learning_rate": 2.8782997950048114e-06, + "loss": 0.5027, + "step": 20464 + }, + { + "epoch": 2.57, + "grad_norm": 8.238839149475098, + "learning_rate": 2.877463079948124e-06, + "loss": 0.2851, + "step": 20465 + }, + { + "epoch": 2.57, + "grad_norm": 53.47518539428711, + "learning_rate": 2.8766263648914366e-06, + "loss": 0.889, + "step": 20466 + }, + { + "epoch": 2.57, + "grad_norm": 8.977937698364258, + "learning_rate": 2.8757896498347494e-06, + "loss": 0.25, + "step": 20467 + }, + { + "epoch": 2.57, + "grad_norm": 36.48308563232422, + "learning_rate": 2.8749529347780613e-06, + "loss": 1.6408, + "step": 20468 + }, + { + "epoch": 2.57, + "grad_norm": 17.047109603881836, + "learning_rate": 2.874116219721374e-06, + "loss": 0.6356, + "step": 20469 + }, + { + "epoch": 2.57, + "grad_norm": 15.693798065185547, + "learning_rate": 2.8732795046646865e-06, + "loss": 1.3917, + "step": 20470 + }, + { + "epoch": 2.57, + "grad_norm": 71.5500259399414, + "learning_rate": 2.8724427896079993e-06, + "loss": 0.9439, + "step": 20471 + }, + { + "epoch": 2.57, + "grad_norm": 21.63862419128418, + "learning_rate": 2.871606074551312e-06, + "loss": 0.925, + "step": 20472 + }, + { + "epoch": 2.57, + "grad_norm": 84.76802825927734, + "learning_rate": 2.8707693594946245e-06, + "loss": 1.9239, + "step": 20473 + }, + { + "epoch": 2.57, + "grad_norm": 18.43046760559082, + "learning_rate": 2.8699326444379364e-06, + "loss": 0.9415, + "step": 20474 + }, + { + "epoch": 2.57, + "grad_norm": 31.983963012695312, + "learning_rate": 2.8690959293812492e-06, + "loss": 1.5056, + "step": 20475 + }, + { + "epoch": 2.57, + "grad_norm": 6.9821014404296875, + "learning_rate": 2.868259214324562e-06, + "loss": 1.4984, + "step": 20476 + }, + { + "epoch": 2.57, + "grad_norm": 7.553549766540527, + "learning_rate": 2.8674224992678744e-06, + "loss": 1.2257, + "step": 20477 + }, + { + "epoch": 2.57, + "grad_norm": 13.389091491699219, + "learning_rate": 2.8665857842111872e-06, + "loss": 1.2428, + "step": 20478 + }, + { + "epoch": 2.57, + "grad_norm": 67.30017852783203, + "learning_rate": 2.8657490691544996e-06, + "loss": 1.2801, + "step": 20479 + }, + { + "epoch": 2.57, + "grad_norm": 12.074796676635742, + "learning_rate": 2.8649123540978124e-06, + "loss": 0.6248, + "step": 20480 + }, + { + "epoch": 2.57, + "grad_norm": 8.72959041595459, + "learning_rate": 2.8640756390411244e-06, + "loss": 0.5387, + "step": 20481 + }, + { + "epoch": 2.57, + "grad_norm": 10.535429000854492, + "learning_rate": 2.863238923984437e-06, + "loss": 0.5849, + "step": 20482 + }, + { + "epoch": 2.57, + "grad_norm": 5.176032066345215, + "learning_rate": 2.86240220892775e-06, + "loss": 0.355, + "step": 20483 + }, + { + "epoch": 2.57, + "grad_norm": 4.493794918060303, + "learning_rate": 2.8615654938710623e-06, + "loss": 0.1702, + "step": 20484 + }, + { + "epoch": 2.57, + "grad_norm": 29.761999130249023, + "learning_rate": 2.860728778814375e-06, + "loss": 1.107, + "step": 20485 + }, + { + "epoch": 2.57, + "grad_norm": 13.900596618652344, + "learning_rate": 2.8598920637576875e-06, + "loss": 2.0491, + "step": 20486 + }, + { + "epoch": 2.57, + "grad_norm": 15.221857070922852, + "learning_rate": 2.8590553487010003e-06, + "loss": 0.3947, + "step": 20487 + }, + { + "epoch": 2.57, + "grad_norm": 44.226226806640625, + "learning_rate": 2.8582186336443123e-06, + "loss": 2.401, + "step": 20488 + }, + { + "epoch": 2.57, + "grad_norm": 16.93692970275879, + "learning_rate": 2.857381918587625e-06, + "loss": 1.8102, + "step": 20489 + }, + { + "epoch": 2.57, + "grad_norm": 16.462223052978516, + "learning_rate": 2.8565452035309375e-06, + "loss": 0.8821, + "step": 20490 + }, + { + "epoch": 2.57, + "grad_norm": 16.876440048217773, + "learning_rate": 2.8557084884742503e-06, + "loss": 0.4465, + "step": 20491 + }, + { + "epoch": 2.57, + "grad_norm": 11.117237091064453, + "learning_rate": 2.854871773417563e-06, + "loss": 0.4618, + "step": 20492 + }, + { + "epoch": 2.57, + "grad_norm": 22.775096893310547, + "learning_rate": 2.8540350583608754e-06, + "loss": 1.4381, + "step": 20493 + }, + { + "epoch": 2.57, + "grad_norm": 21.94539451599121, + "learning_rate": 2.8531983433041882e-06, + "loss": 1.6329, + "step": 20494 + }, + { + "epoch": 2.57, + "grad_norm": 15.146343231201172, + "learning_rate": 2.8523616282475e-06, + "loss": 0.6614, + "step": 20495 + }, + { + "epoch": 2.57, + "grad_norm": 9.918079376220703, + "learning_rate": 2.851524913190813e-06, + "loss": 0.4859, + "step": 20496 + }, + { + "epoch": 2.57, + "grad_norm": 18.03789520263672, + "learning_rate": 2.8506881981341254e-06, + "loss": 2.1463, + "step": 20497 + }, + { + "epoch": 2.57, + "grad_norm": 5.780584335327148, + "learning_rate": 2.849851483077438e-06, + "loss": 1.5282, + "step": 20498 + }, + { + "epoch": 2.57, + "grad_norm": 10.86877155303955, + "learning_rate": 2.849014768020751e-06, + "loss": 0.425, + "step": 20499 + }, + { + "epoch": 2.57, + "grad_norm": 8.41763973236084, + "learning_rate": 2.8481780529640634e-06, + "loss": 0.9491, + "step": 20500 + }, + { + "epoch": 2.57, + "grad_norm": 11.646855354309082, + "learning_rate": 2.847341337907376e-06, + "loss": 1.9672, + "step": 20501 + }, + { + "epoch": 2.57, + "grad_norm": 19.35943031311035, + "learning_rate": 2.846504622850688e-06, + "loss": 1.1775, + "step": 20502 + }, + { + "epoch": 2.57, + "grad_norm": 34.64932632446289, + "learning_rate": 2.845667907794001e-06, + "loss": 1.2362, + "step": 20503 + }, + { + "epoch": 2.57, + "grad_norm": 10.847381591796875, + "learning_rate": 2.8448311927373133e-06, + "loss": 1.8057, + "step": 20504 + }, + { + "epoch": 2.57, + "grad_norm": 4.648735523223877, + "learning_rate": 2.843994477680626e-06, + "loss": 0.1642, + "step": 20505 + }, + { + "epoch": 2.57, + "grad_norm": 29.921693801879883, + "learning_rate": 2.843157762623939e-06, + "loss": 1.4823, + "step": 20506 + }, + { + "epoch": 2.57, + "grad_norm": 21.329360961914062, + "learning_rate": 2.8423210475672513e-06, + "loss": 0.9152, + "step": 20507 + }, + { + "epoch": 2.57, + "grad_norm": 9.264352798461914, + "learning_rate": 2.841484332510564e-06, + "loss": 0.5901, + "step": 20508 + }, + { + "epoch": 2.57, + "grad_norm": 10.696232795715332, + "learning_rate": 2.840647617453876e-06, + "loss": 2.0568, + "step": 20509 + }, + { + "epoch": 2.57, + "grad_norm": 104.81800079345703, + "learning_rate": 2.839810902397189e-06, + "loss": 1.7419, + "step": 20510 + }, + { + "epoch": 2.57, + "grad_norm": 56.361446380615234, + "learning_rate": 2.8389741873405012e-06, + "loss": 2.2017, + "step": 20511 + }, + { + "epoch": 2.57, + "grad_norm": 21.71269989013672, + "learning_rate": 2.838137472283814e-06, + "loss": 3.0182, + "step": 20512 + }, + { + "epoch": 2.57, + "grad_norm": 17.92374038696289, + "learning_rate": 2.8373007572271264e-06, + "loss": 1.7016, + "step": 20513 + }, + { + "epoch": 2.57, + "grad_norm": 7.789238929748535, + "learning_rate": 2.836464042170439e-06, + "loss": 1.2383, + "step": 20514 + }, + { + "epoch": 2.57, + "grad_norm": 7.504380702972412, + "learning_rate": 2.835627327113752e-06, + "loss": 0.3392, + "step": 20515 + }, + { + "epoch": 2.57, + "grad_norm": 7.736235618591309, + "learning_rate": 2.834790612057064e-06, + "loss": 1.0989, + "step": 20516 + }, + { + "epoch": 2.57, + "grad_norm": 9.0857515335083, + "learning_rate": 2.8339538970003768e-06, + "loss": 1.3186, + "step": 20517 + }, + { + "epoch": 2.57, + "grad_norm": 172.78436279296875, + "learning_rate": 2.833117181943689e-06, + "loss": 2.1974, + "step": 20518 + }, + { + "epoch": 2.58, + "grad_norm": 25.050010681152344, + "learning_rate": 2.832280466887002e-06, + "loss": 0.5577, + "step": 20519 + }, + { + "epoch": 2.58, + "grad_norm": 3.163219451904297, + "learning_rate": 2.8314437518303143e-06, + "loss": 0.0758, + "step": 20520 + }, + { + "epoch": 2.58, + "grad_norm": 9.777112007141113, + "learning_rate": 2.830607036773627e-06, + "loss": 0.5397, + "step": 20521 + }, + { + "epoch": 2.58, + "grad_norm": 4.4373345375061035, + "learning_rate": 2.82977032171694e-06, + "loss": 0.1589, + "step": 20522 + }, + { + "epoch": 2.58, + "grad_norm": 12.718113899230957, + "learning_rate": 2.828933606660252e-06, + "loss": 0.3339, + "step": 20523 + }, + { + "epoch": 2.58, + "grad_norm": 17.83120346069336, + "learning_rate": 2.8280968916035647e-06, + "loss": 1.3957, + "step": 20524 + }, + { + "epoch": 2.58, + "grad_norm": 44.10799789428711, + "learning_rate": 2.827260176546877e-06, + "loss": 1.1544, + "step": 20525 + }, + { + "epoch": 2.58, + "grad_norm": 35.75393295288086, + "learning_rate": 2.82642346149019e-06, + "loss": 3.0461, + "step": 20526 + }, + { + "epoch": 2.58, + "grad_norm": 9.007635116577148, + "learning_rate": 2.8255867464335022e-06, + "loss": 0.9127, + "step": 20527 + }, + { + "epoch": 2.58, + "grad_norm": 10.208179473876953, + "learning_rate": 2.824750031376815e-06, + "loss": 0.5074, + "step": 20528 + }, + { + "epoch": 2.58, + "grad_norm": 33.458282470703125, + "learning_rate": 2.823913316320128e-06, + "loss": 1.4324, + "step": 20529 + }, + { + "epoch": 2.58, + "grad_norm": 5.232382297515869, + "learning_rate": 2.82307660126344e-06, + "loss": 0.4225, + "step": 20530 + }, + { + "epoch": 2.58, + "grad_norm": 19.59880828857422, + "learning_rate": 2.822239886206752e-06, + "loss": 1.464, + "step": 20531 + }, + { + "epoch": 2.58, + "grad_norm": 8.07803726196289, + "learning_rate": 2.821403171150065e-06, + "loss": 1.4669, + "step": 20532 + }, + { + "epoch": 2.58, + "grad_norm": 7.953825950622559, + "learning_rate": 2.8205664560933778e-06, + "loss": 0.4613, + "step": 20533 + }, + { + "epoch": 2.58, + "grad_norm": 22.497541427612305, + "learning_rate": 2.81972974103669e-06, + "loss": 1.5066, + "step": 20534 + }, + { + "epoch": 2.58, + "grad_norm": 107.39096069335938, + "learning_rate": 2.818893025980003e-06, + "loss": 2.8367, + "step": 20535 + }, + { + "epoch": 2.58, + "grad_norm": 18.257076263427734, + "learning_rate": 2.8180563109233158e-06, + "loss": 1.0713, + "step": 20536 + }, + { + "epoch": 2.58, + "grad_norm": 27.29205322265625, + "learning_rate": 2.8172195958666277e-06, + "loss": 1.4309, + "step": 20537 + }, + { + "epoch": 2.58, + "grad_norm": 11.839692115783691, + "learning_rate": 2.81638288080994e-06, + "loss": 0.8319, + "step": 20538 + }, + { + "epoch": 2.58, + "grad_norm": 8.173250198364258, + "learning_rate": 2.815546165753253e-06, + "loss": 0.2603, + "step": 20539 + }, + { + "epoch": 2.58, + "grad_norm": 10.04826831817627, + "learning_rate": 2.8147094506965657e-06, + "loss": 0.5927, + "step": 20540 + }, + { + "epoch": 2.58, + "grad_norm": 9.568615913391113, + "learning_rate": 2.813872735639878e-06, + "loss": 0.4649, + "step": 20541 + }, + { + "epoch": 2.58, + "grad_norm": 2.9093575477600098, + "learning_rate": 2.813036020583191e-06, + "loss": 0.1411, + "step": 20542 + }, + { + "epoch": 2.58, + "grad_norm": 514.5388793945312, + "learning_rate": 2.8121993055265033e-06, + "loss": 1.5976, + "step": 20543 + }, + { + "epoch": 2.58, + "grad_norm": 4.592569351196289, + "learning_rate": 2.8113625904698156e-06, + "loss": 0.1667, + "step": 20544 + }, + { + "epoch": 2.58, + "grad_norm": 14.131592750549316, + "learning_rate": 2.810525875413128e-06, + "loss": 0.8904, + "step": 20545 + }, + { + "epoch": 2.58, + "grad_norm": 11.625960350036621, + "learning_rate": 2.809689160356441e-06, + "loss": 0.429, + "step": 20546 + }, + { + "epoch": 2.58, + "grad_norm": 20.848854064941406, + "learning_rate": 2.8088524452997536e-06, + "loss": 1.1286, + "step": 20547 + }, + { + "epoch": 2.58, + "grad_norm": 70.20594024658203, + "learning_rate": 2.808015730243066e-06, + "loss": 2.5154, + "step": 20548 + }, + { + "epoch": 2.58, + "grad_norm": 4.269267559051514, + "learning_rate": 2.807179015186379e-06, + "loss": 0.3758, + "step": 20549 + }, + { + "epoch": 2.58, + "grad_norm": 6.539968013763428, + "learning_rate": 2.806342300129691e-06, + "loss": 0.0825, + "step": 20550 + }, + { + "epoch": 2.58, + "grad_norm": 17.589025497436523, + "learning_rate": 2.8055055850730036e-06, + "loss": 1.48, + "step": 20551 + }, + { + "epoch": 2.58, + "grad_norm": 19.457406997680664, + "learning_rate": 2.804668870016316e-06, + "loss": 0.7478, + "step": 20552 + }, + { + "epoch": 2.58, + "grad_norm": 22.284839630126953, + "learning_rate": 2.8038321549596287e-06, + "loss": 1.1945, + "step": 20553 + }, + { + "epoch": 2.58, + "grad_norm": 13.882135391235352, + "learning_rate": 2.802995439902941e-06, + "loss": 0.636, + "step": 20554 + }, + { + "epoch": 2.58, + "grad_norm": 60.36022186279297, + "learning_rate": 2.802158724846254e-06, + "loss": 1.6826, + "step": 20555 + }, + { + "epoch": 2.58, + "grad_norm": 24.295879364013672, + "learning_rate": 2.8013220097895667e-06, + "loss": 0.9273, + "step": 20556 + }, + { + "epoch": 2.58, + "grad_norm": 3.3162782192230225, + "learning_rate": 2.800485294732879e-06, + "loss": 0.1991, + "step": 20557 + }, + { + "epoch": 2.58, + "grad_norm": 5.362887859344482, + "learning_rate": 2.7996485796761915e-06, + "loss": 0.3085, + "step": 20558 + }, + { + "epoch": 2.58, + "grad_norm": 9.203067779541016, + "learning_rate": 2.798811864619504e-06, + "loss": 0.2045, + "step": 20559 + }, + { + "epoch": 2.58, + "grad_norm": 4.6134934425354, + "learning_rate": 2.7979751495628167e-06, + "loss": 0.432, + "step": 20560 + }, + { + "epoch": 2.58, + "grad_norm": 10.109713554382324, + "learning_rate": 2.797138434506129e-06, + "loss": 2.1859, + "step": 20561 + }, + { + "epoch": 2.58, + "grad_norm": 13.00163459777832, + "learning_rate": 2.796301719449442e-06, + "loss": 0.5977, + "step": 20562 + }, + { + "epoch": 2.58, + "grad_norm": 8.053282737731934, + "learning_rate": 2.7954650043927546e-06, + "loss": 1.171, + "step": 20563 + }, + { + "epoch": 2.58, + "grad_norm": 6.237462997436523, + "learning_rate": 2.7946282893360666e-06, + "loss": 0.3323, + "step": 20564 + }, + { + "epoch": 2.58, + "grad_norm": 15.753913879394531, + "learning_rate": 2.7937915742793794e-06, + "loss": 0.8277, + "step": 20565 + }, + { + "epoch": 2.58, + "grad_norm": 4.748144149780273, + "learning_rate": 2.7929548592226918e-06, + "loss": 0.2545, + "step": 20566 + }, + { + "epoch": 2.58, + "grad_norm": 7.599785327911377, + "learning_rate": 2.7921181441660046e-06, + "loss": 0.7825, + "step": 20567 + }, + { + "epoch": 2.58, + "grad_norm": 17.133930206298828, + "learning_rate": 2.791281429109317e-06, + "loss": 1.7605, + "step": 20568 + }, + { + "epoch": 2.58, + "grad_norm": 18.69016456604004, + "learning_rate": 2.7904447140526297e-06, + "loss": 1.2244, + "step": 20569 + }, + { + "epoch": 2.58, + "grad_norm": 10.473901748657227, + "learning_rate": 2.7896079989959426e-06, + "loss": 0.9986, + "step": 20570 + }, + { + "epoch": 2.58, + "grad_norm": 4.099677562713623, + "learning_rate": 2.7887712839392545e-06, + "loss": 0.3759, + "step": 20571 + }, + { + "epoch": 2.58, + "grad_norm": 30.630308151245117, + "learning_rate": 2.787934568882567e-06, + "loss": 1.5521, + "step": 20572 + }, + { + "epoch": 2.58, + "grad_norm": 28.276050567626953, + "learning_rate": 2.7870978538258797e-06, + "loss": 0.907, + "step": 20573 + }, + { + "epoch": 2.58, + "grad_norm": 7.844485282897949, + "learning_rate": 2.7862611387691925e-06, + "loss": 1.8047, + "step": 20574 + }, + { + "epoch": 2.58, + "grad_norm": 17.103439331054688, + "learning_rate": 2.785424423712505e-06, + "loss": 0.5604, + "step": 20575 + }, + { + "epoch": 2.58, + "grad_norm": 11.931612968444824, + "learning_rate": 2.7845877086558177e-06, + "loss": 0.6014, + "step": 20576 + }, + { + "epoch": 2.58, + "grad_norm": 14.460848808288574, + "learning_rate": 2.7837509935991305e-06, + "loss": 0.6772, + "step": 20577 + }, + { + "epoch": 2.58, + "grad_norm": 7.24898624420166, + "learning_rate": 2.7829142785424424e-06, + "loss": 0.5393, + "step": 20578 + }, + { + "epoch": 2.58, + "grad_norm": 8.05289363861084, + "learning_rate": 2.782077563485755e-06, + "loss": 0.3949, + "step": 20579 + }, + { + "epoch": 2.58, + "grad_norm": 10.37386417388916, + "learning_rate": 2.7812408484290676e-06, + "loss": 0.6299, + "step": 20580 + }, + { + "epoch": 2.58, + "grad_norm": 29.61414909362793, + "learning_rate": 2.7804041333723804e-06, + "loss": 1.3309, + "step": 20581 + }, + { + "epoch": 2.58, + "grad_norm": 14.088113784790039, + "learning_rate": 2.7795674183156928e-06, + "loss": 0.4617, + "step": 20582 + }, + { + "epoch": 2.58, + "grad_norm": 7.285881519317627, + "learning_rate": 2.7787307032590056e-06, + "loss": 0.5989, + "step": 20583 + }, + { + "epoch": 2.58, + "grad_norm": 12.213241577148438, + "learning_rate": 2.777893988202318e-06, + "loss": 0.6455, + "step": 20584 + }, + { + "epoch": 2.58, + "grad_norm": 8.219286918640137, + "learning_rate": 2.7770572731456303e-06, + "loss": 0.2536, + "step": 20585 + }, + { + "epoch": 2.58, + "grad_norm": 139.68544006347656, + "learning_rate": 2.7762205580889427e-06, + "loss": 0.7538, + "step": 20586 + }, + { + "epoch": 2.58, + "grad_norm": 16.023963928222656, + "learning_rate": 2.7753838430322555e-06, + "loss": 1.807, + "step": 20587 + }, + { + "epoch": 2.58, + "grad_norm": 3.934849262237549, + "learning_rate": 2.7745471279755683e-06, + "loss": 0.2923, + "step": 20588 + }, + { + "epoch": 2.58, + "grad_norm": 25.14888572692871, + "learning_rate": 2.7737104129188807e-06, + "loss": 1.7516, + "step": 20589 + }, + { + "epoch": 2.58, + "grad_norm": 11.77205753326416, + "learning_rate": 2.7728736978621935e-06, + "loss": 0.9633, + "step": 20590 + }, + { + "epoch": 2.58, + "grad_norm": 10.894165992736816, + "learning_rate": 2.772036982805506e-06, + "loss": 0.6362, + "step": 20591 + }, + { + "epoch": 2.58, + "grad_norm": 16.57927894592285, + "learning_rate": 2.7712002677488183e-06, + "loss": 0.4187, + "step": 20592 + }, + { + "epoch": 2.58, + "grad_norm": 15.53189754486084, + "learning_rate": 2.7703635526921306e-06, + "loss": 0.5672, + "step": 20593 + }, + { + "epoch": 2.58, + "grad_norm": 18.74542808532715, + "learning_rate": 2.7695268376354434e-06, + "loss": 0.7687, + "step": 20594 + }, + { + "epoch": 2.58, + "grad_norm": 29.633365631103516, + "learning_rate": 2.768690122578756e-06, + "loss": 0.8001, + "step": 20595 + }, + { + "epoch": 2.58, + "grad_norm": 10.552553176879883, + "learning_rate": 2.7678534075220686e-06, + "loss": 1.1081, + "step": 20596 + }, + { + "epoch": 2.58, + "grad_norm": 9.91150188446045, + "learning_rate": 2.7670166924653814e-06, + "loss": 0.1865, + "step": 20597 + }, + { + "epoch": 2.59, + "grad_norm": 45.85779571533203, + "learning_rate": 2.766179977408694e-06, + "loss": 3.4106, + "step": 20598 + }, + { + "epoch": 2.59, + "grad_norm": 20.061241149902344, + "learning_rate": 2.765343262352006e-06, + "loss": 0.3929, + "step": 20599 + }, + { + "epoch": 2.59, + "grad_norm": 26.48193359375, + "learning_rate": 2.7645065472953186e-06, + "loss": 0.6826, + "step": 20600 + }, + { + "epoch": 2.59, + "grad_norm": 10.189993858337402, + "learning_rate": 2.7636698322386314e-06, + "loss": 0.6115, + "step": 20601 + }, + { + "epoch": 2.59, + "grad_norm": 14.606910705566406, + "learning_rate": 2.7628331171819437e-06, + "loss": 0.6867, + "step": 20602 + }, + { + "epoch": 2.59, + "grad_norm": 3.479612112045288, + "learning_rate": 2.7619964021252565e-06, + "loss": 0.097, + "step": 20603 + }, + { + "epoch": 2.59, + "grad_norm": 12.336150169372559, + "learning_rate": 2.7611596870685693e-06, + "loss": 1.465, + "step": 20604 + }, + { + "epoch": 2.59, + "grad_norm": 6.281243801116943, + "learning_rate": 2.7603229720118817e-06, + "loss": 0.2678, + "step": 20605 + }, + { + "epoch": 2.59, + "grad_norm": 12.324202537536621, + "learning_rate": 2.759486256955194e-06, + "loss": 1.1456, + "step": 20606 + }, + { + "epoch": 2.59, + "grad_norm": 43.20732498168945, + "learning_rate": 2.7586495418985065e-06, + "loss": 2.068, + "step": 20607 + }, + { + "epoch": 2.59, + "grad_norm": 15.503894805908203, + "learning_rate": 2.7578128268418193e-06, + "loss": 1.0782, + "step": 20608 + }, + { + "epoch": 2.59, + "grad_norm": 106.66864013671875, + "learning_rate": 2.7569761117851317e-06, + "loss": 0.611, + "step": 20609 + }, + { + "epoch": 2.59, + "grad_norm": 91.14187622070312, + "learning_rate": 2.7561393967284445e-06, + "loss": 2.117, + "step": 20610 + }, + { + "epoch": 2.59, + "grad_norm": 14.441734313964844, + "learning_rate": 2.7553026816717573e-06, + "loss": 0.9908, + "step": 20611 + }, + { + "epoch": 2.59, + "grad_norm": 3.973798990249634, + "learning_rate": 2.7544659666150696e-06, + "loss": 0.1908, + "step": 20612 + }, + { + "epoch": 2.59, + "grad_norm": 15.52254581451416, + "learning_rate": 2.7536292515583816e-06, + "loss": 0.8581, + "step": 20613 + }, + { + "epoch": 2.59, + "grad_norm": 14.244958877563477, + "learning_rate": 2.7527925365016944e-06, + "loss": 1.3271, + "step": 20614 + }, + { + "epoch": 2.59, + "grad_norm": 14.75009822845459, + "learning_rate": 2.751955821445007e-06, + "loss": 1.0654, + "step": 20615 + }, + { + "epoch": 2.59, + "grad_norm": 3.933854103088379, + "learning_rate": 2.7511191063883196e-06, + "loss": 0.3563, + "step": 20616 + }, + { + "epoch": 2.59, + "grad_norm": 20.3519229888916, + "learning_rate": 2.7502823913316324e-06, + "loss": 1.71, + "step": 20617 + }, + { + "epoch": 2.59, + "grad_norm": 15.010092735290527, + "learning_rate": 2.7494456762749448e-06, + "loss": 0.6177, + "step": 20618 + }, + { + "epoch": 2.59, + "grad_norm": 6.581335544586182, + "learning_rate": 2.7486089612182576e-06, + "loss": 0.4071, + "step": 20619 + }, + { + "epoch": 2.59, + "grad_norm": 36.17319869995117, + "learning_rate": 2.7477722461615695e-06, + "loss": 1.2728, + "step": 20620 + }, + { + "epoch": 2.59, + "grad_norm": 28.553016662597656, + "learning_rate": 2.7469355311048823e-06, + "loss": 1.0801, + "step": 20621 + }, + { + "epoch": 2.59, + "grad_norm": 7.8814311027526855, + "learning_rate": 2.746098816048195e-06, + "loss": 0.2771, + "step": 20622 + }, + { + "epoch": 2.59, + "grad_norm": 31.427291870117188, + "learning_rate": 2.7452621009915075e-06, + "loss": 1.36, + "step": 20623 + }, + { + "epoch": 2.59, + "grad_norm": 17.095361709594727, + "learning_rate": 2.7444253859348203e-06, + "loss": 1.497, + "step": 20624 + }, + { + "epoch": 2.59, + "grad_norm": 4.985144138336182, + "learning_rate": 2.7435886708781327e-06, + "loss": 0.1673, + "step": 20625 + }, + { + "epoch": 2.59, + "grad_norm": 20.383758544921875, + "learning_rate": 2.7427519558214455e-06, + "loss": 1.276, + "step": 20626 + }, + { + "epoch": 2.59, + "grad_norm": 15.193656921386719, + "learning_rate": 2.7419152407647574e-06, + "loss": 1.4189, + "step": 20627 + }, + { + "epoch": 2.59, + "grad_norm": 5.478861331939697, + "learning_rate": 2.7410785257080702e-06, + "loss": 0.1422, + "step": 20628 + }, + { + "epoch": 2.59, + "grad_norm": 4.593821048736572, + "learning_rate": 2.740241810651383e-06, + "loss": 0.1697, + "step": 20629 + }, + { + "epoch": 2.59, + "grad_norm": 17.445035934448242, + "learning_rate": 2.7394050955946954e-06, + "loss": 0.7018, + "step": 20630 + }, + { + "epoch": 2.59, + "grad_norm": 10.315011978149414, + "learning_rate": 2.7385683805380082e-06, + "loss": 0.9595, + "step": 20631 + }, + { + "epoch": 2.59, + "grad_norm": 7.475622653961182, + "learning_rate": 2.7377316654813206e-06, + "loss": 2.1201, + "step": 20632 + }, + { + "epoch": 2.59, + "grad_norm": 15.363032341003418, + "learning_rate": 2.7368949504246334e-06, + "loss": 2.8288, + "step": 20633 + }, + { + "epoch": 2.59, + "grad_norm": 10.100433349609375, + "learning_rate": 2.7360582353679453e-06, + "loss": 1.7027, + "step": 20634 + }, + { + "epoch": 2.59, + "grad_norm": 7.44243049621582, + "learning_rate": 2.735221520311258e-06, + "loss": 0.7286, + "step": 20635 + }, + { + "epoch": 2.59, + "grad_norm": 13.29871940612793, + "learning_rate": 2.7343848052545705e-06, + "loss": 1.0735, + "step": 20636 + }, + { + "epoch": 2.59, + "grad_norm": 39.645599365234375, + "learning_rate": 2.7335480901978833e-06, + "loss": 0.4428, + "step": 20637 + }, + { + "epoch": 2.59, + "grad_norm": 10.973965644836426, + "learning_rate": 2.732711375141196e-06, + "loss": 1.4176, + "step": 20638 + }, + { + "epoch": 2.59, + "grad_norm": 9.956299781799316, + "learning_rate": 2.7318746600845085e-06, + "loss": 1.6256, + "step": 20639 + }, + { + "epoch": 2.59, + "grad_norm": 10.297553062438965, + "learning_rate": 2.7310379450278213e-06, + "loss": 0.2793, + "step": 20640 + }, + { + "epoch": 2.59, + "grad_norm": 5.664786338806152, + "learning_rate": 2.7302012299711333e-06, + "loss": 0.4077, + "step": 20641 + }, + { + "epoch": 2.59, + "grad_norm": 19.875402450561523, + "learning_rate": 2.729364514914446e-06, + "loss": 1.0372, + "step": 20642 + }, + { + "epoch": 2.59, + "grad_norm": 11.465612411499023, + "learning_rate": 2.7285277998577584e-06, + "loss": 0.5423, + "step": 20643 + }, + { + "epoch": 2.59, + "grad_norm": 17.82269859313965, + "learning_rate": 2.7276910848010712e-06, + "loss": 0.5204, + "step": 20644 + }, + { + "epoch": 2.59, + "grad_norm": 26.352035522460938, + "learning_rate": 2.726854369744384e-06, + "loss": 1.8282, + "step": 20645 + }, + { + "epoch": 2.59, + "grad_norm": 31.096120834350586, + "learning_rate": 2.7260176546876964e-06, + "loss": 1.3168, + "step": 20646 + }, + { + "epoch": 2.59, + "grad_norm": 12.857887268066406, + "learning_rate": 2.7251809396310092e-06, + "loss": 0.4651, + "step": 20647 + }, + { + "epoch": 2.59, + "grad_norm": 15.641700744628906, + "learning_rate": 2.724344224574321e-06, + "loss": 2.7795, + "step": 20648 + }, + { + "epoch": 2.59, + "grad_norm": 4.491549015045166, + "learning_rate": 2.723507509517634e-06, + "loss": 0.2071, + "step": 20649 + }, + { + "epoch": 2.59, + "grad_norm": 3.6830925941467285, + "learning_rate": 2.7226707944609464e-06, + "loss": 0.5324, + "step": 20650 + }, + { + "epoch": 2.59, + "grad_norm": 11.230381965637207, + "learning_rate": 2.721834079404259e-06, + "loss": 0.2106, + "step": 20651 + }, + { + "epoch": 2.59, + "grad_norm": 7.093985080718994, + "learning_rate": 2.720997364347572e-06, + "loss": 0.7449, + "step": 20652 + }, + { + "epoch": 2.59, + "grad_norm": 14.01636791229248, + "learning_rate": 2.7201606492908843e-06, + "loss": 0.2445, + "step": 20653 + }, + { + "epoch": 2.59, + "grad_norm": 8.7313871383667, + "learning_rate": 2.7193239342341963e-06, + "loss": 1.6454, + "step": 20654 + }, + { + "epoch": 2.59, + "grad_norm": 19.767362594604492, + "learning_rate": 2.718487219177509e-06, + "loss": 0.7345, + "step": 20655 + }, + { + "epoch": 2.59, + "grad_norm": 92.36847686767578, + "learning_rate": 2.717650504120822e-06, + "loss": 3.7043, + "step": 20656 + }, + { + "epoch": 2.59, + "grad_norm": 71.1680908203125, + "learning_rate": 2.7168137890641343e-06, + "loss": 1.4763, + "step": 20657 + }, + { + "epoch": 2.59, + "grad_norm": 4.164688587188721, + "learning_rate": 2.715977074007447e-06, + "loss": 0.3721, + "step": 20658 + }, + { + "epoch": 2.59, + "grad_norm": 31.245023727416992, + "learning_rate": 2.7151403589507595e-06, + "loss": 1.552, + "step": 20659 + }, + { + "epoch": 2.59, + "grad_norm": 21.062349319458008, + "learning_rate": 2.7143036438940723e-06, + "loss": 0.4429, + "step": 20660 + }, + { + "epoch": 2.59, + "grad_norm": 2.783233404159546, + "learning_rate": 2.7134669288373842e-06, + "loss": 0.1205, + "step": 20661 + }, + { + "epoch": 2.59, + "grad_norm": 10.794570922851562, + "learning_rate": 2.712630213780697e-06, + "loss": 1.1679, + "step": 20662 + }, + { + "epoch": 2.59, + "grad_norm": 9.779132843017578, + "learning_rate": 2.71179349872401e-06, + "loss": 0.6626, + "step": 20663 + }, + { + "epoch": 2.59, + "grad_norm": 17.937089920043945, + "learning_rate": 2.710956783667322e-06, + "loss": 0.7875, + "step": 20664 + }, + { + "epoch": 2.59, + "grad_norm": 7.253830909729004, + "learning_rate": 2.710120068610635e-06, + "loss": 0.4595, + "step": 20665 + }, + { + "epoch": 2.59, + "grad_norm": 26.90309715270996, + "learning_rate": 2.7092833535539474e-06, + "loss": 1.2822, + "step": 20666 + }, + { + "epoch": 2.59, + "grad_norm": 14.526180267333984, + "learning_rate": 2.70844663849726e-06, + "loss": 0.3683, + "step": 20667 + }, + { + "epoch": 2.59, + "grad_norm": 28.100963592529297, + "learning_rate": 2.707609923440572e-06, + "loss": 1.9676, + "step": 20668 + }, + { + "epoch": 2.59, + "grad_norm": 86.18717956542969, + "learning_rate": 2.706773208383885e-06, + "loss": 1.5185, + "step": 20669 + }, + { + "epoch": 2.59, + "grad_norm": 45.607810974121094, + "learning_rate": 2.7059364933271977e-06, + "loss": 3.1953, + "step": 20670 + }, + { + "epoch": 2.59, + "grad_norm": 5.311668872833252, + "learning_rate": 2.70509977827051e-06, + "loss": 0.4793, + "step": 20671 + }, + { + "epoch": 2.59, + "grad_norm": 3.7336161136627197, + "learning_rate": 2.704263063213823e-06, + "loss": 0.1693, + "step": 20672 + }, + { + "epoch": 2.59, + "grad_norm": 26.051963806152344, + "learning_rate": 2.7034263481571353e-06, + "loss": 1.4203, + "step": 20673 + }, + { + "epoch": 2.59, + "grad_norm": 13.890101432800293, + "learning_rate": 2.702589633100448e-06, + "loss": 0.6348, + "step": 20674 + }, + { + "epoch": 2.59, + "grad_norm": 14.046887397766113, + "learning_rate": 2.70175291804376e-06, + "loss": 0.6411, + "step": 20675 + }, + { + "epoch": 2.59, + "grad_norm": 66.71613311767578, + "learning_rate": 2.700916202987073e-06, + "loss": 1.0706, + "step": 20676 + }, + { + "epoch": 2.59, + "grad_norm": 9.172269821166992, + "learning_rate": 2.7000794879303852e-06, + "loss": 0.3427, + "step": 20677 + }, + { + "epoch": 2.6, + "grad_norm": 58.57754898071289, + "learning_rate": 2.699242772873698e-06, + "loss": 1.3343, + "step": 20678 + }, + { + "epoch": 2.6, + "grad_norm": 22.161231994628906, + "learning_rate": 2.698406057817011e-06, + "loss": 0.9301, + "step": 20679 + }, + { + "epoch": 2.6, + "grad_norm": 20.561626434326172, + "learning_rate": 2.6975693427603232e-06, + "loss": 1.6658, + "step": 20680 + }, + { + "epoch": 2.6, + "grad_norm": 14.982219696044922, + "learning_rate": 2.696732627703636e-06, + "loss": 1.2091, + "step": 20681 + }, + { + "epoch": 2.6, + "grad_norm": 16.142711639404297, + "learning_rate": 2.695895912646948e-06, + "loss": 0.5801, + "step": 20682 + }, + { + "epoch": 2.6, + "grad_norm": 11.208925247192383, + "learning_rate": 2.6950591975902608e-06, + "loss": 0.8101, + "step": 20683 + }, + { + "epoch": 2.6, + "grad_norm": 23.70677375793457, + "learning_rate": 2.694222482533573e-06, + "loss": 2.8049, + "step": 20684 + }, + { + "epoch": 2.6, + "grad_norm": 59.326507568359375, + "learning_rate": 2.693385767476886e-06, + "loss": 1.0946, + "step": 20685 + }, + { + "epoch": 2.6, + "grad_norm": 50.2004280090332, + "learning_rate": 2.6925490524201988e-06, + "loss": 1.8323, + "step": 20686 + }, + { + "epoch": 2.6, + "grad_norm": 19.800365447998047, + "learning_rate": 2.691712337363511e-06, + "loss": 1.8256, + "step": 20687 + }, + { + "epoch": 2.6, + "grad_norm": 743.9962158203125, + "learning_rate": 2.690875622306824e-06, + "loss": 1.9372, + "step": 20688 + }, + { + "epoch": 2.6, + "grad_norm": 20.274946212768555, + "learning_rate": 2.690038907250136e-06, + "loss": 1.0267, + "step": 20689 + }, + { + "epoch": 2.6, + "grad_norm": 18.31287384033203, + "learning_rate": 2.6892021921934487e-06, + "loss": 0.5088, + "step": 20690 + }, + { + "epoch": 2.6, + "grad_norm": 49.05681610107422, + "learning_rate": 2.688365477136761e-06, + "loss": 1.7442, + "step": 20691 + }, + { + "epoch": 2.6, + "grad_norm": 10.268267631530762, + "learning_rate": 2.687528762080074e-06, + "loss": 0.5306, + "step": 20692 + }, + { + "epoch": 2.6, + "grad_norm": 26.432613372802734, + "learning_rate": 2.6866920470233867e-06, + "loss": 0.9283, + "step": 20693 + }, + { + "epoch": 2.6, + "grad_norm": 2.937655210494995, + "learning_rate": 2.685855331966699e-06, + "loss": 0.2135, + "step": 20694 + }, + { + "epoch": 2.6, + "grad_norm": 8.145177841186523, + "learning_rate": 2.685018616910012e-06, + "loss": 0.2753, + "step": 20695 + }, + { + "epoch": 2.6, + "grad_norm": 13.875649452209473, + "learning_rate": 2.684181901853324e-06, + "loss": 0.5858, + "step": 20696 + }, + { + "epoch": 2.6, + "grad_norm": 41.01677703857422, + "learning_rate": 2.6833451867966366e-06, + "loss": 1.9673, + "step": 20697 + }, + { + "epoch": 2.6, + "grad_norm": 13.268399238586426, + "learning_rate": 2.682508471739949e-06, + "loss": 1.0622, + "step": 20698 + }, + { + "epoch": 2.6, + "grad_norm": 8.663495063781738, + "learning_rate": 2.681671756683262e-06, + "loss": 0.5457, + "step": 20699 + }, + { + "epoch": 2.6, + "grad_norm": 14.659046173095703, + "learning_rate": 2.680835041626574e-06, + "loss": 0.5068, + "step": 20700 + }, + { + "epoch": 2.6, + "grad_norm": 16.38235092163086, + "learning_rate": 2.679998326569887e-06, + "loss": 0.3942, + "step": 20701 + }, + { + "epoch": 2.6, + "grad_norm": 68.92447662353516, + "learning_rate": 2.6791616115131998e-06, + "loss": 1.9818, + "step": 20702 + }, + { + "epoch": 2.6, + "grad_norm": 2.925363779067993, + "learning_rate": 2.6783248964565117e-06, + "loss": 0.204, + "step": 20703 + }, + { + "epoch": 2.6, + "grad_norm": 222.7730712890625, + "learning_rate": 2.6774881813998245e-06, + "loss": 1.5032, + "step": 20704 + }, + { + "epoch": 2.6, + "grad_norm": 12.716299057006836, + "learning_rate": 2.676651466343137e-06, + "loss": 1.4752, + "step": 20705 + }, + { + "epoch": 2.6, + "grad_norm": 9.49696159362793, + "learning_rate": 2.6758147512864497e-06, + "loss": 0.743, + "step": 20706 + }, + { + "epoch": 2.6, + "grad_norm": 11.136838912963867, + "learning_rate": 2.674978036229762e-06, + "loss": 0.2524, + "step": 20707 + }, + { + "epoch": 2.6, + "grad_norm": 20.2912540435791, + "learning_rate": 2.674141321173075e-06, + "loss": 0.4454, + "step": 20708 + }, + { + "epoch": 2.6, + "grad_norm": 14.758130073547363, + "learning_rate": 2.6733046061163877e-06, + "loss": 1.7816, + "step": 20709 + }, + { + "epoch": 2.6, + "grad_norm": 16.230806350708008, + "learning_rate": 2.6724678910596997e-06, + "loss": 1.0516, + "step": 20710 + }, + { + "epoch": 2.6, + "grad_norm": 5.329649448394775, + "learning_rate": 2.6716311760030125e-06, + "loss": 0.1994, + "step": 20711 + }, + { + "epoch": 2.6, + "grad_norm": 74.24472045898438, + "learning_rate": 2.670794460946325e-06, + "loss": 1.2867, + "step": 20712 + }, + { + "epoch": 2.6, + "grad_norm": 3.160773515701294, + "learning_rate": 2.6699577458896376e-06, + "loss": 0.1505, + "step": 20713 + }, + { + "epoch": 2.6, + "grad_norm": 52.667510986328125, + "learning_rate": 2.66912103083295e-06, + "loss": 3.4129, + "step": 20714 + }, + { + "epoch": 2.6, + "grad_norm": 8.15816879272461, + "learning_rate": 2.668284315776263e-06, + "loss": 0.5077, + "step": 20715 + }, + { + "epoch": 2.6, + "grad_norm": 18.959095001220703, + "learning_rate": 2.6674476007195756e-06, + "loss": 1.0521, + "step": 20716 + }, + { + "epoch": 2.6, + "grad_norm": 14.158047676086426, + "learning_rate": 2.6666108856628876e-06, + "loss": 2.0452, + "step": 20717 + }, + { + "epoch": 2.6, + "grad_norm": 10.375843048095703, + "learning_rate": 2.6657741706062e-06, + "loss": 0.461, + "step": 20718 + }, + { + "epoch": 2.6, + "grad_norm": 14.602218627929688, + "learning_rate": 2.6649374555495127e-06, + "loss": 0.7571, + "step": 20719 + }, + { + "epoch": 2.6, + "grad_norm": 12.750484466552734, + "learning_rate": 2.6641007404928256e-06, + "loss": 2.059, + "step": 20720 + }, + { + "epoch": 2.6, + "grad_norm": 6.2839674949646, + "learning_rate": 2.663264025436138e-06, + "loss": 1.0687, + "step": 20721 + }, + { + "epoch": 2.6, + "grad_norm": 7.21674919128418, + "learning_rate": 2.6624273103794507e-06, + "loss": 0.4122, + "step": 20722 + }, + { + "epoch": 2.6, + "grad_norm": 12.052151679992676, + "learning_rate": 2.661590595322763e-06, + "loss": 0.6591, + "step": 20723 + }, + { + "epoch": 2.6, + "grad_norm": 12.120726585388184, + "learning_rate": 2.6607538802660755e-06, + "loss": 0.1971, + "step": 20724 + }, + { + "epoch": 2.6, + "grad_norm": 39.09234619140625, + "learning_rate": 2.659917165209388e-06, + "loss": 1.1153, + "step": 20725 + }, + { + "epoch": 2.6, + "grad_norm": 17.80829429626465, + "learning_rate": 2.6590804501527007e-06, + "loss": 3.4628, + "step": 20726 + }, + { + "epoch": 2.6, + "grad_norm": 27.43482208251953, + "learning_rate": 2.6582437350960135e-06, + "loss": 0.8496, + "step": 20727 + }, + { + "epoch": 2.6, + "grad_norm": 24.271743774414062, + "learning_rate": 2.657407020039326e-06, + "loss": 1.3373, + "step": 20728 + }, + { + "epoch": 2.6, + "grad_norm": 28.531177520751953, + "learning_rate": 2.6565703049826386e-06, + "loss": 1.9993, + "step": 20729 + }, + { + "epoch": 2.6, + "grad_norm": 29.271854400634766, + "learning_rate": 2.655733589925951e-06, + "loss": 1.1161, + "step": 20730 + }, + { + "epoch": 2.6, + "grad_norm": 9.77495288848877, + "learning_rate": 2.6548968748692634e-06, + "loss": 0.5064, + "step": 20731 + }, + { + "epoch": 2.6, + "grad_norm": 8.433061599731445, + "learning_rate": 2.6540601598125758e-06, + "loss": 0.5588, + "step": 20732 + }, + { + "epoch": 2.6, + "grad_norm": 16.231603622436523, + "learning_rate": 2.6532234447558886e-06, + "loss": 1.2543, + "step": 20733 + }, + { + "epoch": 2.6, + "grad_norm": 16.82666778564453, + "learning_rate": 2.6523867296992014e-06, + "loss": 0.6834, + "step": 20734 + }, + { + "epoch": 2.6, + "grad_norm": 34.19341278076172, + "learning_rate": 2.6515500146425138e-06, + "loss": 1.0255, + "step": 20735 + }, + { + "epoch": 2.6, + "grad_norm": 29.689189910888672, + "learning_rate": 2.6507132995858266e-06, + "loss": 0.9617, + "step": 20736 + }, + { + "epoch": 2.6, + "grad_norm": 17.329444885253906, + "learning_rate": 2.649876584529139e-06, + "loss": 0.4705, + "step": 20737 + }, + { + "epoch": 2.6, + "grad_norm": 10.019248008728027, + "learning_rate": 2.6490398694724513e-06, + "loss": 2.0945, + "step": 20738 + }, + { + "epoch": 2.6, + "grad_norm": 8.300518035888672, + "learning_rate": 2.6482031544157637e-06, + "loss": 1.34, + "step": 20739 + }, + { + "epoch": 2.6, + "grad_norm": 15.227872848510742, + "learning_rate": 2.6473664393590765e-06, + "loss": 1.0722, + "step": 20740 + }, + { + "epoch": 2.6, + "grad_norm": 22.152156829833984, + "learning_rate": 2.646529724302389e-06, + "loss": 0.3817, + "step": 20741 + }, + { + "epoch": 2.6, + "grad_norm": 30.404407501220703, + "learning_rate": 2.6456930092457017e-06, + "loss": 0.8752, + "step": 20742 + }, + { + "epoch": 2.6, + "grad_norm": 11.703365325927734, + "learning_rate": 2.6448562941890145e-06, + "loss": 1.8923, + "step": 20743 + }, + { + "epoch": 2.6, + "grad_norm": 18.769855499267578, + "learning_rate": 2.6440195791323264e-06, + "loss": 0.882, + "step": 20744 + }, + { + "epoch": 2.6, + "grad_norm": 8.443937301635742, + "learning_rate": 2.6431828640756392e-06, + "loss": 0.8397, + "step": 20745 + }, + { + "epoch": 2.6, + "grad_norm": 15.188029289245605, + "learning_rate": 2.6423461490189516e-06, + "loss": 1.2173, + "step": 20746 + }, + { + "epoch": 2.6, + "grad_norm": 17.374069213867188, + "learning_rate": 2.6415094339622644e-06, + "loss": 0.918, + "step": 20747 + }, + { + "epoch": 2.6, + "grad_norm": 7.522876262664795, + "learning_rate": 2.640672718905577e-06, + "loss": 0.703, + "step": 20748 + }, + { + "epoch": 2.6, + "grad_norm": 16.831457138061523, + "learning_rate": 2.6398360038488896e-06, + "loss": 1.0171, + "step": 20749 + }, + { + "epoch": 2.6, + "grad_norm": 19.82137107849121, + "learning_rate": 2.6389992887922024e-06, + "loss": 0.7962, + "step": 20750 + }, + { + "epoch": 2.6, + "grad_norm": 2.5512492656707764, + "learning_rate": 2.6381625737355144e-06, + "loss": 0.0613, + "step": 20751 + }, + { + "epoch": 2.6, + "grad_norm": 17.773103713989258, + "learning_rate": 2.637325858678827e-06, + "loss": 0.988, + "step": 20752 + }, + { + "epoch": 2.6, + "grad_norm": 20.804609298706055, + "learning_rate": 2.6364891436221395e-06, + "loss": 1.9218, + "step": 20753 + }, + { + "epoch": 2.6, + "grad_norm": 41.56431198120117, + "learning_rate": 2.6356524285654523e-06, + "loss": 0.4315, + "step": 20754 + }, + { + "epoch": 2.6, + "grad_norm": 22.21440887451172, + "learning_rate": 2.6348157135087647e-06, + "loss": 1.3783, + "step": 20755 + }, + { + "epoch": 2.6, + "grad_norm": 7.141530990600586, + "learning_rate": 2.6339789984520775e-06, + "loss": 0.5846, + "step": 20756 + }, + { + "epoch": 2.6, + "grad_norm": 4.123556613922119, + "learning_rate": 2.6331422833953903e-06, + "loss": 0.4207, + "step": 20757 + }, + { + "epoch": 2.61, + "grad_norm": 18.96649169921875, + "learning_rate": 2.6323055683387023e-06, + "loss": 0.8664, + "step": 20758 + }, + { + "epoch": 2.61, + "grad_norm": 11.451469421386719, + "learning_rate": 2.6314688532820147e-06, + "loss": 0.5508, + "step": 20759 + }, + { + "epoch": 2.61, + "grad_norm": 1.617785930633545, + "learning_rate": 2.6306321382253275e-06, + "loss": 0.0696, + "step": 20760 + }, + { + "epoch": 2.61, + "grad_norm": 2.501220226287842, + "learning_rate": 2.6297954231686403e-06, + "loss": 0.0922, + "step": 20761 + }, + { + "epoch": 2.61, + "grad_norm": 8.61351490020752, + "learning_rate": 2.6289587081119526e-06, + "loss": 1.9235, + "step": 20762 + }, + { + "epoch": 2.61, + "grad_norm": 33.558815002441406, + "learning_rate": 2.6281219930552654e-06, + "loss": 0.4547, + "step": 20763 + }, + { + "epoch": 2.61, + "grad_norm": 58.36433029174805, + "learning_rate": 2.627285277998578e-06, + "loss": 0.8337, + "step": 20764 + }, + { + "epoch": 2.61, + "grad_norm": 41.084014892578125, + "learning_rate": 2.62644856294189e-06, + "loss": 1.5015, + "step": 20765 + }, + { + "epoch": 2.61, + "grad_norm": 5.41900110244751, + "learning_rate": 2.6256118478852026e-06, + "loss": 0.1704, + "step": 20766 + }, + { + "epoch": 2.61, + "grad_norm": 14.344040870666504, + "learning_rate": 2.6247751328285154e-06, + "loss": 0.9146, + "step": 20767 + }, + { + "epoch": 2.61, + "grad_norm": 11.356485366821289, + "learning_rate": 2.623938417771828e-06, + "loss": 0.6074, + "step": 20768 + }, + { + "epoch": 2.61, + "grad_norm": 16.03408432006836, + "learning_rate": 2.6231017027151406e-06, + "loss": 1.0224, + "step": 20769 + }, + { + "epoch": 2.61, + "grad_norm": 10.44325065612793, + "learning_rate": 2.6222649876584534e-06, + "loss": 1.5056, + "step": 20770 + }, + { + "epoch": 2.61, + "grad_norm": 10.945624351501465, + "learning_rate": 2.6214282726017657e-06, + "loss": 0.9261, + "step": 20771 + }, + { + "epoch": 2.61, + "grad_norm": 37.368927001953125, + "learning_rate": 2.620591557545078e-06, + "loss": 2.5891, + "step": 20772 + }, + { + "epoch": 2.61, + "grad_norm": 21.220455169677734, + "learning_rate": 2.6197548424883905e-06, + "loss": 0.7615, + "step": 20773 + }, + { + "epoch": 2.61, + "grad_norm": 16.384571075439453, + "learning_rate": 2.6189181274317033e-06, + "loss": 1.7023, + "step": 20774 + }, + { + "epoch": 2.61, + "grad_norm": 20.78117561340332, + "learning_rate": 2.618081412375016e-06, + "loss": 1.7378, + "step": 20775 + }, + { + "epoch": 2.61, + "grad_norm": 21.69114112854004, + "learning_rate": 2.6172446973183285e-06, + "loss": 0.93, + "step": 20776 + }, + { + "epoch": 2.61, + "grad_norm": 17.99045753479004, + "learning_rate": 2.6164079822616413e-06, + "loss": 0.5936, + "step": 20777 + }, + { + "epoch": 2.61, + "grad_norm": 53.71336364746094, + "learning_rate": 2.6155712672049537e-06, + "loss": 2.641, + "step": 20778 + }, + { + "epoch": 2.61, + "grad_norm": 40.88047409057617, + "learning_rate": 2.614734552148266e-06, + "loss": 0.7631, + "step": 20779 + }, + { + "epoch": 2.61, + "grad_norm": 3.7886650562286377, + "learning_rate": 2.6138978370915784e-06, + "loss": 0.3358, + "step": 20780 + }, + { + "epoch": 2.61, + "grad_norm": 104.24272155761719, + "learning_rate": 2.6130611220348912e-06, + "loss": 1.4389, + "step": 20781 + }, + { + "epoch": 2.61, + "grad_norm": 10.23959732055664, + "learning_rate": 2.6122244069782036e-06, + "loss": 1.752, + "step": 20782 + }, + { + "epoch": 2.61, + "grad_norm": 18.32854461669922, + "learning_rate": 2.6113876919215164e-06, + "loss": 1.1634, + "step": 20783 + }, + { + "epoch": 2.61, + "grad_norm": 12.131913185119629, + "learning_rate": 2.610550976864829e-06, + "loss": 0.943, + "step": 20784 + }, + { + "epoch": 2.61, + "grad_norm": 4.605217456817627, + "learning_rate": 2.6097142618081416e-06, + "loss": 0.4482, + "step": 20785 + }, + { + "epoch": 2.61, + "grad_norm": 17.47404670715332, + "learning_rate": 2.608877546751454e-06, + "loss": 1.6336, + "step": 20786 + }, + { + "epoch": 2.61, + "grad_norm": 11.874710083007812, + "learning_rate": 2.6080408316947663e-06, + "loss": 0.8419, + "step": 20787 + }, + { + "epoch": 2.61, + "grad_norm": 38.75798797607422, + "learning_rate": 2.607204116638079e-06, + "loss": 2.9181, + "step": 20788 + }, + { + "epoch": 2.61, + "grad_norm": 37.21906661987305, + "learning_rate": 2.6063674015813915e-06, + "loss": 0.9856, + "step": 20789 + }, + { + "epoch": 2.61, + "grad_norm": 12.168079376220703, + "learning_rate": 2.6055306865247043e-06, + "loss": 0.9537, + "step": 20790 + }, + { + "epoch": 2.61, + "grad_norm": 4.899741172790527, + "learning_rate": 2.604693971468017e-06, + "loss": 0.594, + "step": 20791 + }, + { + "epoch": 2.61, + "grad_norm": 14.10857105255127, + "learning_rate": 2.6038572564113295e-06, + "loss": 0.7821, + "step": 20792 + }, + { + "epoch": 2.61, + "grad_norm": 42.20453643798828, + "learning_rate": 2.603020541354642e-06, + "loss": 1.8697, + "step": 20793 + }, + { + "epoch": 2.61, + "grad_norm": 18.523605346679688, + "learning_rate": 2.6021838262979542e-06, + "loss": 0.7861, + "step": 20794 + }, + { + "epoch": 2.61, + "grad_norm": 10.07702350616455, + "learning_rate": 2.601347111241267e-06, + "loss": 2.0771, + "step": 20795 + }, + { + "epoch": 2.61, + "grad_norm": 26.76637077331543, + "learning_rate": 2.6005103961845794e-06, + "loss": 1.5734, + "step": 20796 + }, + { + "epoch": 2.61, + "grad_norm": 12.20970630645752, + "learning_rate": 2.5996736811278922e-06, + "loss": 0.6719, + "step": 20797 + }, + { + "epoch": 2.61, + "grad_norm": 18.1197452545166, + "learning_rate": 2.598836966071205e-06, + "loss": 1.5184, + "step": 20798 + }, + { + "epoch": 2.61, + "grad_norm": 79.8078842163086, + "learning_rate": 2.5980002510145174e-06, + "loss": 1.9163, + "step": 20799 + }, + { + "epoch": 2.61, + "grad_norm": 11.29383659362793, + "learning_rate": 2.5971635359578294e-06, + "loss": 1.6318, + "step": 20800 + }, + { + "epoch": 2.61, + "eval_loss": 0.08012223988771439, + "eval_runtime": 95.7084, + "eval_samples_per_second": 37.008, + "eval_steps_per_second": 37.008, + "step": 20800 + }, + { + "epoch": 2.61, + "grad_norm": 27.131946563720703, + "learning_rate": 2.596326820901142e-06, + "loss": 0.7197, + "step": 20801 + }, + { + "epoch": 2.61, + "grad_norm": 8.807607650756836, + "learning_rate": 2.595490105844455e-06, + "loss": 0.3941, + "step": 20802 + }, + { + "epoch": 2.61, + "grad_norm": 35.305213928222656, + "learning_rate": 2.5946533907877673e-06, + "loss": 0.9097, + "step": 20803 + }, + { + "epoch": 2.61, + "grad_norm": 26.552886962890625, + "learning_rate": 2.59381667573108e-06, + "loss": 3.5247, + "step": 20804 + }, + { + "epoch": 2.61, + "grad_norm": 14.467986106872559, + "learning_rate": 2.5929799606743925e-06, + "loss": 1.6095, + "step": 20805 + }, + { + "epoch": 2.61, + "grad_norm": 16.631698608398438, + "learning_rate": 2.5921432456177053e-06, + "loss": 0.4805, + "step": 20806 + }, + { + "epoch": 2.61, + "grad_norm": 11.22146987915039, + "learning_rate": 2.5913065305610173e-06, + "loss": 0.6858, + "step": 20807 + }, + { + "epoch": 2.61, + "grad_norm": 4.90317440032959, + "learning_rate": 2.59046981550433e-06, + "loss": 0.1063, + "step": 20808 + }, + { + "epoch": 2.61, + "grad_norm": 12.939689636230469, + "learning_rate": 2.589633100447643e-06, + "loss": 0.3212, + "step": 20809 + }, + { + "epoch": 2.61, + "grad_norm": 10.119101524353027, + "learning_rate": 2.5887963853909553e-06, + "loss": 0.0638, + "step": 20810 + }, + { + "epoch": 2.61, + "grad_norm": 5.70673131942749, + "learning_rate": 2.587959670334268e-06, + "loss": 0.8487, + "step": 20811 + }, + { + "epoch": 2.61, + "grad_norm": 14.181622505187988, + "learning_rate": 2.5871229552775804e-06, + "loss": 1.0025, + "step": 20812 + }, + { + "epoch": 2.61, + "grad_norm": 291.0864562988281, + "learning_rate": 2.5862862402208932e-06, + "loss": 0.7056, + "step": 20813 + }, + { + "epoch": 2.61, + "grad_norm": 16.884033203125, + "learning_rate": 2.585449525164205e-06, + "loss": 0.8283, + "step": 20814 + }, + { + "epoch": 2.61, + "grad_norm": 13.720187187194824, + "learning_rate": 2.584612810107518e-06, + "loss": 0.6934, + "step": 20815 + }, + { + "epoch": 2.61, + "grad_norm": 27.1307315826416, + "learning_rate": 2.583776095050831e-06, + "loss": 0.8664, + "step": 20816 + }, + { + "epoch": 2.61, + "grad_norm": 15.082032203674316, + "learning_rate": 2.582939379994143e-06, + "loss": 0.5332, + "step": 20817 + }, + { + "epoch": 2.61, + "grad_norm": 18.318191528320312, + "learning_rate": 2.582102664937456e-06, + "loss": 0.9966, + "step": 20818 + }, + { + "epoch": 2.61, + "grad_norm": 6.35767126083374, + "learning_rate": 2.5812659498807684e-06, + "loss": 0.3516, + "step": 20819 + }, + { + "epoch": 2.61, + "grad_norm": 7.867492198944092, + "learning_rate": 2.580429234824081e-06, + "loss": 0.3813, + "step": 20820 + }, + { + "epoch": 2.61, + "grad_norm": 1.804530143737793, + "learning_rate": 2.579592519767393e-06, + "loss": 0.0702, + "step": 20821 + }, + { + "epoch": 2.61, + "grad_norm": 13.48747444152832, + "learning_rate": 2.578755804710706e-06, + "loss": 1.7444, + "step": 20822 + }, + { + "epoch": 2.61, + "grad_norm": 58.505714416503906, + "learning_rate": 2.5779190896540183e-06, + "loss": 0.99, + "step": 20823 + }, + { + "epoch": 2.61, + "grad_norm": 10.6947603225708, + "learning_rate": 2.577082374597331e-06, + "loss": 0.7755, + "step": 20824 + }, + { + "epoch": 2.61, + "grad_norm": 35.80773162841797, + "learning_rate": 2.576245659540644e-06, + "loss": 2.4354, + "step": 20825 + }, + { + "epoch": 2.61, + "grad_norm": 13.465628623962402, + "learning_rate": 2.5754089444839563e-06, + "loss": 0.59, + "step": 20826 + }, + { + "epoch": 2.61, + "grad_norm": 8.14944076538086, + "learning_rate": 2.5745722294272687e-06, + "loss": 2.6683, + "step": 20827 + }, + { + "epoch": 2.61, + "grad_norm": 21.839962005615234, + "learning_rate": 2.573735514370581e-06, + "loss": 1.3925, + "step": 20828 + }, + { + "epoch": 2.61, + "grad_norm": 20.817468643188477, + "learning_rate": 2.572898799313894e-06, + "loss": 2.7904, + "step": 20829 + }, + { + "epoch": 2.61, + "grad_norm": 26.890777587890625, + "learning_rate": 2.5720620842572062e-06, + "loss": 1.6562, + "step": 20830 + }, + { + "epoch": 2.61, + "grad_norm": 7.3743767738342285, + "learning_rate": 2.571225369200519e-06, + "loss": 0.5765, + "step": 20831 + }, + { + "epoch": 2.61, + "grad_norm": 104.87767028808594, + "learning_rate": 2.570388654143832e-06, + "loss": 0.805, + "step": 20832 + }, + { + "epoch": 2.61, + "grad_norm": 10.344313621520996, + "learning_rate": 2.569551939087144e-06, + "loss": 0.104, + "step": 20833 + }, + { + "epoch": 2.61, + "grad_norm": 39.46104431152344, + "learning_rate": 2.568715224030456e-06, + "loss": 1.033, + "step": 20834 + }, + { + "epoch": 2.61, + "grad_norm": 9.544102668762207, + "learning_rate": 2.567878508973769e-06, + "loss": 0.9579, + "step": 20835 + }, + { + "epoch": 2.61, + "grad_norm": 11.002922058105469, + "learning_rate": 2.5670417939170818e-06, + "loss": 0.6541, + "step": 20836 + }, + { + "epoch": 2.62, + "grad_norm": 105.73807525634766, + "learning_rate": 2.566205078860394e-06, + "loss": 1.4776, + "step": 20837 + }, + { + "epoch": 2.62, + "grad_norm": 8.665227890014648, + "learning_rate": 2.565368363803707e-06, + "loss": 0.6, + "step": 20838 + }, + { + "epoch": 2.62, + "grad_norm": 15.708761215209961, + "learning_rate": 2.5645316487470197e-06, + "loss": 1.0036, + "step": 20839 + }, + { + "epoch": 2.62, + "grad_norm": 10.720295906066895, + "learning_rate": 2.563694933690332e-06, + "loss": 0.848, + "step": 20840 + }, + { + "epoch": 2.62, + "grad_norm": 11.769108772277832, + "learning_rate": 2.562858218633644e-06, + "loss": 2.2958, + "step": 20841 + }, + { + "epoch": 2.62, + "grad_norm": 75.2760238647461, + "learning_rate": 2.562021503576957e-06, + "loss": 1.1938, + "step": 20842 + }, + { + "epoch": 2.62, + "grad_norm": 21.13252830505371, + "learning_rate": 2.5611847885202697e-06, + "loss": 1.2499, + "step": 20843 + }, + { + "epoch": 2.62, + "grad_norm": 19.715547561645508, + "learning_rate": 2.560348073463582e-06, + "loss": 1.2686, + "step": 20844 + }, + { + "epoch": 2.62, + "grad_norm": 17.419666290283203, + "learning_rate": 2.559511358406895e-06, + "loss": 0.7819, + "step": 20845 + }, + { + "epoch": 2.62, + "grad_norm": 15.564249038696289, + "learning_rate": 2.5586746433502072e-06, + "loss": 0.6963, + "step": 20846 + }, + { + "epoch": 2.62, + "grad_norm": 92.1093978881836, + "learning_rate": 2.55783792829352e-06, + "loss": 3.1407, + "step": 20847 + }, + { + "epoch": 2.62, + "grad_norm": 42.56705856323242, + "learning_rate": 2.557001213236832e-06, + "loss": 1.3469, + "step": 20848 + }, + { + "epoch": 2.62, + "grad_norm": 10.735899925231934, + "learning_rate": 2.556164498180145e-06, + "loss": 0.5852, + "step": 20849 + }, + { + "epoch": 2.62, + "grad_norm": 14.951603889465332, + "learning_rate": 2.5553277831234576e-06, + "loss": 0.3663, + "step": 20850 + }, + { + "epoch": 2.62, + "grad_norm": 13.693647384643555, + "learning_rate": 2.55449106806677e-06, + "loss": 0.6136, + "step": 20851 + }, + { + "epoch": 2.62, + "grad_norm": 10.393128395080566, + "learning_rate": 2.5536543530100828e-06, + "loss": 0.2843, + "step": 20852 + }, + { + "epoch": 2.62, + "grad_norm": 25.11324119567871, + "learning_rate": 2.552817637953395e-06, + "loss": 0.9215, + "step": 20853 + }, + { + "epoch": 2.62, + "grad_norm": 21.964876174926758, + "learning_rate": 2.551980922896708e-06, + "loss": 1.7843, + "step": 20854 + }, + { + "epoch": 2.62, + "grad_norm": 18.30719757080078, + "learning_rate": 2.55114420784002e-06, + "loss": 1.5675, + "step": 20855 + }, + { + "epoch": 2.62, + "grad_norm": 6.828226089477539, + "learning_rate": 2.5503074927833327e-06, + "loss": 0.9239, + "step": 20856 + }, + { + "epoch": 2.62, + "grad_norm": 30.28997802734375, + "learning_rate": 2.5494707777266455e-06, + "loss": 1.8447, + "step": 20857 + }, + { + "epoch": 2.62, + "grad_norm": 7.1686320304870605, + "learning_rate": 2.548634062669958e-06, + "loss": 0.6699, + "step": 20858 + }, + { + "epoch": 2.62, + "grad_norm": 10.375819206237793, + "learning_rate": 2.5477973476132707e-06, + "loss": 0.3918, + "step": 20859 + }, + { + "epoch": 2.62, + "grad_norm": 116.3935546875, + "learning_rate": 2.546960632556583e-06, + "loss": 0.8598, + "step": 20860 + }, + { + "epoch": 2.62, + "grad_norm": 43.87374496459961, + "learning_rate": 2.546123917499896e-06, + "loss": 1.1581, + "step": 20861 + }, + { + "epoch": 2.62, + "grad_norm": 46.51468276977539, + "learning_rate": 2.545287202443208e-06, + "loss": 1.136, + "step": 20862 + }, + { + "epoch": 2.62, + "grad_norm": 17.168806076049805, + "learning_rate": 2.5444504873865206e-06, + "loss": 1.3335, + "step": 20863 + }, + { + "epoch": 2.62, + "grad_norm": 6.863992691040039, + "learning_rate": 2.543613772329833e-06, + "loss": 0.3917, + "step": 20864 + }, + { + "epoch": 2.62, + "grad_norm": 36.62272644042969, + "learning_rate": 2.542777057273146e-06, + "loss": 1.0975, + "step": 20865 + }, + { + "epoch": 2.62, + "grad_norm": 60.48590087890625, + "learning_rate": 2.5419403422164586e-06, + "loss": 1.7068, + "step": 20866 + }, + { + "epoch": 2.62, + "grad_norm": 3.954254388809204, + "learning_rate": 2.541103627159771e-06, + "loss": 0.1199, + "step": 20867 + }, + { + "epoch": 2.62, + "grad_norm": 232.37388610839844, + "learning_rate": 2.540266912103084e-06, + "loss": 1.3999, + "step": 20868 + }, + { + "epoch": 2.62, + "grad_norm": 8.824796676635742, + "learning_rate": 2.5394301970463957e-06, + "loss": 0.4704, + "step": 20869 + }, + { + "epoch": 2.62, + "grad_norm": 2.114812135696411, + "learning_rate": 2.5385934819897086e-06, + "loss": 0.0959, + "step": 20870 + }, + { + "epoch": 2.62, + "grad_norm": 4.9958624839782715, + "learning_rate": 2.537756766933021e-06, + "loss": 0.1728, + "step": 20871 + }, + { + "epoch": 2.62, + "grad_norm": 14.779882431030273, + "learning_rate": 2.5369200518763337e-06, + "loss": 0.7662, + "step": 20872 + }, + { + "epoch": 2.62, + "grad_norm": 10.361798286437988, + "learning_rate": 2.5360833368196465e-06, + "loss": 1.5673, + "step": 20873 + }, + { + "epoch": 2.62, + "grad_norm": 6.132584571838379, + "learning_rate": 2.535246621762959e-06, + "loss": 0.5689, + "step": 20874 + }, + { + "epoch": 2.62, + "grad_norm": 13.274500846862793, + "learning_rate": 2.5344099067062717e-06, + "loss": 0.598, + "step": 20875 + }, + { + "epoch": 2.62, + "grad_norm": 12.637325286865234, + "learning_rate": 2.5335731916495837e-06, + "loss": 0.4602, + "step": 20876 + }, + { + "epoch": 2.62, + "grad_norm": 10.414531707763672, + "learning_rate": 2.5327364765928965e-06, + "loss": 0.3249, + "step": 20877 + }, + { + "epoch": 2.62, + "grad_norm": 7.475411415100098, + "learning_rate": 2.531899761536209e-06, + "loss": 1.3807, + "step": 20878 + }, + { + "epoch": 2.62, + "grad_norm": 34.25215148925781, + "learning_rate": 2.5310630464795216e-06, + "loss": 1.2075, + "step": 20879 + }, + { + "epoch": 2.62, + "grad_norm": 20.318708419799805, + "learning_rate": 2.5302263314228345e-06, + "loss": 2.0952, + "step": 20880 + }, + { + "epoch": 2.62, + "grad_norm": 16.875774383544922, + "learning_rate": 2.529389616366147e-06, + "loss": 2.1584, + "step": 20881 + }, + { + "epoch": 2.62, + "grad_norm": 13.094490051269531, + "learning_rate": 2.5285529013094596e-06, + "loss": 1.6302, + "step": 20882 + }, + { + "epoch": 2.62, + "grad_norm": 23.368240356445312, + "learning_rate": 2.5277161862527716e-06, + "loss": 0.7334, + "step": 20883 + }, + { + "epoch": 2.62, + "grad_norm": 10.924906730651855, + "learning_rate": 2.5268794711960844e-06, + "loss": 1.2668, + "step": 20884 + }, + { + "epoch": 2.62, + "grad_norm": 14.63503646850586, + "learning_rate": 2.5260427561393968e-06, + "loss": 0.242, + "step": 20885 + }, + { + "epoch": 2.62, + "grad_norm": 13.004878044128418, + "learning_rate": 2.5252060410827096e-06, + "loss": 1.1775, + "step": 20886 + }, + { + "epoch": 2.62, + "grad_norm": 8.729975700378418, + "learning_rate": 2.524369326026022e-06, + "loss": 1.2534, + "step": 20887 + }, + { + "epoch": 2.62, + "grad_norm": 24.22922706604004, + "learning_rate": 2.5235326109693347e-06, + "loss": 0.6346, + "step": 20888 + }, + { + "epoch": 2.62, + "grad_norm": 21.758121490478516, + "learning_rate": 2.5226958959126475e-06, + "loss": 0.6034, + "step": 20889 + }, + { + "epoch": 2.62, + "grad_norm": 14.686166763305664, + "learning_rate": 2.5218591808559595e-06, + "loss": 0.9316, + "step": 20890 + }, + { + "epoch": 2.62, + "grad_norm": 10.645651817321777, + "learning_rate": 2.5210224657992723e-06, + "loss": 0.7517, + "step": 20891 + }, + { + "epoch": 2.62, + "grad_norm": 3.477910280227661, + "learning_rate": 2.5201857507425847e-06, + "loss": 0.2708, + "step": 20892 + }, + { + "epoch": 2.62, + "grad_norm": 9.044672966003418, + "learning_rate": 2.5193490356858975e-06, + "loss": 1.2164, + "step": 20893 + }, + { + "epoch": 2.62, + "grad_norm": 26.40950584411621, + "learning_rate": 2.51851232062921e-06, + "loss": 1.8651, + "step": 20894 + }, + { + "epoch": 2.62, + "grad_norm": 11.842805862426758, + "learning_rate": 2.5176756055725227e-06, + "loss": 0.5844, + "step": 20895 + }, + { + "epoch": 2.62, + "grad_norm": 15.054704666137695, + "learning_rate": 2.5168388905158355e-06, + "loss": 0.3755, + "step": 20896 + }, + { + "epoch": 2.62, + "grad_norm": 12.256747245788574, + "learning_rate": 2.5160021754591474e-06, + "loss": 1.0146, + "step": 20897 + }, + { + "epoch": 2.62, + "grad_norm": 11.8973970413208, + "learning_rate": 2.51516546040246e-06, + "loss": 0.7053, + "step": 20898 + }, + { + "epoch": 2.62, + "grad_norm": 9.270255088806152, + "learning_rate": 2.5143287453457726e-06, + "loss": 0.736, + "step": 20899 + }, + { + "epoch": 2.62, + "grad_norm": 15.950983047485352, + "learning_rate": 2.5134920302890854e-06, + "loss": 0.7235, + "step": 20900 + }, + { + "epoch": 2.62, + "grad_norm": 4.982297897338867, + "learning_rate": 2.5126553152323978e-06, + "loss": 0.5387, + "step": 20901 + }, + { + "epoch": 2.62, + "grad_norm": 8.136774063110352, + "learning_rate": 2.5118186001757106e-06, + "loss": 0.3629, + "step": 20902 + }, + { + "epoch": 2.62, + "grad_norm": 26.423789978027344, + "learning_rate": 2.5109818851190234e-06, + "loss": 1.4861, + "step": 20903 + }, + { + "epoch": 2.62, + "grad_norm": 26.003742218017578, + "learning_rate": 2.5101451700623353e-06, + "loss": 0.5039, + "step": 20904 + }, + { + "epoch": 2.62, + "grad_norm": 3.174650192260742, + "learning_rate": 2.5093084550056477e-06, + "loss": 0.0985, + "step": 20905 + }, + { + "epoch": 2.62, + "grad_norm": 23.96674346923828, + "learning_rate": 2.5084717399489605e-06, + "loss": 2.6189, + "step": 20906 + }, + { + "epoch": 2.62, + "grad_norm": 20.15669059753418, + "learning_rate": 2.5076350248922733e-06, + "loss": 1.8705, + "step": 20907 + }, + { + "epoch": 2.62, + "grad_norm": 4.903242588043213, + "learning_rate": 2.5067983098355857e-06, + "loss": 0.8006, + "step": 20908 + }, + { + "epoch": 2.62, + "grad_norm": 36.92401885986328, + "learning_rate": 2.5059615947788985e-06, + "loss": 1.8831, + "step": 20909 + }, + { + "epoch": 2.62, + "grad_norm": 11.049201965332031, + "learning_rate": 2.505124879722211e-06, + "loss": 1.0461, + "step": 20910 + }, + { + "epoch": 2.62, + "grad_norm": 11.211837768554688, + "learning_rate": 2.5042881646655233e-06, + "loss": 0.9679, + "step": 20911 + }, + { + "epoch": 2.62, + "grad_norm": 27.66070556640625, + "learning_rate": 2.5034514496088356e-06, + "loss": 1.9762, + "step": 20912 + }, + { + "epoch": 2.62, + "grad_norm": 6.565810680389404, + "learning_rate": 2.5026147345521484e-06, + "loss": 0.1723, + "step": 20913 + }, + { + "epoch": 2.62, + "grad_norm": 11.280132293701172, + "learning_rate": 2.5017780194954612e-06, + "loss": 1.4626, + "step": 20914 + }, + { + "epoch": 2.62, + "grad_norm": 34.04676818847656, + "learning_rate": 2.5009413044387736e-06, + "loss": 0.9478, + "step": 20915 + }, + { + "epoch": 2.62, + "grad_norm": 31.019742965698242, + "learning_rate": 2.5001045893820864e-06, + "loss": 0.7875, + "step": 20916 + }, + { + "epoch": 2.63, + "grad_norm": 16.54743194580078, + "learning_rate": 2.499267874325399e-06, + "loss": 0.9548, + "step": 20917 + }, + { + "epoch": 2.63, + "grad_norm": 22.695558547973633, + "learning_rate": 2.498431159268711e-06, + "loss": 3.3674, + "step": 20918 + }, + { + "epoch": 2.63, + "grad_norm": 8.537609100341797, + "learning_rate": 2.4975944442120236e-06, + "loss": 0.4449, + "step": 20919 + }, + { + "epoch": 2.63, + "grad_norm": 5.5578932762146, + "learning_rate": 2.4967577291553364e-06, + "loss": 0.6365, + "step": 20920 + }, + { + "epoch": 2.63, + "grad_norm": 69.54683685302734, + "learning_rate": 2.495921014098649e-06, + "loss": 2.8953, + "step": 20921 + }, + { + "epoch": 2.63, + "grad_norm": 19.73796844482422, + "learning_rate": 2.4950842990419615e-06, + "loss": 2.6961, + "step": 20922 + }, + { + "epoch": 2.63, + "grad_norm": 20.43222999572754, + "learning_rate": 2.494247583985274e-06, + "loss": 1.7699, + "step": 20923 + }, + { + "epoch": 2.63, + "grad_norm": 16.330984115600586, + "learning_rate": 2.4934108689285867e-06, + "loss": 0.7528, + "step": 20924 + }, + { + "epoch": 2.63, + "grad_norm": 9.302873611450195, + "learning_rate": 2.492574153871899e-06, + "loss": 0.2775, + "step": 20925 + }, + { + "epoch": 2.63, + "grad_norm": 17.74481201171875, + "learning_rate": 2.4917374388152115e-06, + "loss": 0.7376, + "step": 20926 + }, + { + "epoch": 2.63, + "grad_norm": 21.127634048461914, + "learning_rate": 2.4909007237585243e-06, + "loss": 0.8145, + "step": 20927 + }, + { + "epoch": 2.63, + "grad_norm": 15.73021125793457, + "learning_rate": 2.4900640087018367e-06, + "loss": 1.2041, + "step": 20928 + }, + { + "epoch": 2.63, + "grad_norm": 11.025052070617676, + "learning_rate": 2.4892272936451495e-06, + "loss": 1.8824, + "step": 20929 + }, + { + "epoch": 2.63, + "grad_norm": 17.70407485961914, + "learning_rate": 2.488390578588462e-06, + "loss": 0.5918, + "step": 20930 + }, + { + "epoch": 2.63, + "grad_norm": 19.538415908813477, + "learning_rate": 2.4875538635317746e-06, + "loss": 0.3756, + "step": 20931 + }, + { + "epoch": 2.63, + "grad_norm": 12.627802848815918, + "learning_rate": 2.486717148475087e-06, + "loss": 0.7633, + "step": 20932 + }, + { + "epoch": 2.63, + "grad_norm": 23.011611938476562, + "learning_rate": 2.4858804334183994e-06, + "loss": 1.0695, + "step": 20933 + }, + { + "epoch": 2.63, + "grad_norm": 4.7653117179870605, + "learning_rate": 2.485043718361712e-06, + "loss": 1.0073, + "step": 20934 + }, + { + "epoch": 2.63, + "grad_norm": 12.809354782104492, + "learning_rate": 2.4842070033050246e-06, + "loss": 1.3837, + "step": 20935 + }, + { + "epoch": 2.63, + "grad_norm": 3.444727659225464, + "learning_rate": 2.4833702882483374e-06, + "loss": 0.1017, + "step": 20936 + }, + { + "epoch": 2.63, + "grad_norm": 8.391605377197266, + "learning_rate": 2.4825335731916498e-06, + "loss": 0.39, + "step": 20937 + }, + { + "epoch": 2.63, + "grad_norm": 33.942481994628906, + "learning_rate": 2.4816968581349626e-06, + "loss": 1.4194, + "step": 20938 + }, + { + "epoch": 2.63, + "grad_norm": 4.683876991271973, + "learning_rate": 2.480860143078275e-06, + "loss": 0.0859, + "step": 20939 + }, + { + "epoch": 2.63, + "grad_norm": 6.307435035705566, + "learning_rate": 2.4800234280215873e-06, + "loss": 0.4947, + "step": 20940 + }, + { + "epoch": 2.63, + "grad_norm": 10.36919116973877, + "learning_rate": 2.4791867129649e-06, + "loss": 1.7781, + "step": 20941 + }, + { + "epoch": 2.63, + "grad_norm": 1.4842617511749268, + "learning_rate": 2.4783499979082125e-06, + "loss": 0.0286, + "step": 20942 + }, + { + "epoch": 2.63, + "grad_norm": 4.505274295806885, + "learning_rate": 2.477513282851525e-06, + "loss": 0.136, + "step": 20943 + }, + { + "epoch": 2.63, + "grad_norm": 19.212722778320312, + "learning_rate": 2.4766765677948377e-06, + "loss": 2.0218, + "step": 20944 + }, + { + "epoch": 2.63, + "grad_norm": 88.36568450927734, + "learning_rate": 2.47583985273815e-06, + "loss": 0.3264, + "step": 20945 + }, + { + "epoch": 2.63, + "grad_norm": 19.913543701171875, + "learning_rate": 2.475003137681463e-06, + "loss": 1.0815, + "step": 20946 + }, + { + "epoch": 2.63, + "grad_norm": 24.3832950592041, + "learning_rate": 2.4741664226247752e-06, + "loss": 1.1981, + "step": 20947 + }, + { + "epoch": 2.63, + "grad_norm": 21.547956466674805, + "learning_rate": 2.473329707568088e-06, + "loss": 1.0862, + "step": 20948 + }, + { + "epoch": 2.63, + "grad_norm": 6.589284420013428, + "learning_rate": 2.4724929925114004e-06, + "loss": 0.2015, + "step": 20949 + }, + { + "epoch": 2.63, + "grad_norm": 1.2801471948623657, + "learning_rate": 2.471656277454713e-06, + "loss": 0.0567, + "step": 20950 + }, + { + "epoch": 2.63, + "grad_norm": 134.08868408203125, + "learning_rate": 2.4708195623980256e-06, + "loss": 1.8712, + "step": 20951 + }, + { + "epoch": 2.63, + "grad_norm": 10.042970657348633, + "learning_rate": 2.469982847341338e-06, + "loss": 2.0202, + "step": 20952 + }, + { + "epoch": 2.63, + "grad_norm": 21.065839767456055, + "learning_rate": 2.4691461322846508e-06, + "loss": 0.4961, + "step": 20953 + }, + { + "epoch": 2.63, + "grad_norm": 14.301145553588867, + "learning_rate": 2.468309417227963e-06, + "loss": 0.8794, + "step": 20954 + }, + { + "epoch": 2.63, + "grad_norm": 8.55249309539795, + "learning_rate": 2.467472702171276e-06, + "loss": 0.1541, + "step": 20955 + }, + { + "epoch": 2.63, + "grad_norm": 11.381338119506836, + "learning_rate": 2.4666359871145883e-06, + "loss": 0.7351, + "step": 20956 + }, + { + "epoch": 2.63, + "grad_norm": 60.15916442871094, + "learning_rate": 2.4657992720579007e-06, + "loss": 1.2717, + "step": 20957 + }, + { + "epoch": 2.63, + "grad_norm": 5.352663993835449, + "learning_rate": 2.4649625570012135e-06, + "loss": 0.6574, + "step": 20958 + }, + { + "epoch": 2.63, + "grad_norm": 5.918491363525391, + "learning_rate": 2.464125841944526e-06, + "loss": 0.7593, + "step": 20959 + }, + { + "epoch": 2.63, + "grad_norm": 13.636199951171875, + "learning_rate": 2.4632891268878387e-06, + "loss": 0.521, + "step": 20960 + }, + { + "epoch": 2.63, + "grad_norm": 9.359580039978027, + "learning_rate": 2.462452411831151e-06, + "loss": 1.1243, + "step": 20961 + }, + { + "epoch": 2.63, + "grad_norm": 17.589763641357422, + "learning_rate": 2.461615696774464e-06, + "loss": 1.5709, + "step": 20962 + }, + { + "epoch": 2.63, + "grad_norm": 10.143199920654297, + "learning_rate": 2.4607789817177762e-06, + "loss": 0.4982, + "step": 20963 + }, + { + "epoch": 2.63, + "grad_norm": 11.696372032165527, + "learning_rate": 2.4599422666610886e-06, + "loss": 0.6827, + "step": 20964 + }, + { + "epoch": 2.63, + "grad_norm": 26.767925262451172, + "learning_rate": 2.4591055516044014e-06, + "loss": 1.0194, + "step": 20965 + }, + { + "epoch": 2.63, + "grad_norm": 14.609832763671875, + "learning_rate": 2.458268836547714e-06, + "loss": 0.858, + "step": 20966 + }, + { + "epoch": 2.63, + "grad_norm": 10.931877136230469, + "learning_rate": 2.4574321214910266e-06, + "loss": 0.2819, + "step": 20967 + }, + { + "epoch": 2.63, + "grad_norm": 16.908096313476562, + "learning_rate": 2.456595406434339e-06, + "loss": 0.8879, + "step": 20968 + }, + { + "epoch": 2.63, + "grad_norm": 23.534486770629883, + "learning_rate": 2.4557586913776514e-06, + "loss": 2.2925, + "step": 20969 + }, + { + "epoch": 2.63, + "grad_norm": 13.532764434814453, + "learning_rate": 2.454921976320964e-06, + "loss": 0.3926, + "step": 20970 + }, + { + "epoch": 2.63, + "grad_norm": 8.230018615722656, + "learning_rate": 2.4540852612642765e-06, + "loss": 0.7927, + "step": 20971 + }, + { + "epoch": 2.63, + "grad_norm": 15.438993453979492, + "learning_rate": 2.4532485462075893e-06, + "loss": 1.0728, + "step": 20972 + }, + { + "epoch": 2.63, + "grad_norm": 34.309425354003906, + "learning_rate": 2.4524118311509017e-06, + "loss": 0.722, + "step": 20973 + }, + { + "epoch": 2.63, + "grad_norm": 0.6777411103248596, + "learning_rate": 2.4515751160942145e-06, + "loss": 0.0491, + "step": 20974 + }, + { + "epoch": 2.63, + "grad_norm": 7.284146785736084, + "learning_rate": 2.450738401037527e-06, + "loss": 0.1572, + "step": 20975 + }, + { + "epoch": 2.63, + "grad_norm": 3.39168119430542, + "learning_rate": 2.4499016859808393e-06, + "loss": 0.2688, + "step": 20976 + }, + { + "epoch": 2.63, + "grad_norm": 17.017398834228516, + "learning_rate": 2.449064970924152e-06, + "loss": 1.1687, + "step": 20977 + }, + { + "epoch": 2.63, + "grad_norm": 125.64471435546875, + "learning_rate": 2.4482282558674645e-06, + "loss": 0.9681, + "step": 20978 + }, + { + "epoch": 2.63, + "grad_norm": 8.142346382141113, + "learning_rate": 2.4473915408107773e-06, + "loss": 1.0965, + "step": 20979 + }, + { + "epoch": 2.63, + "grad_norm": 10.373797416687012, + "learning_rate": 2.4465548257540896e-06, + "loss": 0.3774, + "step": 20980 + }, + { + "epoch": 2.63, + "grad_norm": 15.762367248535156, + "learning_rate": 2.4457181106974024e-06, + "loss": 0.6187, + "step": 20981 + }, + { + "epoch": 2.63, + "grad_norm": 16.356544494628906, + "learning_rate": 2.444881395640715e-06, + "loss": 2.3165, + "step": 20982 + }, + { + "epoch": 2.63, + "grad_norm": 16.276365280151367, + "learning_rate": 2.444044680584027e-06, + "loss": 0.754, + "step": 20983 + }, + { + "epoch": 2.63, + "grad_norm": 93.12158203125, + "learning_rate": 2.44320796552734e-06, + "loss": 1.9056, + "step": 20984 + }, + { + "epoch": 2.63, + "grad_norm": 17.247848510742188, + "learning_rate": 2.4423712504706524e-06, + "loss": 0.5727, + "step": 20985 + }, + { + "epoch": 2.63, + "grad_norm": 33.041507720947266, + "learning_rate": 2.4415345354139648e-06, + "loss": 0.3893, + "step": 20986 + }, + { + "epoch": 2.63, + "grad_norm": 26.996477127075195, + "learning_rate": 2.4406978203572776e-06, + "loss": 1.1712, + "step": 20987 + }, + { + "epoch": 2.63, + "grad_norm": 14.81056022644043, + "learning_rate": 2.43986110530059e-06, + "loss": 1.5086, + "step": 20988 + }, + { + "epoch": 2.63, + "grad_norm": 35.87815856933594, + "learning_rate": 2.4390243902439027e-06, + "loss": 1.5195, + "step": 20989 + }, + { + "epoch": 2.63, + "grad_norm": 13.832409858703613, + "learning_rate": 2.438187675187215e-06, + "loss": 1.307, + "step": 20990 + }, + { + "epoch": 2.63, + "grad_norm": 22.23716163635254, + "learning_rate": 2.437350960130528e-06, + "loss": 2.4104, + "step": 20991 + }, + { + "epoch": 2.63, + "grad_norm": 7.499285697937012, + "learning_rate": 2.4365142450738403e-06, + "loss": 0.1659, + "step": 20992 + }, + { + "epoch": 2.63, + "grad_norm": 7.612830638885498, + "learning_rate": 2.4356775300171527e-06, + "loss": 0.8796, + "step": 20993 + }, + { + "epoch": 2.63, + "grad_norm": 7.417469501495361, + "learning_rate": 2.4348408149604655e-06, + "loss": 0.7916, + "step": 20994 + }, + { + "epoch": 2.63, + "grad_norm": 7.098451614379883, + "learning_rate": 2.434004099903778e-06, + "loss": 2.1647, + "step": 20995 + }, + { + "epoch": 2.63, + "grad_norm": 8.97706127166748, + "learning_rate": 2.4331673848470907e-06, + "loss": 0.4958, + "step": 20996 + }, + { + "epoch": 2.64, + "grad_norm": 33.11330032348633, + "learning_rate": 2.432330669790403e-06, + "loss": 1.3894, + "step": 20997 + }, + { + "epoch": 2.64, + "grad_norm": 7.475759506225586, + "learning_rate": 2.431493954733716e-06, + "loss": 1.1272, + "step": 20998 + }, + { + "epoch": 2.64, + "grad_norm": 6.357087135314941, + "learning_rate": 2.4306572396770282e-06, + "loss": 1.1372, + "step": 20999 + }, + { + "epoch": 2.64, + "grad_norm": 15.309009552001953, + "learning_rate": 2.4298205246203406e-06, + "loss": 0.9697, + "step": 21000 + }, + { + "epoch": 2.64, + "grad_norm": 14.061135292053223, + "learning_rate": 2.4289838095636534e-06, + "loss": 0.7943, + "step": 21001 + }, + { + "epoch": 2.64, + "grad_norm": 6.136180400848389, + "learning_rate": 2.4281470945069658e-06, + "loss": 0.6436, + "step": 21002 + }, + { + "epoch": 2.64, + "grad_norm": 8.432313919067383, + "learning_rate": 2.427310379450278e-06, + "loss": 0.1336, + "step": 21003 + }, + { + "epoch": 2.64, + "grad_norm": 16.358102798461914, + "learning_rate": 2.426473664393591e-06, + "loss": 0.5794, + "step": 21004 + }, + { + "epoch": 2.64, + "grad_norm": 59.56919860839844, + "learning_rate": 2.4256369493369038e-06, + "loss": 1.5782, + "step": 21005 + }, + { + "epoch": 2.64, + "grad_norm": 14.38819408416748, + "learning_rate": 2.424800234280216e-06, + "loss": 0.6115, + "step": 21006 + }, + { + "epoch": 2.64, + "grad_norm": 77.99490356445312, + "learning_rate": 2.4239635192235285e-06, + "loss": 1.1944, + "step": 21007 + }, + { + "epoch": 2.64, + "grad_norm": 15.622925758361816, + "learning_rate": 2.4231268041668413e-06, + "loss": 1.0147, + "step": 21008 + }, + { + "epoch": 2.64, + "grad_norm": 6.72713565826416, + "learning_rate": 2.4222900891101537e-06, + "loss": 0.7304, + "step": 21009 + }, + { + "epoch": 2.64, + "grad_norm": 2.2990753650665283, + "learning_rate": 2.421453374053466e-06, + "loss": 0.0797, + "step": 21010 + }, + { + "epoch": 2.64, + "grad_norm": 8.245819091796875, + "learning_rate": 2.420616658996779e-06, + "loss": 0.4123, + "step": 21011 + }, + { + "epoch": 2.64, + "grad_norm": 15.5343599319458, + "learning_rate": 2.4197799439400917e-06, + "loss": 0.7004, + "step": 21012 + }, + { + "epoch": 2.64, + "grad_norm": 6.714312553405762, + "learning_rate": 2.418943228883404e-06, + "loss": 0.4894, + "step": 21013 + }, + { + "epoch": 2.64, + "grad_norm": 193.41500854492188, + "learning_rate": 2.4181065138267164e-06, + "loss": 0.8122, + "step": 21014 + }, + { + "epoch": 2.64, + "grad_norm": 13.690166473388672, + "learning_rate": 2.4172697987700292e-06, + "loss": 0.607, + "step": 21015 + }, + { + "epoch": 2.64, + "grad_norm": 12.655191421508789, + "learning_rate": 2.4164330837133416e-06, + "loss": 0.3633, + "step": 21016 + }, + { + "epoch": 2.64, + "grad_norm": 20.513376235961914, + "learning_rate": 2.415596368656654e-06, + "loss": 0.5502, + "step": 21017 + }, + { + "epoch": 2.64, + "grad_norm": 13.626429557800293, + "learning_rate": 2.414759653599967e-06, + "loss": 0.9843, + "step": 21018 + }, + { + "epoch": 2.64, + "grad_norm": 60.88151550292969, + "learning_rate": 2.4139229385432796e-06, + "loss": 1.2858, + "step": 21019 + }, + { + "epoch": 2.64, + "grad_norm": 3.7170236110687256, + "learning_rate": 2.413086223486592e-06, + "loss": 0.0817, + "step": 21020 + }, + { + "epoch": 2.64, + "grad_norm": 15.460129737854004, + "learning_rate": 2.4122495084299044e-06, + "loss": 0.9185, + "step": 21021 + }, + { + "epoch": 2.64, + "grad_norm": 6.197540283203125, + "learning_rate": 2.411412793373217e-06, + "loss": 0.2762, + "step": 21022 + }, + { + "epoch": 2.64, + "grad_norm": 21.68492889404297, + "learning_rate": 2.4105760783165295e-06, + "loss": 0.9793, + "step": 21023 + }, + { + "epoch": 2.64, + "grad_norm": 23.647262573242188, + "learning_rate": 2.409739363259842e-06, + "loss": 0.9037, + "step": 21024 + }, + { + "epoch": 2.64, + "grad_norm": 2.1407809257507324, + "learning_rate": 2.4089026482031547e-06, + "loss": 0.1567, + "step": 21025 + }, + { + "epoch": 2.64, + "grad_norm": 13.150588035583496, + "learning_rate": 2.4080659331464675e-06, + "loss": 1.1496, + "step": 21026 + }, + { + "epoch": 2.64, + "grad_norm": 7.087491035461426, + "learning_rate": 2.4072292180897795e-06, + "loss": 0.4358, + "step": 21027 + }, + { + "epoch": 2.64, + "grad_norm": 76.25025939941406, + "learning_rate": 2.4063925030330923e-06, + "loss": 3.1025, + "step": 21028 + }, + { + "epoch": 2.64, + "grad_norm": 1.9695963859558105, + "learning_rate": 2.405555787976405e-06, + "loss": 0.1382, + "step": 21029 + }, + { + "epoch": 2.64, + "grad_norm": 24.534563064575195, + "learning_rate": 2.4047190729197175e-06, + "loss": 0.8972, + "step": 21030 + }, + { + "epoch": 2.64, + "grad_norm": 9.088956832885742, + "learning_rate": 2.40388235786303e-06, + "loss": 0.4364, + "step": 21031 + }, + { + "epoch": 2.64, + "grad_norm": 11.973158836364746, + "learning_rate": 2.4030456428063426e-06, + "loss": 0.0905, + "step": 21032 + }, + { + "epoch": 2.64, + "grad_norm": 2.274954080581665, + "learning_rate": 2.402208927749655e-06, + "loss": 0.1071, + "step": 21033 + }, + { + "epoch": 2.64, + "grad_norm": 15.855847358703613, + "learning_rate": 2.4013722126929674e-06, + "loss": 1.3971, + "step": 21034 + }, + { + "epoch": 2.64, + "grad_norm": 22.961088180541992, + "learning_rate": 2.40053549763628e-06, + "loss": 1.0131, + "step": 21035 + }, + { + "epoch": 2.64, + "grad_norm": 9.95785903930664, + "learning_rate": 2.399698782579593e-06, + "loss": 1.2831, + "step": 21036 + }, + { + "epoch": 2.64, + "grad_norm": 18.962446212768555, + "learning_rate": 2.3988620675229054e-06, + "loss": 0.5042, + "step": 21037 + }, + { + "epoch": 2.64, + "grad_norm": 18.83690643310547, + "learning_rate": 2.3980253524662177e-06, + "loss": 1.9688, + "step": 21038 + }, + { + "epoch": 2.64, + "grad_norm": 25.99830436706543, + "learning_rate": 2.3971886374095305e-06, + "loss": 0.9632, + "step": 21039 + }, + { + "epoch": 2.64, + "grad_norm": 56.00041198730469, + "learning_rate": 2.396351922352843e-06, + "loss": 0.6948, + "step": 21040 + }, + { + "epoch": 2.64, + "grad_norm": 4.097028732299805, + "learning_rate": 2.3955152072961553e-06, + "loss": 0.2113, + "step": 21041 + }, + { + "epoch": 2.64, + "grad_norm": 13.911191940307617, + "learning_rate": 2.394678492239468e-06, + "loss": 0.9646, + "step": 21042 + }, + { + "epoch": 2.64, + "grad_norm": 57.96830368041992, + "learning_rate": 2.393841777182781e-06, + "loss": 0.9468, + "step": 21043 + }, + { + "epoch": 2.64, + "grad_norm": 73.54779815673828, + "learning_rate": 2.393005062126093e-06, + "loss": 1.08, + "step": 21044 + }, + { + "epoch": 2.64, + "grad_norm": 17.078927993774414, + "learning_rate": 2.3921683470694057e-06, + "loss": 0.5569, + "step": 21045 + }, + { + "epoch": 2.64, + "grad_norm": 9.047167778015137, + "learning_rate": 2.3913316320127185e-06, + "loss": 1.7128, + "step": 21046 + }, + { + "epoch": 2.64, + "grad_norm": 17.860641479492188, + "learning_rate": 2.390494916956031e-06, + "loss": 0.6911, + "step": 21047 + }, + { + "epoch": 2.64, + "grad_norm": 21.053466796875, + "learning_rate": 2.3896582018993432e-06, + "loss": 1.0133, + "step": 21048 + }, + { + "epoch": 2.64, + "grad_norm": 35.85791778564453, + "learning_rate": 2.388821486842656e-06, + "loss": 0.7643, + "step": 21049 + }, + { + "epoch": 2.64, + "grad_norm": 17.7092227935791, + "learning_rate": 2.3879847717859684e-06, + "loss": 1.0582, + "step": 21050 + }, + { + "epoch": 2.64, + "grad_norm": 10.426857948303223, + "learning_rate": 2.3871480567292808e-06, + "loss": 0.5564, + "step": 21051 + }, + { + "epoch": 2.64, + "grad_norm": 779.8558349609375, + "learning_rate": 2.3863113416725936e-06, + "loss": 0.3771, + "step": 21052 + }, + { + "epoch": 2.64, + "grad_norm": 19.85243034362793, + "learning_rate": 2.3854746266159064e-06, + "loss": 1.423, + "step": 21053 + }, + { + "epoch": 2.64, + "grad_norm": 25.018346786499023, + "learning_rate": 2.3846379115592188e-06, + "loss": 0.5543, + "step": 21054 + }, + { + "epoch": 2.64, + "grad_norm": 36.439334869384766, + "learning_rate": 2.383801196502531e-06, + "loss": 3.2907, + "step": 21055 + }, + { + "epoch": 2.64, + "grad_norm": 21.46392822265625, + "learning_rate": 2.382964481445844e-06, + "loss": 1.368, + "step": 21056 + }, + { + "epoch": 2.64, + "grad_norm": 31.040111541748047, + "learning_rate": 2.3821277663891563e-06, + "loss": 1.1571, + "step": 21057 + }, + { + "epoch": 2.64, + "grad_norm": 112.70236206054688, + "learning_rate": 2.3812910513324687e-06, + "loss": 0.8701, + "step": 21058 + }, + { + "epoch": 2.64, + "grad_norm": 61.37261199951172, + "learning_rate": 2.3804543362757815e-06, + "loss": 1.4428, + "step": 21059 + }, + { + "epoch": 2.64, + "grad_norm": 21.433765411376953, + "learning_rate": 2.3796176212190943e-06, + "loss": 1.0511, + "step": 21060 + }, + { + "epoch": 2.64, + "grad_norm": 27.52485466003418, + "learning_rate": 2.3787809061624067e-06, + "loss": 1.7669, + "step": 21061 + }, + { + "epoch": 2.64, + "grad_norm": 1.9846388101577759, + "learning_rate": 2.377944191105719e-06, + "loss": 0.0667, + "step": 21062 + }, + { + "epoch": 2.64, + "grad_norm": 8.852572441101074, + "learning_rate": 2.377107476049032e-06, + "loss": 0.3681, + "step": 21063 + }, + { + "epoch": 2.64, + "grad_norm": 39.31391525268555, + "learning_rate": 2.3762707609923442e-06, + "loss": 0.408, + "step": 21064 + }, + { + "epoch": 2.64, + "grad_norm": 9.55923843383789, + "learning_rate": 2.3754340459356566e-06, + "loss": 0.8344, + "step": 21065 + }, + { + "epoch": 2.64, + "grad_norm": 16.293432235717773, + "learning_rate": 2.3745973308789694e-06, + "loss": 1.4, + "step": 21066 + }, + { + "epoch": 2.64, + "grad_norm": 29.0277042388916, + "learning_rate": 2.373760615822282e-06, + "loss": 0.486, + "step": 21067 + }, + { + "epoch": 2.64, + "grad_norm": 14.477266311645508, + "learning_rate": 2.372923900765594e-06, + "loss": 1.2147, + "step": 21068 + }, + { + "epoch": 2.64, + "grad_norm": 26.399272918701172, + "learning_rate": 2.372087185708907e-06, + "loss": 0.6965, + "step": 21069 + }, + { + "epoch": 2.64, + "grad_norm": 9.183669090270996, + "learning_rate": 2.3712504706522198e-06, + "loss": 0.9176, + "step": 21070 + }, + { + "epoch": 2.64, + "grad_norm": 165.87615966796875, + "learning_rate": 2.370413755595532e-06, + "loss": 1.3961, + "step": 21071 + }, + { + "epoch": 2.64, + "grad_norm": 13.78731918334961, + "learning_rate": 2.3695770405388445e-06, + "loss": 1.2472, + "step": 21072 + }, + { + "epoch": 2.64, + "grad_norm": 15.470413208007812, + "learning_rate": 2.3687403254821573e-06, + "loss": 0.6179, + "step": 21073 + }, + { + "epoch": 2.64, + "grad_norm": 11.213066101074219, + "learning_rate": 2.3679036104254697e-06, + "loss": 0.7374, + "step": 21074 + }, + { + "epoch": 2.64, + "grad_norm": 8.551079750061035, + "learning_rate": 2.367066895368782e-06, + "loss": 0.6043, + "step": 21075 + }, + { + "epoch": 2.64, + "grad_norm": 11.665029525756836, + "learning_rate": 2.366230180312095e-06, + "loss": 0.3494, + "step": 21076 + }, + { + "epoch": 2.65, + "grad_norm": 116.39488983154297, + "learning_rate": 2.3653934652554077e-06, + "loss": 2.3552, + "step": 21077 + }, + { + "epoch": 2.65, + "grad_norm": 19.45047378540039, + "learning_rate": 2.36455675019872e-06, + "loss": 0.6457, + "step": 21078 + }, + { + "epoch": 2.65, + "grad_norm": 11.663397789001465, + "learning_rate": 2.3637200351420325e-06, + "loss": 0.4456, + "step": 21079 + }, + { + "epoch": 2.65, + "grad_norm": 12.752456665039062, + "learning_rate": 2.3628833200853453e-06, + "loss": 1.113, + "step": 21080 + }, + { + "epoch": 2.65, + "grad_norm": 17.248798370361328, + "learning_rate": 2.3620466050286576e-06, + "loss": 2.3385, + "step": 21081 + }, + { + "epoch": 2.65, + "grad_norm": 12.785979270935059, + "learning_rate": 2.36120988997197e-06, + "loss": 0.4756, + "step": 21082 + }, + { + "epoch": 2.65, + "grad_norm": 28.546894073486328, + "learning_rate": 2.360373174915283e-06, + "loss": 0.211, + "step": 21083 + }, + { + "epoch": 2.65, + "grad_norm": 21.222902297973633, + "learning_rate": 2.3595364598585956e-06, + "loss": 0.75, + "step": 21084 + }, + { + "epoch": 2.65, + "grad_norm": 8.43933391571045, + "learning_rate": 2.3586997448019076e-06, + "loss": 0.2291, + "step": 21085 + }, + { + "epoch": 2.65, + "grad_norm": 7.80899715423584, + "learning_rate": 2.3578630297452204e-06, + "loss": 0.5475, + "step": 21086 + }, + { + "epoch": 2.65, + "grad_norm": 20.870101928710938, + "learning_rate": 2.357026314688533e-06, + "loss": 0.9009, + "step": 21087 + }, + { + "epoch": 2.65, + "grad_norm": 14.971521377563477, + "learning_rate": 2.3561895996318456e-06, + "loss": 1.3156, + "step": 21088 + }, + { + "epoch": 2.65, + "grad_norm": 12.223374366760254, + "learning_rate": 2.355352884575158e-06, + "loss": 1.7749, + "step": 21089 + }, + { + "epoch": 2.65, + "grad_norm": 19.237293243408203, + "learning_rate": 2.3545161695184707e-06, + "loss": 2.4655, + "step": 21090 + }, + { + "epoch": 2.65, + "grad_norm": 11.605509757995605, + "learning_rate": 2.353679454461783e-06, + "loss": 0.442, + "step": 21091 + }, + { + "epoch": 2.65, + "grad_norm": 5.847765922546387, + "learning_rate": 2.3528427394050955e-06, + "loss": 0.6636, + "step": 21092 + }, + { + "epoch": 2.65, + "grad_norm": 16.315454483032227, + "learning_rate": 2.3520060243484083e-06, + "loss": 0.5457, + "step": 21093 + }, + { + "epoch": 2.65, + "grad_norm": 14.820706367492676, + "learning_rate": 2.351169309291721e-06, + "loss": 0.6809, + "step": 21094 + }, + { + "epoch": 2.65, + "grad_norm": 16.140243530273438, + "learning_rate": 2.3503325942350335e-06, + "loss": 0.712, + "step": 21095 + }, + { + "epoch": 2.65, + "grad_norm": 17.508155822753906, + "learning_rate": 2.349495879178346e-06, + "loss": 0.7529, + "step": 21096 + }, + { + "epoch": 2.65, + "grad_norm": 15.05785846710205, + "learning_rate": 2.3486591641216587e-06, + "loss": 0.3504, + "step": 21097 + }, + { + "epoch": 2.65, + "grad_norm": 11.685197830200195, + "learning_rate": 2.347822449064971e-06, + "loss": 0.8119, + "step": 21098 + }, + { + "epoch": 2.65, + "grad_norm": 4.483983039855957, + "learning_rate": 2.3469857340082834e-06, + "loss": 0.1311, + "step": 21099 + }, + { + "epoch": 2.65, + "grad_norm": 9.965291976928711, + "learning_rate": 2.3461490189515962e-06, + "loss": 0.3623, + "step": 21100 + }, + { + "epoch": 2.65, + "grad_norm": 12.08174991607666, + "learning_rate": 2.345312303894909e-06, + "loss": 0.2287, + "step": 21101 + }, + { + "epoch": 2.65, + "grad_norm": 18.51045799255371, + "learning_rate": 2.3444755888382214e-06, + "loss": 0.6785, + "step": 21102 + }, + { + "epoch": 2.65, + "grad_norm": 28.313142776489258, + "learning_rate": 2.3436388737815338e-06, + "loss": 1.2543, + "step": 21103 + }, + { + "epoch": 2.65, + "grad_norm": 7.365633010864258, + "learning_rate": 2.3428021587248466e-06, + "loss": 0.4875, + "step": 21104 + }, + { + "epoch": 2.65, + "grad_norm": 100.35225677490234, + "learning_rate": 2.341965443668159e-06, + "loss": 1.3184, + "step": 21105 + }, + { + "epoch": 2.65, + "grad_norm": 1.2970550060272217, + "learning_rate": 2.3411287286114713e-06, + "loss": 0.0375, + "step": 21106 + }, + { + "epoch": 2.65, + "grad_norm": 20.789562225341797, + "learning_rate": 2.340292013554784e-06, + "loss": 0.6848, + "step": 21107 + }, + { + "epoch": 2.65, + "grad_norm": 17.4787654876709, + "learning_rate": 2.3394552984980965e-06, + "loss": 1.5577, + "step": 21108 + }, + { + "epoch": 2.65, + "grad_norm": 27.095874786376953, + "learning_rate": 2.3386185834414093e-06, + "loss": 0.9173, + "step": 21109 + }, + { + "epoch": 2.65, + "grad_norm": 42.789302825927734, + "learning_rate": 2.3377818683847217e-06, + "loss": 0.8784, + "step": 21110 + }, + { + "epoch": 2.65, + "grad_norm": 11.568604469299316, + "learning_rate": 2.3369451533280345e-06, + "loss": 0.5209, + "step": 21111 + }, + { + "epoch": 2.65, + "grad_norm": 8.733729362487793, + "learning_rate": 2.336108438271347e-06, + "loss": 1.4767, + "step": 21112 + }, + { + "epoch": 2.65, + "grad_norm": 23.612058639526367, + "learning_rate": 2.3352717232146592e-06, + "loss": 0.7964, + "step": 21113 + }, + { + "epoch": 2.65, + "grad_norm": 8.344063758850098, + "learning_rate": 2.334435008157972e-06, + "loss": 0.5217, + "step": 21114 + }, + { + "epoch": 2.65, + "grad_norm": 10.633986473083496, + "learning_rate": 2.3335982931012844e-06, + "loss": 0.6476, + "step": 21115 + }, + { + "epoch": 2.65, + "grad_norm": 21.941987991333008, + "learning_rate": 2.3327615780445972e-06, + "loss": 1.5961, + "step": 21116 + }, + { + "epoch": 2.65, + "grad_norm": 31.771562576293945, + "learning_rate": 2.3319248629879096e-06, + "loss": 1.9412, + "step": 21117 + }, + { + "epoch": 2.65, + "grad_norm": 15.069620132446289, + "learning_rate": 2.3310881479312224e-06, + "loss": 1.5819, + "step": 21118 + }, + { + "epoch": 2.65, + "grad_norm": 16.771032333374023, + "learning_rate": 2.3302514328745348e-06, + "loss": 0.2783, + "step": 21119 + }, + { + "epoch": 2.65, + "grad_norm": 6.584004878997803, + "learning_rate": 2.329414717817847e-06, + "loss": 0.9431, + "step": 21120 + }, + { + "epoch": 2.65, + "grad_norm": 3.5742485523223877, + "learning_rate": 2.32857800276116e-06, + "loss": 0.0997, + "step": 21121 + }, + { + "epoch": 2.65, + "grad_norm": 85.23136138916016, + "learning_rate": 2.3277412877044723e-06, + "loss": 0.7781, + "step": 21122 + }, + { + "epoch": 2.65, + "grad_norm": 19.807178497314453, + "learning_rate": 2.3269045726477847e-06, + "loss": 0.9071, + "step": 21123 + }, + { + "epoch": 2.65, + "grad_norm": 5.632420063018799, + "learning_rate": 2.3260678575910975e-06, + "loss": 0.1671, + "step": 21124 + }, + { + "epoch": 2.65, + "grad_norm": 30.62589454650879, + "learning_rate": 2.3252311425344103e-06, + "loss": 1.0105, + "step": 21125 + }, + { + "epoch": 2.65, + "grad_norm": 13.17794132232666, + "learning_rate": 2.3243944274777227e-06, + "loss": 0.4428, + "step": 21126 + }, + { + "epoch": 2.65, + "grad_norm": 287.6689147949219, + "learning_rate": 2.323557712421035e-06, + "loss": 0.7331, + "step": 21127 + }, + { + "epoch": 2.65, + "grad_norm": 32.80656814575195, + "learning_rate": 2.322720997364348e-06, + "loss": 3.8538, + "step": 21128 + }, + { + "epoch": 2.65, + "grad_norm": 17.656213760375977, + "learning_rate": 2.3218842823076603e-06, + "loss": 0.3552, + "step": 21129 + }, + { + "epoch": 2.65, + "grad_norm": 18.911216735839844, + "learning_rate": 2.3210475672509726e-06, + "loss": 0.8092, + "step": 21130 + }, + { + "epoch": 2.65, + "grad_norm": 8.57754898071289, + "learning_rate": 2.3202108521942854e-06, + "loss": 0.2883, + "step": 21131 + }, + { + "epoch": 2.65, + "grad_norm": 12.58278751373291, + "learning_rate": 2.319374137137598e-06, + "loss": 1.1585, + "step": 21132 + }, + { + "epoch": 2.65, + "grad_norm": 22.420452117919922, + "learning_rate": 2.3185374220809106e-06, + "loss": 1.0776, + "step": 21133 + }, + { + "epoch": 2.65, + "grad_norm": 28.474044799804688, + "learning_rate": 2.317700707024223e-06, + "loss": 0.7962, + "step": 21134 + }, + { + "epoch": 2.65, + "grad_norm": 3.378330707550049, + "learning_rate": 2.316863991967536e-06, + "loss": 0.1699, + "step": 21135 + }, + { + "epoch": 2.65, + "grad_norm": 13.12812328338623, + "learning_rate": 2.316027276910848e-06, + "loss": 0.5099, + "step": 21136 + }, + { + "epoch": 2.65, + "grad_norm": 7.315151214599609, + "learning_rate": 2.3151905618541606e-06, + "loss": 0.7315, + "step": 21137 + }, + { + "epoch": 2.65, + "grad_norm": 9.438087463378906, + "learning_rate": 2.3143538467974734e-06, + "loss": 0.7399, + "step": 21138 + }, + { + "epoch": 2.65, + "grad_norm": 32.87921142578125, + "learning_rate": 2.3135171317407857e-06, + "loss": 2.3377, + "step": 21139 + }, + { + "epoch": 2.65, + "grad_norm": 17.833864212036133, + "learning_rate": 2.3126804166840985e-06, + "loss": 2.3104, + "step": 21140 + }, + { + "epoch": 2.65, + "grad_norm": 21.223148345947266, + "learning_rate": 2.311843701627411e-06, + "loss": 0.6581, + "step": 21141 + }, + { + "epoch": 2.65, + "grad_norm": 12.318990707397461, + "learning_rate": 2.3110069865707237e-06, + "loss": 1.3616, + "step": 21142 + }, + { + "epoch": 2.65, + "grad_norm": 7.584165096282959, + "learning_rate": 2.310170271514036e-06, + "loss": 0.2593, + "step": 21143 + }, + { + "epoch": 2.65, + "grad_norm": 11.725146293640137, + "learning_rate": 2.3093335564573485e-06, + "loss": 0.3027, + "step": 21144 + }, + { + "epoch": 2.65, + "grad_norm": 7.892516136169434, + "learning_rate": 2.3084968414006613e-06, + "loss": 0.2908, + "step": 21145 + }, + { + "epoch": 2.65, + "grad_norm": 3.382558822631836, + "learning_rate": 2.3076601263439737e-06, + "loss": 0.0627, + "step": 21146 + }, + { + "epoch": 2.65, + "grad_norm": 9.933945655822754, + "learning_rate": 2.3068234112872865e-06, + "loss": 0.2756, + "step": 21147 + }, + { + "epoch": 2.65, + "grad_norm": 8.98509693145752, + "learning_rate": 2.305986696230599e-06, + "loss": 1.4634, + "step": 21148 + }, + { + "epoch": 2.65, + "grad_norm": 8.52401065826416, + "learning_rate": 2.3051499811739112e-06, + "loss": 0.4213, + "step": 21149 + }, + { + "epoch": 2.65, + "grad_norm": 6.180952548980713, + "learning_rate": 2.304313266117224e-06, + "loss": 0.4455, + "step": 21150 + }, + { + "epoch": 2.65, + "grad_norm": 7.034838676452637, + "learning_rate": 2.3034765510605364e-06, + "loss": 0.6509, + "step": 21151 + }, + { + "epoch": 2.65, + "grad_norm": 29.16041374206543, + "learning_rate": 2.302639836003849e-06, + "loss": 0.9128, + "step": 21152 + }, + { + "epoch": 2.65, + "grad_norm": 32.40922927856445, + "learning_rate": 2.3018031209471616e-06, + "loss": 1.5813, + "step": 21153 + }, + { + "epoch": 2.65, + "grad_norm": 15.488125801086426, + "learning_rate": 2.3009664058904744e-06, + "loss": 1.5634, + "step": 21154 + }, + { + "epoch": 2.65, + "grad_norm": 19.64240837097168, + "learning_rate": 2.3001296908337868e-06, + "loss": 1.8619, + "step": 21155 + }, + { + "epoch": 2.66, + "grad_norm": 12.897624015808105, + "learning_rate": 2.299292975777099e-06, + "loss": 0.5688, + "step": 21156 + }, + { + "epoch": 2.66, + "grad_norm": 10.610186576843262, + "learning_rate": 2.298456260720412e-06, + "loss": 1.3531, + "step": 21157 + }, + { + "epoch": 2.66, + "grad_norm": 77.85247039794922, + "learning_rate": 2.2976195456637243e-06, + "loss": 1.2388, + "step": 21158 + }, + { + "epoch": 2.66, + "grad_norm": 43.8265495300293, + "learning_rate": 2.296782830607037e-06, + "loss": 1.5372, + "step": 21159 + }, + { + "epoch": 2.66, + "grad_norm": 8.644686698913574, + "learning_rate": 2.2959461155503495e-06, + "loss": 0.7571, + "step": 21160 + }, + { + "epoch": 2.66, + "grad_norm": 19.352209091186523, + "learning_rate": 2.2951094004936623e-06, + "loss": 2.5901, + "step": 21161 + }, + { + "epoch": 2.66, + "grad_norm": 1.2310041189193726, + "learning_rate": 2.2942726854369747e-06, + "loss": 0.0243, + "step": 21162 + }, + { + "epoch": 2.66, + "grad_norm": 9.61866283416748, + "learning_rate": 2.293435970380287e-06, + "loss": 0.3114, + "step": 21163 + }, + { + "epoch": 2.66, + "grad_norm": 35.355979919433594, + "learning_rate": 2.2925992553236e-06, + "loss": 0.8666, + "step": 21164 + }, + { + "epoch": 2.66, + "grad_norm": 11.301298141479492, + "learning_rate": 2.2917625402669122e-06, + "loss": 1.0186, + "step": 21165 + }, + { + "epoch": 2.66, + "grad_norm": 47.20576858520508, + "learning_rate": 2.290925825210225e-06, + "loss": 0.8651, + "step": 21166 + }, + { + "epoch": 2.66, + "grad_norm": 4.567676067352295, + "learning_rate": 2.2900891101535374e-06, + "loss": 0.5583, + "step": 21167 + }, + { + "epoch": 2.66, + "grad_norm": 22.5992488861084, + "learning_rate": 2.28925239509685e-06, + "loss": 1.4118, + "step": 21168 + }, + { + "epoch": 2.66, + "grad_norm": 7.403890132904053, + "learning_rate": 2.2884156800401626e-06, + "loss": 0.3636, + "step": 21169 + }, + { + "epoch": 2.66, + "grad_norm": 14.168628692626953, + "learning_rate": 2.287578964983475e-06, + "loss": 0.4622, + "step": 21170 + }, + { + "epoch": 2.66, + "grad_norm": 15.81656265258789, + "learning_rate": 2.2867422499267878e-06, + "loss": 1.4615, + "step": 21171 + }, + { + "epoch": 2.66, + "grad_norm": 4.977477073669434, + "learning_rate": 2.2859055348701e-06, + "loss": 0.2417, + "step": 21172 + }, + { + "epoch": 2.66, + "grad_norm": 5.2455973625183105, + "learning_rate": 2.2850688198134125e-06, + "loss": 1.464, + "step": 21173 + }, + { + "epoch": 2.66, + "grad_norm": 10.898738861083984, + "learning_rate": 2.2842321047567253e-06, + "loss": 0.5189, + "step": 21174 + }, + { + "epoch": 2.66, + "grad_norm": 7.893359184265137, + "learning_rate": 2.2833953897000377e-06, + "loss": 1.1675, + "step": 21175 + }, + { + "epoch": 2.66, + "grad_norm": 19.0368709564209, + "learning_rate": 2.2825586746433505e-06, + "loss": 0.7819, + "step": 21176 + }, + { + "epoch": 2.66, + "grad_norm": 13.245115280151367, + "learning_rate": 2.281721959586663e-06, + "loss": 0.7157, + "step": 21177 + }, + { + "epoch": 2.66, + "grad_norm": 21.507944107055664, + "learning_rate": 2.2808852445299757e-06, + "loss": 1.327, + "step": 21178 + }, + { + "epoch": 2.66, + "grad_norm": 16.07476234436035, + "learning_rate": 2.280048529473288e-06, + "loss": 0.7543, + "step": 21179 + }, + { + "epoch": 2.66, + "grad_norm": 16.660404205322266, + "learning_rate": 2.2792118144166005e-06, + "loss": 0.9788, + "step": 21180 + }, + { + "epoch": 2.66, + "grad_norm": 13.21631908416748, + "learning_rate": 2.2783750993599133e-06, + "loss": 2.407, + "step": 21181 + }, + { + "epoch": 2.66, + "grad_norm": 46.83183288574219, + "learning_rate": 2.2775383843032256e-06, + "loss": 0.7512, + "step": 21182 + }, + { + "epoch": 2.66, + "grad_norm": 17.08712387084961, + "learning_rate": 2.2767016692465384e-06, + "loss": 0.6045, + "step": 21183 + }, + { + "epoch": 2.66, + "grad_norm": 2.616086006164551, + "learning_rate": 2.275864954189851e-06, + "loss": 0.0414, + "step": 21184 + }, + { + "epoch": 2.66, + "grad_norm": 96.06143951416016, + "learning_rate": 2.2750282391331636e-06, + "loss": 1.3606, + "step": 21185 + }, + { + "epoch": 2.66, + "grad_norm": 16.728219985961914, + "learning_rate": 2.274191524076476e-06, + "loss": 1.6106, + "step": 21186 + }, + { + "epoch": 2.66, + "grad_norm": 22.86241912841797, + "learning_rate": 2.2733548090197884e-06, + "loss": 1.0409, + "step": 21187 + }, + { + "epoch": 2.66, + "grad_norm": 5.369406223297119, + "learning_rate": 2.272518093963101e-06, + "loss": 0.2209, + "step": 21188 + }, + { + "epoch": 2.66, + "grad_norm": 9.571803092956543, + "learning_rate": 2.2716813789064135e-06, + "loss": 0.7741, + "step": 21189 + }, + { + "epoch": 2.66, + "grad_norm": 7.1296491622924805, + "learning_rate": 2.270844663849726e-06, + "loss": 0.6491, + "step": 21190 + }, + { + "epoch": 2.66, + "grad_norm": 9.656634330749512, + "learning_rate": 2.2700079487930387e-06, + "loss": 0.3507, + "step": 21191 + }, + { + "epoch": 2.66, + "grad_norm": 19.73517417907715, + "learning_rate": 2.2691712337363515e-06, + "loss": 0.6804, + "step": 21192 + }, + { + "epoch": 2.66, + "grad_norm": 200.69479370117188, + "learning_rate": 2.268334518679664e-06, + "loss": 1.956, + "step": 21193 + }, + { + "epoch": 2.66, + "grad_norm": 24.948909759521484, + "learning_rate": 2.2674978036229763e-06, + "loss": 0.6231, + "step": 21194 + }, + { + "epoch": 2.66, + "grad_norm": 12.767256736755371, + "learning_rate": 2.266661088566289e-06, + "loss": 1.2729, + "step": 21195 + }, + { + "epoch": 2.66, + "grad_norm": 76.8818130493164, + "learning_rate": 2.2658243735096015e-06, + "loss": 1.7502, + "step": 21196 + }, + { + "epoch": 2.66, + "grad_norm": 13.32066535949707, + "learning_rate": 2.264987658452914e-06, + "loss": 1.0797, + "step": 21197 + }, + { + "epoch": 2.66, + "grad_norm": 30.614362716674805, + "learning_rate": 2.2641509433962266e-06, + "loss": 1.2337, + "step": 21198 + }, + { + "epoch": 2.66, + "grad_norm": 22.417505264282227, + "learning_rate": 2.2633142283395395e-06, + "loss": 1.1799, + "step": 21199 + }, + { + "epoch": 2.66, + "grad_norm": 41.29404830932617, + "learning_rate": 2.262477513282852e-06, + "loss": 1.8972, + "step": 21200 + }, + { + "epoch": 2.66, + "eval_loss": 0.07736419886350632, + "eval_runtime": 95.8867, + "eval_samples_per_second": 36.939, + "eval_steps_per_second": 36.939, + "step": 21200 + }, + { + "epoch": 2.66, + "grad_norm": 10.624398231506348, + "learning_rate": 2.261640798226164e-06, + "loss": 1.2197, + "step": 21201 + }, + { + "epoch": 2.66, + "grad_norm": 3.1262216567993164, + "learning_rate": 2.260804083169477e-06, + "loss": 0.3585, + "step": 21202 + }, + { + "epoch": 2.66, + "grad_norm": 13.62775707244873, + "learning_rate": 2.2599673681127894e-06, + "loss": 1.4098, + "step": 21203 + }, + { + "epoch": 2.66, + "grad_norm": 26.97039222717285, + "learning_rate": 2.2591306530561018e-06, + "loss": 1.6124, + "step": 21204 + }, + { + "epoch": 2.66, + "grad_norm": 87.18284606933594, + "learning_rate": 2.2582939379994146e-06, + "loss": 1.0743, + "step": 21205 + }, + { + "epoch": 2.66, + "grad_norm": 11.793691635131836, + "learning_rate": 2.257457222942727e-06, + "loss": 0.4472, + "step": 21206 + }, + { + "epoch": 2.66, + "grad_norm": 14.459900856018066, + "learning_rate": 2.2566205078860397e-06, + "loss": 0.491, + "step": 21207 + }, + { + "epoch": 2.66, + "grad_norm": 4.120659351348877, + "learning_rate": 2.255783792829352e-06, + "loss": 1.3269, + "step": 21208 + }, + { + "epoch": 2.66, + "grad_norm": 3.0259759426116943, + "learning_rate": 2.254947077772665e-06, + "loss": 0.1151, + "step": 21209 + }, + { + "epoch": 2.66, + "grad_norm": 43.027565002441406, + "learning_rate": 2.2541103627159773e-06, + "loss": 0.9673, + "step": 21210 + }, + { + "epoch": 2.66, + "grad_norm": 19.61045265197754, + "learning_rate": 2.2532736476592897e-06, + "loss": 2.2392, + "step": 21211 + }, + { + "epoch": 2.66, + "grad_norm": 21.75444221496582, + "learning_rate": 2.2524369326026025e-06, + "loss": 0.3609, + "step": 21212 + }, + { + "epoch": 2.66, + "grad_norm": 19.857431411743164, + "learning_rate": 2.251600217545915e-06, + "loss": 1.7332, + "step": 21213 + }, + { + "epoch": 2.66, + "grad_norm": 7.568485736846924, + "learning_rate": 2.2507635024892272e-06, + "loss": 0.3944, + "step": 21214 + }, + { + "epoch": 2.66, + "grad_norm": 5.792768478393555, + "learning_rate": 2.24992678743254e-06, + "loss": 1.1603, + "step": 21215 + }, + { + "epoch": 2.66, + "grad_norm": 8.036165237426758, + "learning_rate": 2.249090072375853e-06, + "loss": 0.4277, + "step": 21216 + }, + { + "epoch": 2.66, + "grad_norm": 18.618942260742188, + "learning_rate": 2.2482533573191652e-06, + "loss": 1.1808, + "step": 21217 + }, + { + "epoch": 2.66, + "grad_norm": 16.693944931030273, + "learning_rate": 2.2474166422624776e-06, + "loss": 0.7119, + "step": 21218 + }, + { + "epoch": 2.66, + "grad_norm": 76.36405944824219, + "learning_rate": 2.2465799272057904e-06, + "loss": 2.6666, + "step": 21219 + }, + { + "epoch": 2.66, + "grad_norm": 163.22959899902344, + "learning_rate": 2.2457432121491028e-06, + "loss": 1.1867, + "step": 21220 + }, + { + "epoch": 2.66, + "grad_norm": 15.051604270935059, + "learning_rate": 2.244906497092415e-06, + "loss": 1.2664, + "step": 21221 + }, + { + "epoch": 2.66, + "grad_norm": 8.50916862487793, + "learning_rate": 2.244069782035728e-06, + "loss": 0.6828, + "step": 21222 + }, + { + "epoch": 2.66, + "grad_norm": 11.24258041381836, + "learning_rate": 2.2432330669790408e-06, + "loss": 0.2977, + "step": 21223 + }, + { + "epoch": 2.66, + "grad_norm": 35.36621856689453, + "learning_rate": 2.242396351922353e-06, + "loss": 2.1577, + "step": 21224 + }, + { + "epoch": 2.66, + "grad_norm": 6.245741367340088, + "learning_rate": 2.2415596368656655e-06, + "loss": 0.4049, + "step": 21225 + }, + { + "epoch": 2.66, + "grad_norm": 12.00324821472168, + "learning_rate": 2.2407229218089783e-06, + "loss": 0.4334, + "step": 21226 + }, + { + "epoch": 2.66, + "grad_norm": 23.8895320892334, + "learning_rate": 2.2398862067522907e-06, + "loss": 0.9146, + "step": 21227 + }, + { + "epoch": 2.66, + "grad_norm": 8.3075532913208, + "learning_rate": 2.239049491695603e-06, + "loss": 0.3673, + "step": 21228 + }, + { + "epoch": 2.66, + "grad_norm": 31.701955795288086, + "learning_rate": 2.238212776638916e-06, + "loss": 2.4295, + "step": 21229 + }, + { + "epoch": 2.66, + "grad_norm": 52.2243537902832, + "learning_rate": 2.2373760615822287e-06, + "loss": 0.9841, + "step": 21230 + }, + { + "epoch": 2.66, + "grad_norm": 18.255245208740234, + "learning_rate": 2.2365393465255406e-06, + "loss": 1.609, + "step": 21231 + }, + { + "epoch": 2.66, + "grad_norm": 331.67742919921875, + "learning_rate": 2.2357026314688534e-06, + "loss": 2.2401, + "step": 21232 + }, + { + "epoch": 2.66, + "grad_norm": 21.084026336669922, + "learning_rate": 2.2348659164121662e-06, + "loss": 1.3826, + "step": 21233 + }, + { + "epoch": 2.66, + "grad_norm": 41.60992431640625, + "learning_rate": 2.2340292013554786e-06, + "loss": 0.5638, + "step": 21234 + }, + { + "epoch": 2.66, + "grad_norm": 25.75510597229004, + "learning_rate": 2.233192486298791e-06, + "loss": 2.0345, + "step": 21235 + }, + { + "epoch": 2.67, + "grad_norm": 9.333986282348633, + "learning_rate": 2.232355771242104e-06, + "loss": 0.7398, + "step": 21236 + }, + { + "epoch": 2.67, + "grad_norm": 13.220513343811035, + "learning_rate": 2.231519056185416e-06, + "loss": 0.7039, + "step": 21237 + }, + { + "epoch": 2.67, + "grad_norm": 13.639309883117676, + "learning_rate": 2.2306823411287286e-06, + "loss": 0.7037, + "step": 21238 + }, + { + "epoch": 2.67, + "grad_norm": 39.554649353027344, + "learning_rate": 2.2298456260720414e-06, + "loss": 1.9648, + "step": 21239 + }, + { + "epoch": 2.67, + "grad_norm": 8.830999374389648, + "learning_rate": 2.229008911015354e-06, + "loss": 1.0308, + "step": 21240 + }, + { + "epoch": 2.67, + "grad_norm": 5.17583703994751, + "learning_rate": 2.2281721959586665e-06, + "loss": 0.1988, + "step": 21241 + }, + { + "epoch": 2.67, + "grad_norm": 15.095453262329102, + "learning_rate": 2.227335480901979e-06, + "loss": 0.77, + "step": 21242 + }, + { + "epoch": 2.67, + "grad_norm": 35.32551956176758, + "learning_rate": 2.2264987658452917e-06, + "loss": 3.1314, + "step": 21243 + }, + { + "epoch": 2.67, + "grad_norm": 18.713911056518555, + "learning_rate": 2.225662050788604e-06, + "loss": 0.9851, + "step": 21244 + }, + { + "epoch": 2.67, + "grad_norm": 203.83648681640625, + "learning_rate": 2.2248253357319165e-06, + "loss": 1.8396, + "step": 21245 + }, + { + "epoch": 2.67, + "grad_norm": 14.608433723449707, + "learning_rate": 2.2239886206752293e-06, + "loss": 1.2817, + "step": 21246 + }, + { + "epoch": 2.67, + "grad_norm": 14.23134994506836, + "learning_rate": 2.223151905618542e-06, + "loss": 0.4104, + "step": 21247 + }, + { + "epoch": 2.67, + "grad_norm": 10.220736503601074, + "learning_rate": 2.222315190561854e-06, + "loss": 0.369, + "step": 21248 + }, + { + "epoch": 2.67, + "grad_norm": 26.177026748657227, + "learning_rate": 2.221478475505167e-06, + "loss": 0.8393, + "step": 21249 + }, + { + "epoch": 2.67, + "grad_norm": 52.15340805053711, + "learning_rate": 2.2206417604484796e-06, + "loss": 1.0064, + "step": 21250 + }, + { + "epoch": 2.67, + "grad_norm": 20.59659194946289, + "learning_rate": 2.219805045391792e-06, + "loss": 0.7332, + "step": 21251 + }, + { + "epoch": 2.67, + "grad_norm": 13.785306930541992, + "learning_rate": 2.2189683303351044e-06, + "loss": 0.6688, + "step": 21252 + }, + { + "epoch": 2.67, + "grad_norm": 16.041227340698242, + "learning_rate": 2.218131615278417e-06, + "loss": 0.7687, + "step": 21253 + }, + { + "epoch": 2.67, + "grad_norm": 26.35457420349121, + "learning_rate": 2.2172949002217296e-06, + "loss": 0.5819, + "step": 21254 + }, + { + "epoch": 2.67, + "grad_norm": 24.650827407836914, + "learning_rate": 2.216458185165042e-06, + "loss": 0.9493, + "step": 21255 + }, + { + "epoch": 2.67, + "grad_norm": 34.303810119628906, + "learning_rate": 2.2156214701083548e-06, + "loss": 1.2207, + "step": 21256 + }, + { + "epoch": 2.67, + "grad_norm": 208.72035217285156, + "learning_rate": 2.2147847550516676e-06, + "loss": 1.478, + "step": 21257 + }, + { + "epoch": 2.67, + "grad_norm": 49.18112564086914, + "learning_rate": 2.21394803999498e-06, + "loss": 0.9865, + "step": 21258 + }, + { + "epoch": 2.67, + "grad_norm": 23.931133270263672, + "learning_rate": 2.2131113249382923e-06, + "loss": 1.3532, + "step": 21259 + }, + { + "epoch": 2.67, + "grad_norm": 5.023369312286377, + "learning_rate": 2.212274609881605e-06, + "loss": 0.4093, + "step": 21260 + }, + { + "epoch": 2.67, + "grad_norm": 29.752965927124023, + "learning_rate": 2.2114378948249175e-06, + "loss": 0.7628, + "step": 21261 + }, + { + "epoch": 2.67, + "grad_norm": 7.689481258392334, + "learning_rate": 2.21060117976823e-06, + "loss": 0.2462, + "step": 21262 + }, + { + "epoch": 2.67, + "grad_norm": 27.474773406982422, + "learning_rate": 2.2097644647115427e-06, + "loss": 0.7475, + "step": 21263 + }, + { + "epoch": 2.67, + "grad_norm": 21.958890914916992, + "learning_rate": 2.2089277496548555e-06, + "loss": 0.4677, + "step": 21264 + }, + { + "epoch": 2.67, + "grad_norm": 14.977560997009277, + "learning_rate": 2.208091034598168e-06, + "loss": 0.9122, + "step": 21265 + }, + { + "epoch": 2.67, + "grad_norm": 8.61684513092041, + "learning_rate": 2.2072543195414802e-06, + "loss": 0.3372, + "step": 21266 + }, + { + "epoch": 2.67, + "grad_norm": 16.10511589050293, + "learning_rate": 2.206417604484793e-06, + "loss": 0.5668, + "step": 21267 + }, + { + "epoch": 2.67, + "grad_norm": 15.77026653289795, + "learning_rate": 2.2055808894281054e-06, + "loss": 0.8589, + "step": 21268 + }, + { + "epoch": 2.67, + "grad_norm": 11.518630027770996, + "learning_rate": 2.2047441743714178e-06, + "loss": 0.2575, + "step": 21269 + }, + { + "epoch": 2.67, + "grad_norm": 1.5974454879760742, + "learning_rate": 2.2039074593147306e-06, + "loss": 0.0352, + "step": 21270 + }, + { + "epoch": 2.67, + "grad_norm": 8.891850471496582, + "learning_rate": 2.2030707442580434e-06, + "loss": 0.8898, + "step": 21271 + }, + { + "epoch": 2.67, + "grad_norm": 23.711355209350586, + "learning_rate": 2.2022340292013553e-06, + "loss": 0.5634, + "step": 21272 + }, + { + "epoch": 2.67, + "grad_norm": 23.55339813232422, + "learning_rate": 2.201397314144668e-06, + "loss": 0.7932, + "step": 21273 + }, + { + "epoch": 2.67, + "grad_norm": 24.76299285888672, + "learning_rate": 2.200560599087981e-06, + "loss": 1.3911, + "step": 21274 + }, + { + "epoch": 2.67, + "grad_norm": 14.951611518859863, + "learning_rate": 2.1997238840312933e-06, + "loss": 1.411, + "step": 21275 + }, + { + "epoch": 2.67, + "grad_norm": 13.73678207397461, + "learning_rate": 2.1988871689746057e-06, + "loss": 0.9345, + "step": 21276 + }, + { + "epoch": 2.67, + "grad_norm": 16.367549896240234, + "learning_rate": 2.1980504539179185e-06, + "loss": 0.5163, + "step": 21277 + }, + { + "epoch": 2.67, + "grad_norm": 11.205177307128906, + "learning_rate": 2.197213738861231e-06, + "loss": 0.7954, + "step": 21278 + }, + { + "epoch": 2.67, + "grad_norm": 12.95816707611084, + "learning_rate": 2.1963770238045433e-06, + "loss": 0.5146, + "step": 21279 + }, + { + "epoch": 2.67, + "grad_norm": 11.08674430847168, + "learning_rate": 2.195540308747856e-06, + "loss": 1.5952, + "step": 21280 + }, + { + "epoch": 2.67, + "grad_norm": 23.338382720947266, + "learning_rate": 2.194703593691169e-06, + "loss": 1.3663, + "step": 21281 + }, + { + "epoch": 2.67, + "grad_norm": 6.167308330535889, + "learning_rate": 2.1938668786344812e-06, + "loss": 0.209, + "step": 21282 + }, + { + "epoch": 2.67, + "grad_norm": 33.593170166015625, + "learning_rate": 2.1930301635777936e-06, + "loss": 0.759, + "step": 21283 + }, + { + "epoch": 2.67, + "grad_norm": 15.196028709411621, + "learning_rate": 2.1921934485211064e-06, + "loss": 1.0326, + "step": 21284 + }, + { + "epoch": 2.67, + "grad_norm": 13.69805908203125, + "learning_rate": 2.191356733464419e-06, + "loss": 3.6335, + "step": 21285 + }, + { + "epoch": 2.67, + "grad_norm": 18.835142135620117, + "learning_rate": 2.190520018407731e-06, + "loss": 0.5714, + "step": 21286 + }, + { + "epoch": 2.67, + "grad_norm": 13.020919799804688, + "learning_rate": 2.189683303351044e-06, + "loss": 1.5028, + "step": 21287 + }, + { + "epoch": 2.67, + "grad_norm": 38.04112243652344, + "learning_rate": 2.1888465882943568e-06, + "loss": 3.1631, + "step": 21288 + }, + { + "epoch": 2.67, + "grad_norm": 100.92923736572266, + "learning_rate": 2.188009873237669e-06, + "loss": 1.2095, + "step": 21289 + }, + { + "epoch": 2.67, + "grad_norm": 155.2234344482422, + "learning_rate": 2.1871731581809815e-06, + "loss": 1.6139, + "step": 21290 + }, + { + "epoch": 2.67, + "grad_norm": 18.216903686523438, + "learning_rate": 2.1863364431242943e-06, + "loss": 1.0611, + "step": 21291 + }, + { + "epoch": 2.67, + "grad_norm": 16.728137969970703, + "learning_rate": 2.1854997280676067e-06, + "loss": 0.6507, + "step": 21292 + }, + { + "epoch": 2.67, + "grad_norm": 11.784873962402344, + "learning_rate": 2.184663013010919e-06, + "loss": 2.4959, + "step": 21293 + }, + { + "epoch": 2.67, + "grad_norm": 17.049114227294922, + "learning_rate": 2.183826297954232e-06, + "loss": 0.7656, + "step": 21294 + }, + { + "epoch": 2.67, + "grad_norm": 26.675931930541992, + "learning_rate": 2.1829895828975443e-06, + "loss": 1.5826, + "step": 21295 + }, + { + "epoch": 2.67, + "grad_norm": 10.614181518554688, + "learning_rate": 2.1821528678408567e-06, + "loss": 0.8793, + "step": 21296 + }, + { + "epoch": 2.67, + "grad_norm": 11.736658096313477, + "learning_rate": 2.1813161527841695e-06, + "loss": 0.8583, + "step": 21297 + }, + { + "epoch": 2.67, + "grad_norm": 10.700706481933594, + "learning_rate": 2.1804794377274823e-06, + "loss": 1.0716, + "step": 21298 + }, + { + "epoch": 2.67, + "grad_norm": 12.090224266052246, + "learning_rate": 2.1796427226707946e-06, + "loss": 0.6494, + "step": 21299 + }, + { + "epoch": 2.67, + "grad_norm": 87.24824523925781, + "learning_rate": 2.178806007614107e-06, + "loss": 2.3423, + "step": 21300 + }, + { + "epoch": 2.67, + "grad_norm": 32.511173248291016, + "learning_rate": 2.17796929255742e-06, + "loss": 1.1127, + "step": 21301 + }, + { + "epoch": 2.67, + "grad_norm": 19.083251953125, + "learning_rate": 2.177132577500732e-06, + "loss": 0.5119, + "step": 21302 + }, + { + "epoch": 2.67, + "grad_norm": 3.403653383255005, + "learning_rate": 2.1762958624440446e-06, + "loss": 0.183, + "step": 21303 + }, + { + "epoch": 2.67, + "grad_norm": 9.263798713684082, + "learning_rate": 2.1754591473873574e-06, + "loss": 0.5555, + "step": 21304 + }, + { + "epoch": 2.67, + "grad_norm": 30.40312957763672, + "learning_rate": 2.17462243233067e-06, + "loss": 0.7933, + "step": 21305 + }, + { + "epoch": 2.67, + "grad_norm": 718.2808227539062, + "learning_rate": 2.1737857172739826e-06, + "loss": 1.9912, + "step": 21306 + }, + { + "epoch": 2.67, + "grad_norm": 10.145508766174316, + "learning_rate": 2.172949002217295e-06, + "loss": 0.523, + "step": 21307 + }, + { + "epoch": 2.67, + "grad_norm": 7.460542678833008, + "learning_rate": 2.1721122871606077e-06, + "loss": 0.6282, + "step": 21308 + }, + { + "epoch": 2.67, + "grad_norm": 10.18742847442627, + "learning_rate": 2.17127557210392e-06, + "loss": 0.4166, + "step": 21309 + }, + { + "epoch": 2.67, + "grad_norm": 14.787338256835938, + "learning_rate": 2.1704388570472325e-06, + "loss": 0.526, + "step": 21310 + }, + { + "epoch": 2.67, + "grad_norm": 8.858278274536133, + "learning_rate": 2.1696021419905453e-06, + "loss": 1.3333, + "step": 21311 + }, + { + "epoch": 2.67, + "grad_norm": 6.033418655395508, + "learning_rate": 2.1687654269338577e-06, + "loss": 0.3257, + "step": 21312 + }, + { + "epoch": 2.67, + "grad_norm": 19.701229095458984, + "learning_rate": 2.1679287118771705e-06, + "loss": 0.9941, + "step": 21313 + }, + { + "epoch": 2.67, + "grad_norm": 25.754854202270508, + "learning_rate": 2.167091996820483e-06, + "loss": 1.102, + "step": 21314 + }, + { + "epoch": 2.67, + "grad_norm": 17.089696884155273, + "learning_rate": 2.1662552817637957e-06, + "loss": 0.6877, + "step": 21315 + }, + { + "epoch": 2.68, + "grad_norm": 47.599430084228516, + "learning_rate": 2.165418566707108e-06, + "loss": 3.1583, + "step": 21316 + }, + { + "epoch": 2.68, + "grad_norm": 29.585224151611328, + "learning_rate": 2.1645818516504204e-06, + "loss": 1.3397, + "step": 21317 + }, + { + "epoch": 2.68, + "grad_norm": 27.140111923217773, + "learning_rate": 2.1637451365937332e-06, + "loss": 2.3643, + "step": 21318 + }, + { + "epoch": 2.68, + "grad_norm": 11.597197532653809, + "learning_rate": 2.1629084215370456e-06, + "loss": 1.4418, + "step": 21319 + }, + { + "epoch": 2.68, + "grad_norm": 5.406264781951904, + "learning_rate": 2.1620717064803584e-06, + "loss": 0.115, + "step": 21320 + }, + { + "epoch": 2.68, + "grad_norm": 70.19218444824219, + "learning_rate": 2.1612349914236708e-06, + "loss": 1.7635, + "step": 21321 + }, + { + "epoch": 2.68, + "grad_norm": 7.70452356338501, + "learning_rate": 2.1603982763669836e-06, + "loss": 0.6965, + "step": 21322 + }, + { + "epoch": 2.68, + "grad_norm": 35.344058990478516, + "learning_rate": 2.159561561310296e-06, + "loss": 0.8861, + "step": 21323 + }, + { + "epoch": 2.68, + "grad_norm": 3.616994619369507, + "learning_rate": 2.1587248462536083e-06, + "loss": 0.1458, + "step": 21324 + }, + { + "epoch": 2.68, + "grad_norm": 8.057775497436523, + "learning_rate": 2.157888131196921e-06, + "loss": 0.5616, + "step": 21325 + }, + { + "epoch": 2.68, + "grad_norm": 192.39141845703125, + "learning_rate": 2.1570514161402335e-06, + "loss": 1.0416, + "step": 21326 + }, + { + "epoch": 2.68, + "grad_norm": 5.729534149169922, + "learning_rate": 2.1562147010835463e-06, + "loss": 0.2792, + "step": 21327 + }, + { + "epoch": 2.68, + "grad_norm": 25.381532669067383, + "learning_rate": 2.1553779860268587e-06, + "loss": 1.9781, + "step": 21328 + }, + { + "epoch": 2.68, + "grad_norm": 7.036624431610107, + "learning_rate": 2.1545412709701715e-06, + "loss": 0.547, + "step": 21329 + }, + { + "epoch": 2.68, + "grad_norm": 3.12929368019104, + "learning_rate": 2.153704555913484e-06, + "loss": 0.1703, + "step": 21330 + }, + { + "epoch": 2.68, + "grad_norm": 10.610880851745605, + "learning_rate": 2.1528678408567963e-06, + "loss": 1.7418, + "step": 21331 + }, + { + "epoch": 2.68, + "grad_norm": 5.1598801612854, + "learning_rate": 2.152031125800109e-06, + "loss": 0.2733, + "step": 21332 + }, + { + "epoch": 2.68, + "grad_norm": 11.032546997070312, + "learning_rate": 2.1511944107434214e-06, + "loss": 0.5282, + "step": 21333 + }, + { + "epoch": 2.68, + "grad_norm": 43.82704162597656, + "learning_rate": 2.1503576956867342e-06, + "loss": 0.9411, + "step": 21334 + }, + { + "epoch": 2.68, + "grad_norm": 6.436817169189453, + "learning_rate": 2.1495209806300466e-06, + "loss": 0.2611, + "step": 21335 + }, + { + "epoch": 2.68, + "grad_norm": 97.83805847167969, + "learning_rate": 2.148684265573359e-06, + "loss": 2.1338, + "step": 21336 + }, + { + "epoch": 2.68, + "grad_norm": 11.422772407531738, + "learning_rate": 2.147847550516672e-06, + "loss": 0.6259, + "step": 21337 + }, + { + "epoch": 2.68, + "grad_norm": 16.130887985229492, + "learning_rate": 2.147010835459984e-06, + "loss": 0.8244, + "step": 21338 + }, + { + "epoch": 2.68, + "grad_norm": 81.12645721435547, + "learning_rate": 2.146174120403297e-06, + "loss": 2.4464, + "step": 21339 + }, + { + "epoch": 2.68, + "grad_norm": 35.82655715942383, + "learning_rate": 2.1453374053466094e-06, + "loss": 0.8004, + "step": 21340 + }, + { + "epoch": 2.68, + "grad_norm": 70.28784942626953, + "learning_rate": 2.1445006902899217e-06, + "loss": 1.1992, + "step": 21341 + }, + { + "epoch": 2.68, + "grad_norm": 20.804058074951172, + "learning_rate": 2.1436639752332345e-06, + "loss": 1.8358, + "step": 21342 + }, + { + "epoch": 2.68, + "grad_norm": 87.13873291015625, + "learning_rate": 2.142827260176547e-06, + "loss": 0.8842, + "step": 21343 + }, + { + "epoch": 2.68, + "grad_norm": 14.349838256835938, + "learning_rate": 2.1419905451198597e-06, + "loss": 0.8715, + "step": 21344 + }, + { + "epoch": 2.68, + "grad_norm": 8.820053100585938, + "learning_rate": 2.141153830063172e-06, + "loss": 1.5338, + "step": 21345 + }, + { + "epoch": 2.68, + "grad_norm": 8.869967460632324, + "learning_rate": 2.140317115006485e-06, + "loss": 0.4037, + "step": 21346 + }, + { + "epoch": 2.68, + "grad_norm": 23.704086303710938, + "learning_rate": 2.1394803999497973e-06, + "loss": 0.6011, + "step": 21347 + }, + { + "epoch": 2.68, + "grad_norm": 10.375609397888184, + "learning_rate": 2.1386436848931096e-06, + "loss": 0.5108, + "step": 21348 + }, + { + "epoch": 2.68, + "grad_norm": 22.28101921081543, + "learning_rate": 2.1378069698364225e-06, + "loss": 1.2317, + "step": 21349 + }, + { + "epoch": 2.68, + "grad_norm": 14.879716873168945, + "learning_rate": 2.136970254779735e-06, + "loss": 1.5005, + "step": 21350 + }, + { + "epoch": 2.68, + "grad_norm": 8.093547821044922, + "learning_rate": 2.1361335397230476e-06, + "loss": 0.5505, + "step": 21351 + }, + { + "epoch": 2.68, + "grad_norm": 26.884418487548828, + "learning_rate": 2.13529682466636e-06, + "loss": 1.0462, + "step": 21352 + }, + { + "epoch": 2.68, + "grad_norm": 50.99755859375, + "learning_rate": 2.1344601096096724e-06, + "loss": 2.7212, + "step": 21353 + }, + { + "epoch": 2.68, + "grad_norm": 4.737620830535889, + "learning_rate": 2.133623394552985e-06, + "loss": 0.0827, + "step": 21354 + }, + { + "epoch": 2.68, + "grad_norm": 182.28158569335938, + "learning_rate": 2.1327866794962976e-06, + "loss": 2.0251, + "step": 21355 + }, + { + "epoch": 2.68, + "grad_norm": 12.041132926940918, + "learning_rate": 2.1319499644396104e-06, + "loss": 1.6925, + "step": 21356 + }, + { + "epoch": 2.68, + "grad_norm": 46.95587158203125, + "learning_rate": 2.1311132493829227e-06, + "loss": 2.1608, + "step": 21357 + }, + { + "epoch": 2.68, + "grad_norm": 12.917547225952148, + "learning_rate": 2.1302765343262355e-06, + "loss": 1.1931, + "step": 21358 + }, + { + "epoch": 2.68, + "grad_norm": 10.681229591369629, + "learning_rate": 2.129439819269548e-06, + "loss": 1.2388, + "step": 21359 + }, + { + "epoch": 2.68, + "grad_norm": 20.28715705871582, + "learning_rate": 2.1286031042128603e-06, + "loss": 0.7494, + "step": 21360 + }, + { + "epoch": 2.68, + "grad_norm": 5.435354232788086, + "learning_rate": 2.127766389156173e-06, + "loss": 1.407, + "step": 21361 + }, + { + "epoch": 2.68, + "grad_norm": 21.793575286865234, + "learning_rate": 2.1269296740994855e-06, + "loss": 0.791, + "step": 21362 + }, + { + "epoch": 2.68, + "grad_norm": 14.35647201538086, + "learning_rate": 2.1260929590427983e-06, + "loss": 0.7086, + "step": 21363 + }, + { + "epoch": 2.68, + "grad_norm": 7.733112335205078, + "learning_rate": 2.1252562439861107e-06, + "loss": 0.2449, + "step": 21364 + }, + { + "epoch": 2.68, + "grad_norm": 3.2984607219696045, + "learning_rate": 2.1244195289294235e-06, + "loss": 0.2176, + "step": 21365 + }, + { + "epoch": 2.68, + "grad_norm": 5.907418727874756, + "learning_rate": 2.123582813872736e-06, + "loss": 0.564, + "step": 21366 + }, + { + "epoch": 2.68, + "grad_norm": 22.646575927734375, + "learning_rate": 2.1227460988160482e-06, + "loss": 2.2823, + "step": 21367 + }, + { + "epoch": 2.68, + "grad_norm": 32.62713623046875, + "learning_rate": 2.121909383759361e-06, + "loss": 2.7619, + "step": 21368 + }, + { + "epoch": 2.68, + "grad_norm": 5.670522689819336, + "learning_rate": 2.1210726687026734e-06, + "loss": 0.2738, + "step": 21369 + }, + { + "epoch": 2.68, + "grad_norm": 4.159451007843018, + "learning_rate": 2.120235953645986e-06, + "loss": 0.1918, + "step": 21370 + }, + { + "epoch": 2.68, + "grad_norm": 18.758514404296875, + "learning_rate": 2.1193992385892986e-06, + "loss": 1.2502, + "step": 21371 + }, + { + "epoch": 2.68, + "grad_norm": 20.235401153564453, + "learning_rate": 2.1185625235326114e-06, + "loss": 1.0483, + "step": 21372 + }, + { + "epoch": 2.68, + "grad_norm": 68.90306091308594, + "learning_rate": 2.1177258084759238e-06, + "loss": 2.1111, + "step": 21373 + }, + { + "epoch": 2.68, + "grad_norm": 8.712653160095215, + "learning_rate": 2.116889093419236e-06, + "loss": 0.4773, + "step": 21374 + }, + { + "epoch": 2.68, + "grad_norm": 27.22140884399414, + "learning_rate": 2.116052378362549e-06, + "loss": 2.1803, + "step": 21375 + }, + { + "epoch": 2.68, + "grad_norm": 4.715273857116699, + "learning_rate": 2.1152156633058613e-06, + "loss": 0.2097, + "step": 21376 + }, + { + "epoch": 2.68, + "grad_norm": 34.50177764892578, + "learning_rate": 2.1143789482491737e-06, + "loss": 1.2874, + "step": 21377 + }, + { + "epoch": 2.68, + "grad_norm": 100.45948028564453, + "learning_rate": 2.1135422331924865e-06, + "loss": 1.0679, + "step": 21378 + }, + { + "epoch": 2.68, + "grad_norm": 13.128931999206543, + "learning_rate": 2.1127055181357993e-06, + "loss": 0.8483, + "step": 21379 + }, + { + "epoch": 2.68, + "grad_norm": 23.208322525024414, + "learning_rate": 2.1118688030791117e-06, + "loss": 1.6605, + "step": 21380 + }, + { + "epoch": 2.68, + "grad_norm": 17.86162757873535, + "learning_rate": 2.111032088022424e-06, + "loss": 0.9252, + "step": 21381 + }, + { + "epoch": 2.68, + "grad_norm": 12.019597053527832, + "learning_rate": 2.110195372965737e-06, + "loss": 0.685, + "step": 21382 + }, + { + "epoch": 2.68, + "grad_norm": 9.456475257873535, + "learning_rate": 2.1093586579090492e-06, + "loss": 0.6459, + "step": 21383 + }, + { + "epoch": 2.68, + "grad_norm": 16.749839782714844, + "learning_rate": 2.1085219428523616e-06, + "loss": 1.6394, + "step": 21384 + }, + { + "epoch": 2.68, + "grad_norm": 728.4263305664062, + "learning_rate": 2.1076852277956744e-06, + "loss": 1.6383, + "step": 21385 + }, + { + "epoch": 2.68, + "grad_norm": 16.955371856689453, + "learning_rate": 2.106848512738987e-06, + "loss": 0.7841, + "step": 21386 + }, + { + "epoch": 2.68, + "grad_norm": 35.67090606689453, + "learning_rate": 2.1060117976822996e-06, + "loss": 2.771, + "step": 21387 + }, + { + "epoch": 2.68, + "grad_norm": 13.471112251281738, + "learning_rate": 2.105175082625612e-06, + "loss": 1.3309, + "step": 21388 + }, + { + "epoch": 2.68, + "grad_norm": 12.307778358459473, + "learning_rate": 2.1043383675689248e-06, + "loss": 0.7766, + "step": 21389 + }, + { + "epoch": 2.68, + "grad_norm": 30.031293869018555, + "learning_rate": 2.103501652512237e-06, + "loss": 1.2038, + "step": 21390 + }, + { + "epoch": 2.68, + "grad_norm": 8.911970138549805, + "learning_rate": 2.1026649374555495e-06, + "loss": 0.6402, + "step": 21391 + }, + { + "epoch": 2.68, + "grad_norm": 21.951520919799805, + "learning_rate": 2.1018282223988623e-06, + "loss": 0.8277, + "step": 21392 + }, + { + "epoch": 2.68, + "grad_norm": 4.777732849121094, + "learning_rate": 2.1009915073421747e-06, + "loss": 0.3424, + "step": 21393 + }, + { + "epoch": 2.68, + "grad_norm": 17.507043838500977, + "learning_rate": 2.100154792285487e-06, + "loss": 1.7706, + "step": 21394 + }, + { + "epoch": 2.69, + "grad_norm": 47.57414627075195, + "learning_rate": 2.0993180772288e-06, + "loss": 3.0551, + "step": 21395 + }, + { + "epoch": 2.69, + "grad_norm": 30.226314544677734, + "learning_rate": 2.0984813621721127e-06, + "loss": 0.5564, + "step": 21396 + }, + { + "epoch": 2.69, + "grad_norm": 82.02247619628906, + "learning_rate": 2.097644647115425e-06, + "loss": 1.1771, + "step": 21397 + }, + { + "epoch": 2.69, + "grad_norm": 17.59989356994629, + "learning_rate": 2.0968079320587375e-06, + "loss": 0.6229, + "step": 21398 + }, + { + "epoch": 2.69, + "grad_norm": 13.957846641540527, + "learning_rate": 2.0959712170020503e-06, + "loss": 0.3632, + "step": 21399 + }, + { + "epoch": 2.69, + "grad_norm": 7.702903747558594, + "learning_rate": 2.0951345019453626e-06, + "loss": 0.5295, + "step": 21400 + }, + { + "epoch": 2.69, + "grad_norm": 34.69055938720703, + "learning_rate": 2.094297786888675e-06, + "loss": 1.3101, + "step": 21401 + }, + { + "epoch": 2.69, + "grad_norm": 22.140445709228516, + "learning_rate": 2.093461071831988e-06, + "loss": 0.8607, + "step": 21402 + }, + { + "epoch": 2.69, + "grad_norm": 12.767571449279785, + "learning_rate": 2.0926243567753006e-06, + "loss": 0.5277, + "step": 21403 + }, + { + "epoch": 2.69, + "grad_norm": 6.7225775718688965, + "learning_rate": 2.091787641718613e-06, + "loss": 0.1126, + "step": 21404 + }, + { + "epoch": 2.69, + "grad_norm": 21.867769241333008, + "learning_rate": 2.0909509266619254e-06, + "loss": 1.0007, + "step": 21405 + }, + { + "epoch": 2.69, + "grad_norm": 5.484789848327637, + "learning_rate": 2.090114211605238e-06, + "loss": 0.2842, + "step": 21406 + }, + { + "epoch": 2.69, + "grad_norm": 10.440987586975098, + "learning_rate": 2.0892774965485506e-06, + "loss": 0.4244, + "step": 21407 + }, + { + "epoch": 2.69, + "grad_norm": 18.795623779296875, + "learning_rate": 2.088440781491863e-06, + "loss": 0.6584, + "step": 21408 + }, + { + "epoch": 2.69, + "grad_norm": 14.028873443603516, + "learning_rate": 2.0876040664351757e-06, + "loss": 0.7828, + "step": 21409 + }, + { + "epoch": 2.69, + "grad_norm": 9.235030174255371, + "learning_rate": 2.0867673513784885e-06, + "loss": 0.3285, + "step": 21410 + }, + { + "epoch": 2.69, + "grad_norm": 10.508769035339355, + "learning_rate": 2.085930636321801e-06, + "loss": 0.2151, + "step": 21411 + }, + { + "epoch": 2.69, + "grad_norm": 10.280550003051758, + "learning_rate": 2.0850939212651133e-06, + "loss": 0.6408, + "step": 21412 + }, + { + "epoch": 2.69, + "grad_norm": 11.279296875, + "learning_rate": 2.084257206208426e-06, + "loss": 0.49, + "step": 21413 + }, + { + "epoch": 2.69, + "grad_norm": 5.540591716766357, + "learning_rate": 2.0834204911517385e-06, + "loss": 0.2417, + "step": 21414 + }, + { + "epoch": 2.69, + "grad_norm": 43.89884948730469, + "learning_rate": 2.082583776095051e-06, + "loss": 1.1984, + "step": 21415 + }, + { + "epoch": 2.69, + "grad_norm": 11.392531394958496, + "learning_rate": 2.0817470610383637e-06, + "loss": 0.4732, + "step": 21416 + }, + { + "epoch": 2.69, + "grad_norm": 21.991235733032227, + "learning_rate": 2.080910345981676e-06, + "loss": 2.2995, + "step": 21417 + }, + { + "epoch": 2.69, + "grad_norm": 12.45938491821289, + "learning_rate": 2.0800736309249884e-06, + "loss": 0.8814, + "step": 21418 + }, + { + "epoch": 2.69, + "grad_norm": 12.782137870788574, + "learning_rate": 2.079236915868301e-06, + "loss": 0.8995, + "step": 21419 + }, + { + "epoch": 2.69, + "grad_norm": 23.23676300048828, + "learning_rate": 2.078400200811614e-06, + "loss": 1.4313, + "step": 21420 + }, + { + "epoch": 2.69, + "grad_norm": 21.540143966674805, + "learning_rate": 2.0775634857549264e-06, + "loss": 0.9224, + "step": 21421 + }, + { + "epoch": 2.69, + "grad_norm": 14.929125785827637, + "learning_rate": 2.0767267706982388e-06, + "loss": 1.0599, + "step": 21422 + }, + { + "epoch": 2.69, + "grad_norm": 6.553576469421387, + "learning_rate": 2.0758900556415516e-06, + "loss": 0.4106, + "step": 21423 + }, + { + "epoch": 2.69, + "grad_norm": 83.32090759277344, + "learning_rate": 2.075053340584864e-06, + "loss": 2.2362, + "step": 21424 + }, + { + "epoch": 2.69, + "grad_norm": 37.81772994995117, + "learning_rate": 2.0742166255281763e-06, + "loss": 1.1683, + "step": 21425 + }, + { + "epoch": 2.69, + "grad_norm": 16.711854934692383, + "learning_rate": 2.073379910471489e-06, + "loss": 0.4657, + "step": 21426 + }, + { + "epoch": 2.69, + "grad_norm": 17.520536422729492, + "learning_rate": 2.072543195414802e-06, + "loss": 0.4987, + "step": 21427 + }, + { + "epoch": 2.69, + "grad_norm": 21.420486450195312, + "learning_rate": 2.0717064803581143e-06, + "loss": 1.0809, + "step": 21428 + }, + { + "epoch": 2.69, + "grad_norm": 13.875296592712402, + "learning_rate": 2.0708697653014267e-06, + "loss": 0.8191, + "step": 21429 + }, + { + "epoch": 2.69, + "grad_norm": 13.885878562927246, + "learning_rate": 2.0700330502447395e-06, + "loss": 1.0701, + "step": 21430 + }, + { + "epoch": 2.69, + "grad_norm": 14.665055274963379, + "learning_rate": 2.069196335188052e-06, + "loss": 0.6985, + "step": 21431 + }, + { + "epoch": 2.69, + "grad_norm": 9.924116134643555, + "learning_rate": 2.0683596201313642e-06, + "loss": 0.5416, + "step": 21432 + }, + { + "epoch": 2.69, + "grad_norm": 38.15719985961914, + "learning_rate": 2.067522905074677e-06, + "loss": 0.415, + "step": 21433 + }, + { + "epoch": 2.69, + "grad_norm": 6.417823791503906, + "learning_rate": 2.06668619001799e-06, + "loss": 1.634, + "step": 21434 + }, + { + "epoch": 2.69, + "grad_norm": 14.809370040893555, + "learning_rate": 2.065849474961302e-06, + "loss": 0.4852, + "step": 21435 + }, + { + "epoch": 2.69, + "grad_norm": 23.902238845825195, + "learning_rate": 2.0650127599046146e-06, + "loss": 1.0686, + "step": 21436 + }, + { + "epoch": 2.69, + "grad_norm": 12.463006019592285, + "learning_rate": 2.0641760448479274e-06, + "loss": 0.564, + "step": 21437 + }, + { + "epoch": 2.69, + "grad_norm": 4.596797943115234, + "learning_rate": 2.0633393297912398e-06, + "loss": 0.521, + "step": 21438 + }, + { + "epoch": 2.69, + "grad_norm": 19.941085815429688, + "learning_rate": 2.062502614734552e-06, + "loss": 0.5877, + "step": 21439 + }, + { + "epoch": 2.69, + "grad_norm": 12.362852096557617, + "learning_rate": 2.061665899677865e-06, + "loss": 0.7027, + "step": 21440 + }, + { + "epoch": 2.69, + "grad_norm": 32.76074981689453, + "learning_rate": 2.0608291846211773e-06, + "loss": 0.7459, + "step": 21441 + }, + { + "epoch": 2.69, + "grad_norm": 7.287831783294678, + "learning_rate": 2.0599924695644897e-06, + "loss": 0.3511, + "step": 21442 + }, + { + "epoch": 2.69, + "grad_norm": 4.938597679138184, + "learning_rate": 2.0591557545078025e-06, + "loss": 0.5302, + "step": 21443 + }, + { + "epoch": 2.69, + "grad_norm": 7.694718837738037, + "learning_rate": 2.0583190394511153e-06, + "loss": 0.1274, + "step": 21444 + }, + { + "epoch": 2.69, + "grad_norm": 21.389583587646484, + "learning_rate": 2.0574823243944277e-06, + "loss": 1.3318, + "step": 21445 + }, + { + "epoch": 2.69, + "grad_norm": 16.10489845275879, + "learning_rate": 2.05664560933774e-06, + "loss": 1.4647, + "step": 21446 + }, + { + "epoch": 2.69, + "grad_norm": 19.475576400756836, + "learning_rate": 2.055808894281053e-06, + "loss": 0.9934, + "step": 21447 + }, + { + "epoch": 2.69, + "grad_norm": 4.483088970184326, + "learning_rate": 2.0549721792243653e-06, + "loss": 0.0673, + "step": 21448 + }, + { + "epoch": 2.69, + "grad_norm": 9.39845085144043, + "learning_rate": 2.0541354641676776e-06, + "loss": 0.52, + "step": 21449 + }, + { + "epoch": 2.69, + "grad_norm": 24.6356201171875, + "learning_rate": 2.0532987491109904e-06, + "loss": 1.1128, + "step": 21450 + }, + { + "epoch": 2.69, + "grad_norm": 7.2979512214660645, + "learning_rate": 2.0524620340543032e-06, + "loss": 0.5031, + "step": 21451 + }, + { + "epoch": 2.69, + "grad_norm": 7.22651481628418, + "learning_rate": 2.051625318997615e-06, + "loss": 0.4692, + "step": 21452 + }, + { + "epoch": 2.69, + "grad_norm": 13.737506866455078, + "learning_rate": 2.050788603940928e-06, + "loss": 1.4457, + "step": 21453 + }, + { + "epoch": 2.69, + "grad_norm": 11.899345397949219, + "learning_rate": 2.049951888884241e-06, + "loss": 0.4482, + "step": 21454 + }, + { + "epoch": 2.69, + "grad_norm": 14.08261489868164, + "learning_rate": 2.049115173827553e-06, + "loss": 0.4664, + "step": 21455 + }, + { + "epoch": 2.69, + "grad_norm": 15.5072021484375, + "learning_rate": 2.0482784587708656e-06, + "loss": 0.3376, + "step": 21456 + }, + { + "epoch": 2.69, + "grad_norm": 14.26341724395752, + "learning_rate": 2.0474417437141784e-06, + "loss": 0.8661, + "step": 21457 + }, + { + "epoch": 2.69, + "grad_norm": 11.781574249267578, + "learning_rate": 2.0466050286574907e-06, + "loss": 2.138, + "step": 21458 + }, + { + "epoch": 2.69, + "grad_norm": 21.719669342041016, + "learning_rate": 2.045768313600803e-06, + "loss": 0.4253, + "step": 21459 + }, + { + "epoch": 2.69, + "grad_norm": 9.645719528198242, + "learning_rate": 2.044931598544116e-06, + "loss": 0.5508, + "step": 21460 + }, + { + "epoch": 2.69, + "grad_norm": 4.674021244049072, + "learning_rate": 2.0440948834874287e-06, + "loss": 0.0924, + "step": 21461 + }, + { + "epoch": 2.69, + "grad_norm": 15.2584228515625, + "learning_rate": 2.043258168430741e-06, + "loss": 1.799, + "step": 21462 + }, + { + "epoch": 2.69, + "grad_norm": 5.222739219665527, + "learning_rate": 2.0424214533740535e-06, + "loss": 0.2849, + "step": 21463 + }, + { + "epoch": 2.69, + "grad_norm": 25.117767333984375, + "learning_rate": 2.0415847383173663e-06, + "loss": 1.5771, + "step": 21464 + }, + { + "epoch": 2.69, + "grad_norm": 24.659822463989258, + "learning_rate": 2.0407480232606787e-06, + "loss": 0.7105, + "step": 21465 + }, + { + "epoch": 2.69, + "grad_norm": 64.56285095214844, + "learning_rate": 2.039911308203991e-06, + "loss": 2.2335, + "step": 21466 + }, + { + "epoch": 2.69, + "grad_norm": 7.914119243621826, + "learning_rate": 2.039074593147304e-06, + "loss": 0.6704, + "step": 21467 + }, + { + "epoch": 2.69, + "grad_norm": 19.407703399658203, + "learning_rate": 2.0382378780906166e-06, + "loss": 1.5496, + "step": 21468 + }, + { + "epoch": 2.69, + "grad_norm": 106.05070495605469, + "learning_rate": 2.037401163033929e-06, + "loss": 1.4463, + "step": 21469 + }, + { + "epoch": 2.69, + "grad_norm": 7.93324089050293, + "learning_rate": 2.0365644479772414e-06, + "loss": 0.5607, + "step": 21470 + }, + { + "epoch": 2.69, + "grad_norm": 18.987442016601562, + "learning_rate": 2.035727732920554e-06, + "loss": 0.6445, + "step": 21471 + }, + { + "epoch": 2.69, + "grad_norm": 36.77301788330078, + "learning_rate": 2.0348910178638666e-06, + "loss": 2.6085, + "step": 21472 + }, + { + "epoch": 2.69, + "grad_norm": 21.309961318969727, + "learning_rate": 2.034054302807179e-06, + "loss": 1.2199, + "step": 21473 + }, + { + "epoch": 2.69, + "grad_norm": 22.824777603149414, + "learning_rate": 2.0332175877504918e-06, + "loss": 2.7914, + "step": 21474 + }, + { + "epoch": 2.7, + "grad_norm": 28.599380493164062, + "learning_rate": 2.0323808726938046e-06, + "loss": 0.6091, + "step": 21475 + }, + { + "epoch": 2.7, + "grad_norm": 13.5480375289917, + "learning_rate": 2.0315441576371165e-06, + "loss": 1.1295, + "step": 21476 + }, + { + "epoch": 2.7, + "grad_norm": 7.5399956703186035, + "learning_rate": 2.0307074425804293e-06, + "loss": 0.3935, + "step": 21477 + }, + { + "epoch": 2.7, + "grad_norm": 12.060484886169434, + "learning_rate": 2.029870727523742e-06, + "loss": 0.8903, + "step": 21478 + }, + { + "epoch": 2.7, + "grad_norm": 14.38305950164795, + "learning_rate": 2.0290340124670545e-06, + "loss": 1.5363, + "step": 21479 + }, + { + "epoch": 2.7, + "grad_norm": 22.1859130859375, + "learning_rate": 2.028197297410367e-06, + "loss": 1.9556, + "step": 21480 + }, + { + "epoch": 2.7, + "grad_norm": 15.168512344360352, + "learning_rate": 2.0273605823536797e-06, + "loss": 0.8704, + "step": 21481 + }, + { + "epoch": 2.7, + "grad_norm": 12.938074111938477, + "learning_rate": 2.026523867296992e-06, + "loss": 0.9819, + "step": 21482 + }, + { + "epoch": 2.7, + "grad_norm": 15.477502822875977, + "learning_rate": 2.0256871522403044e-06, + "loss": 0.4481, + "step": 21483 + }, + { + "epoch": 2.7, + "grad_norm": 21.59099578857422, + "learning_rate": 2.0248504371836172e-06, + "loss": 2.4869, + "step": 21484 + }, + { + "epoch": 2.7, + "grad_norm": 28.2064151763916, + "learning_rate": 2.02401372212693e-06, + "loss": 1.1286, + "step": 21485 + }, + { + "epoch": 2.7, + "grad_norm": 10.416912078857422, + "learning_rate": 2.0231770070702424e-06, + "loss": 0.4991, + "step": 21486 + }, + { + "epoch": 2.7, + "grad_norm": 11.99328899383545, + "learning_rate": 2.022340292013555e-06, + "loss": 1.4455, + "step": 21487 + }, + { + "epoch": 2.7, + "grad_norm": 82.79418182373047, + "learning_rate": 2.0215035769568676e-06, + "loss": 2.1569, + "step": 21488 + }, + { + "epoch": 2.7, + "grad_norm": 17.955158233642578, + "learning_rate": 2.02066686190018e-06, + "loss": 2.2086, + "step": 21489 + }, + { + "epoch": 2.7, + "grad_norm": 6.074029922485352, + "learning_rate": 2.0198301468434924e-06, + "loss": 0.1896, + "step": 21490 + }, + { + "epoch": 2.7, + "grad_norm": 18.251638412475586, + "learning_rate": 2.018993431786805e-06, + "loss": 1.5237, + "step": 21491 + }, + { + "epoch": 2.7, + "grad_norm": 10.850665092468262, + "learning_rate": 2.018156716730118e-06, + "loss": 0.7379, + "step": 21492 + }, + { + "epoch": 2.7, + "grad_norm": 16.91484260559082, + "learning_rate": 2.0173200016734303e-06, + "loss": 0.9855, + "step": 21493 + }, + { + "epoch": 2.7, + "grad_norm": 22.841798782348633, + "learning_rate": 2.0164832866167427e-06, + "loss": 0.374, + "step": 21494 + }, + { + "epoch": 2.7, + "grad_norm": 43.07855224609375, + "learning_rate": 2.0156465715600555e-06, + "loss": 1.9408, + "step": 21495 + }, + { + "epoch": 2.7, + "grad_norm": 14.059098243713379, + "learning_rate": 2.014809856503368e-06, + "loss": 0.7753, + "step": 21496 + }, + { + "epoch": 2.7, + "grad_norm": 4.776729583740234, + "learning_rate": 2.0139731414466803e-06, + "loss": 0.1441, + "step": 21497 + }, + { + "epoch": 2.7, + "grad_norm": 9.423643112182617, + "learning_rate": 2.013136426389993e-06, + "loss": 0.2007, + "step": 21498 + }, + { + "epoch": 2.7, + "grad_norm": 14.687823295593262, + "learning_rate": 2.0122997113333055e-06, + "loss": 1.5296, + "step": 21499 + }, + { + "epoch": 2.7, + "grad_norm": 15.722042083740234, + "learning_rate": 2.0114629962766183e-06, + "loss": 1.5207, + "step": 21500 + }, + { + "epoch": 2.7, + "grad_norm": 17.29703140258789, + "learning_rate": 2.0106262812199306e-06, + "loss": 1.5851, + "step": 21501 + }, + { + "epoch": 2.7, + "grad_norm": 16.280162811279297, + "learning_rate": 2.0097895661632434e-06, + "loss": 0.9692, + "step": 21502 + }, + { + "epoch": 2.7, + "grad_norm": 14.40085506439209, + "learning_rate": 2.008952851106556e-06, + "loss": 1.2911, + "step": 21503 + }, + { + "epoch": 2.7, + "grad_norm": 5.068446159362793, + "learning_rate": 2.008116136049868e-06, + "loss": 1.4361, + "step": 21504 + }, + { + "epoch": 2.7, + "grad_norm": 11.074398040771484, + "learning_rate": 2.007279420993181e-06, + "loss": 0.4624, + "step": 21505 + }, + { + "epoch": 2.7, + "grad_norm": 136.94422912597656, + "learning_rate": 2.0064427059364934e-06, + "loss": 2.9412, + "step": 21506 + }, + { + "epoch": 2.7, + "grad_norm": 5.199703216552734, + "learning_rate": 2.005605990879806e-06, + "loss": 1.524, + "step": 21507 + }, + { + "epoch": 2.7, + "grad_norm": 26.445039749145508, + "learning_rate": 2.0047692758231185e-06, + "loss": 1.0059, + "step": 21508 + }, + { + "epoch": 2.7, + "grad_norm": 19.995515823364258, + "learning_rate": 2.0039325607664314e-06, + "loss": 1.796, + "step": 21509 + }, + { + "epoch": 2.7, + "grad_norm": 136.7841796875, + "learning_rate": 2.0030958457097437e-06, + "loss": 1.4174, + "step": 21510 + }, + { + "epoch": 2.7, + "grad_norm": 26.809480667114258, + "learning_rate": 2.002259130653056e-06, + "loss": 0.6017, + "step": 21511 + }, + { + "epoch": 2.7, + "grad_norm": 21.66718292236328, + "learning_rate": 2.001422415596369e-06, + "loss": 1.6301, + "step": 21512 + }, + { + "epoch": 2.7, + "grad_norm": 47.82748794555664, + "learning_rate": 2.0005857005396813e-06, + "loss": 1.1217, + "step": 21513 + }, + { + "epoch": 2.7, + "grad_norm": 32.09178161621094, + "learning_rate": 1.999748985482994e-06, + "loss": 1.5899, + "step": 21514 + }, + { + "epoch": 2.7, + "grad_norm": 17.64548110961914, + "learning_rate": 1.9989122704263065e-06, + "loss": 1.1916, + "step": 21515 + }, + { + "epoch": 2.7, + "grad_norm": 13.294170379638672, + "learning_rate": 1.9980755553696193e-06, + "loss": 0.4883, + "step": 21516 + }, + { + "epoch": 2.7, + "grad_norm": 32.86570739746094, + "learning_rate": 1.9972388403129316e-06, + "loss": 1.4325, + "step": 21517 + }, + { + "epoch": 2.7, + "grad_norm": 15.223159790039062, + "learning_rate": 1.996402125256244e-06, + "loss": 0.4239, + "step": 21518 + }, + { + "epoch": 2.7, + "grad_norm": 6.6393938064575195, + "learning_rate": 1.995565410199557e-06, + "loss": 0.4592, + "step": 21519 + }, + { + "epoch": 2.7, + "grad_norm": 9.456892967224121, + "learning_rate": 1.994728695142869e-06, + "loss": 0.8297, + "step": 21520 + }, + { + "epoch": 2.7, + "grad_norm": 35.015174865722656, + "learning_rate": 1.9938919800861816e-06, + "loss": 1.4686, + "step": 21521 + }, + { + "epoch": 2.7, + "grad_norm": 15.150407791137695, + "learning_rate": 1.9930552650294944e-06, + "loss": 0.7951, + "step": 21522 + }, + { + "epoch": 2.7, + "grad_norm": 175.0820770263672, + "learning_rate": 1.9922185499728068e-06, + "loss": 1.808, + "step": 21523 + }, + { + "epoch": 2.7, + "grad_norm": 7.424928188323975, + "learning_rate": 1.9913818349161196e-06, + "loss": 0.266, + "step": 21524 + }, + { + "epoch": 2.7, + "grad_norm": 298.6964111328125, + "learning_rate": 1.990545119859432e-06, + "loss": 0.8698, + "step": 21525 + }, + { + "epoch": 2.7, + "grad_norm": 35.207061767578125, + "learning_rate": 1.9897084048027447e-06, + "loss": 1.2134, + "step": 21526 + }, + { + "epoch": 2.7, + "grad_norm": 5.058426856994629, + "learning_rate": 1.988871689746057e-06, + "loss": 0.4285, + "step": 21527 + }, + { + "epoch": 2.7, + "grad_norm": 13.8907470703125, + "learning_rate": 1.9880349746893695e-06, + "loss": 0.6789, + "step": 21528 + }, + { + "epoch": 2.7, + "grad_norm": 27.26032829284668, + "learning_rate": 1.9871982596326823e-06, + "loss": 0.8729, + "step": 21529 + }, + { + "epoch": 2.7, + "grad_norm": 11.517545700073242, + "learning_rate": 1.9863615445759947e-06, + "loss": 1.5727, + "step": 21530 + }, + { + "epoch": 2.7, + "grad_norm": 38.588294982910156, + "learning_rate": 1.9855248295193075e-06, + "loss": 2.6482, + "step": 21531 + }, + { + "epoch": 2.7, + "grad_norm": 10.834522247314453, + "learning_rate": 1.98468811446262e-06, + "loss": 1.2162, + "step": 21532 + }, + { + "epoch": 2.7, + "grad_norm": 15.166847229003906, + "learning_rate": 1.9838513994059327e-06, + "loss": 0.7046, + "step": 21533 + }, + { + "epoch": 2.7, + "grad_norm": 4.857996463775635, + "learning_rate": 1.983014684349245e-06, + "loss": 0.5158, + "step": 21534 + }, + { + "epoch": 2.7, + "grad_norm": 6.261816024780273, + "learning_rate": 1.9821779692925574e-06, + "loss": 0.4164, + "step": 21535 + }, + { + "epoch": 2.7, + "grad_norm": 11.289036750793457, + "learning_rate": 1.9813412542358702e-06, + "loss": 0.5289, + "step": 21536 + }, + { + "epoch": 2.7, + "grad_norm": 12.965081214904785, + "learning_rate": 1.9805045391791826e-06, + "loss": 1.0809, + "step": 21537 + }, + { + "epoch": 2.7, + "grad_norm": 12.697193145751953, + "learning_rate": 1.9796678241224954e-06, + "loss": 1.7587, + "step": 21538 + }, + { + "epoch": 2.7, + "grad_norm": 13.06324577331543, + "learning_rate": 1.9788311090658078e-06, + "loss": 1.1945, + "step": 21539 + }, + { + "epoch": 2.7, + "grad_norm": 89.4460220336914, + "learning_rate": 1.97799439400912e-06, + "loss": 3.2284, + "step": 21540 + }, + { + "epoch": 2.7, + "grad_norm": 5.918099403381348, + "learning_rate": 1.977157678952433e-06, + "loss": 0.346, + "step": 21541 + }, + { + "epoch": 2.7, + "grad_norm": 24.16585922241211, + "learning_rate": 1.9763209638957453e-06, + "loss": 0.6387, + "step": 21542 + }, + { + "epoch": 2.7, + "grad_norm": 34.83650588989258, + "learning_rate": 1.975484248839058e-06, + "loss": 1.1662, + "step": 21543 + }, + { + "epoch": 2.7, + "grad_norm": 30.044612884521484, + "learning_rate": 1.9746475337823705e-06, + "loss": 1.5414, + "step": 21544 + }, + { + "epoch": 2.7, + "grad_norm": 13.216431617736816, + "learning_rate": 1.9738108187256833e-06, + "loss": 1.3495, + "step": 21545 + }, + { + "epoch": 2.7, + "grad_norm": 5.502356052398682, + "learning_rate": 1.9729741036689957e-06, + "loss": 0.4508, + "step": 21546 + }, + { + "epoch": 2.7, + "grad_norm": 20.345603942871094, + "learning_rate": 1.972137388612308e-06, + "loss": 1.4439, + "step": 21547 + }, + { + "epoch": 2.7, + "grad_norm": 61.608367919921875, + "learning_rate": 1.971300673555621e-06, + "loss": 0.8592, + "step": 21548 + }, + { + "epoch": 2.7, + "grad_norm": 13.994182586669922, + "learning_rate": 1.9704639584989333e-06, + "loss": 0.5676, + "step": 21549 + }, + { + "epoch": 2.7, + "grad_norm": 8.34164047241211, + "learning_rate": 1.969627243442246e-06, + "loss": 0.3883, + "step": 21550 + }, + { + "epoch": 2.7, + "grad_norm": 19.863996505737305, + "learning_rate": 1.9687905283855584e-06, + "loss": 0.8862, + "step": 21551 + }, + { + "epoch": 2.7, + "grad_norm": 4.623724937438965, + "learning_rate": 1.9679538133288712e-06, + "loss": 0.2411, + "step": 21552 + }, + { + "epoch": 2.7, + "grad_norm": 11.14766788482666, + "learning_rate": 1.9671170982721836e-06, + "loss": 0.7617, + "step": 21553 + }, + { + "epoch": 2.7, + "grad_norm": 104.04212188720703, + "learning_rate": 1.966280383215496e-06, + "loss": 0.7672, + "step": 21554 + }, + { + "epoch": 2.71, + "grad_norm": 17.19343376159668, + "learning_rate": 1.965443668158809e-06, + "loss": 1.7867, + "step": 21555 + }, + { + "epoch": 2.71, + "grad_norm": 18.00482177734375, + "learning_rate": 1.964606953102121e-06, + "loss": 1.5705, + "step": 21556 + }, + { + "epoch": 2.71, + "grad_norm": 28.12812614440918, + "learning_rate": 1.9637702380454336e-06, + "loss": 1.6228, + "step": 21557 + }, + { + "epoch": 2.71, + "grad_norm": 15.558965682983398, + "learning_rate": 1.9629335229887464e-06, + "loss": 0.7861, + "step": 21558 + }, + { + "epoch": 2.71, + "grad_norm": 12.45241928100586, + "learning_rate": 1.962096807932059e-06, + "loss": 1.613, + "step": 21559 + }, + { + "epoch": 2.71, + "grad_norm": 7.432572841644287, + "learning_rate": 1.9612600928753715e-06, + "loss": 0.3244, + "step": 21560 + }, + { + "epoch": 2.71, + "grad_norm": 9.60072135925293, + "learning_rate": 1.960423377818684e-06, + "loss": 0.4073, + "step": 21561 + }, + { + "epoch": 2.71, + "grad_norm": 44.53190231323242, + "learning_rate": 1.9595866627619967e-06, + "loss": 2.8229, + "step": 21562 + }, + { + "epoch": 2.71, + "grad_norm": 13.338286399841309, + "learning_rate": 1.958749947705309e-06, + "loss": 1.023, + "step": 21563 + }, + { + "epoch": 2.71, + "grad_norm": 13.374138832092285, + "learning_rate": 1.9579132326486215e-06, + "loss": 1.9172, + "step": 21564 + }, + { + "epoch": 2.71, + "grad_norm": 98.44874572753906, + "learning_rate": 1.9570765175919343e-06, + "loss": 0.9126, + "step": 21565 + }, + { + "epoch": 2.71, + "grad_norm": 17.078516006469727, + "learning_rate": 1.9562398025352467e-06, + "loss": 1.1122, + "step": 21566 + }, + { + "epoch": 2.71, + "grad_norm": 21.637590408325195, + "learning_rate": 1.9554030874785595e-06, + "loss": 1.1725, + "step": 21567 + }, + { + "epoch": 2.71, + "grad_norm": 12.546353340148926, + "learning_rate": 1.954566372421872e-06, + "loss": 0.5145, + "step": 21568 + }, + { + "epoch": 2.71, + "grad_norm": 52.77216720581055, + "learning_rate": 1.9537296573651846e-06, + "loss": 0.7235, + "step": 21569 + }, + { + "epoch": 2.71, + "grad_norm": 18.392202377319336, + "learning_rate": 1.952892942308497e-06, + "loss": 0.8054, + "step": 21570 + }, + { + "epoch": 2.71, + "grad_norm": 21.224721908569336, + "learning_rate": 1.9520562272518094e-06, + "loss": 1.5617, + "step": 21571 + }, + { + "epoch": 2.71, + "grad_norm": 20.848115921020508, + "learning_rate": 1.951219512195122e-06, + "loss": 0.674, + "step": 21572 + }, + { + "epoch": 2.71, + "grad_norm": 13.857409477233887, + "learning_rate": 1.9503827971384346e-06, + "loss": 1.3511, + "step": 21573 + }, + { + "epoch": 2.71, + "grad_norm": 10.296735763549805, + "learning_rate": 1.9495460820817474e-06, + "loss": 1.0816, + "step": 21574 + }, + { + "epoch": 2.71, + "grad_norm": 13.349283218383789, + "learning_rate": 1.9487093670250598e-06, + "loss": 1.3952, + "step": 21575 + }, + { + "epoch": 2.71, + "grad_norm": 5.422610759735107, + "learning_rate": 1.9478726519683726e-06, + "loss": 0.6176, + "step": 21576 + }, + { + "epoch": 2.71, + "grad_norm": 9.50144100189209, + "learning_rate": 1.947035936911685e-06, + "loss": 0.3522, + "step": 21577 + }, + { + "epoch": 2.71, + "grad_norm": 15.972548484802246, + "learning_rate": 1.9461992218549973e-06, + "loss": 0.8747, + "step": 21578 + }, + { + "epoch": 2.71, + "grad_norm": 16.36379051208496, + "learning_rate": 1.94536250679831e-06, + "loss": 1.5866, + "step": 21579 + }, + { + "epoch": 2.71, + "grad_norm": 18.411258697509766, + "learning_rate": 1.9445257917416225e-06, + "loss": 1.6996, + "step": 21580 + }, + { + "epoch": 2.71, + "grad_norm": 6.302036285400391, + "learning_rate": 1.943689076684935e-06, + "loss": 0.6719, + "step": 21581 + }, + { + "epoch": 2.71, + "grad_norm": 6.9534406661987305, + "learning_rate": 1.9428523616282477e-06, + "loss": 0.2405, + "step": 21582 + }, + { + "epoch": 2.71, + "grad_norm": 13.45012378692627, + "learning_rate": 1.9420156465715605e-06, + "loss": 1.1696, + "step": 21583 + }, + { + "epoch": 2.71, + "grad_norm": 4.985976219177246, + "learning_rate": 1.941178931514873e-06, + "loss": 0.1027, + "step": 21584 + }, + { + "epoch": 2.71, + "grad_norm": 19.764118194580078, + "learning_rate": 1.9403422164581852e-06, + "loss": 2.5148, + "step": 21585 + }, + { + "epoch": 2.71, + "grad_norm": 15.650075912475586, + "learning_rate": 1.939505501401498e-06, + "loss": 1.9206, + "step": 21586 + }, + { + "epoch": 2.71, + "grad_norm": 7.171905994415283, + "learning_rate": 1.9386687863448104e-06, + "loss": 0.5164, + "step": 21587 + }, + { + "epoch": 2.71, + "grad_norm": 10.496377944946289, + "learning_rate": 1.9378320712881228e-06, + "loss": 1.1155, + "step": 21588 + }, + { + "epoch": 2.71, + "grad_norm": 30.06625747680664, + "learning_rate": 1.9369953562314356e-06, + "loss": 0.9575, + "step": 21589 + }, + { + "epoch": 2.71, + "grad_norm": 13.560931205749512, + "learning_rate": 1.9361586411747484e-06, + "loss": 0.7953, + "step": 21590 + }, + { + "epoch": 2.71, + "grad_norm": 21.100141525268555, + "learning_rate": 1.9353219261180608e-06, + "loss": 1.099, + "step": 21591 + }, + { + "epoch": 2.71, + "grad_norm": 11.04590129852295, + "learning_rate": 1.934485211061373e-06, + "loss": 0.6509, + "step": 21592 + }, + { + "epoch": 2.71, + "grad_norm": 33.39369583129883, + "learning_rate": 1.933648496004686e-06, + "loss": 1.4008, + "step": 21593 + }, + { + "epoch": 2.71, + "grad_norm": 4.745856285095215, + "learning_rate": 1.9328117809479983e-06, + "loss": 0.3466, + "step": 21594 + }, + { + "epoch": 2.71, + "grad_norm": 10.121007919311523, + "learning_rate": 1.9319750658913107e-06, + "loss": 0.5584, + "step": 21595 + }, + { + "epoch": 2.71, + "grad_norm": 35.45912551879883, + "learning_rate": 1.9311383508346235e-06, + "loss": 0.7605, + "step": 21596 + }, + { + "epoch": 2.71, + "grad_norm": 10.28868579864502, + "learning_rate": 1.9303016357779363e-06, + "loss": 0.5701, + "step": 21597 + }, + { + "epoch": 2.71, + "grad_norm": 34.35209274291992, + "learning_rate": 1.9294649207212483e-06, + "loss": 0.795, + "step": 21598 + }, + { + "epoch": 2.71, + "grad_norm": 33.060420989990234, + "learning_rate": 1.928628205664561e-06, + "loss": 2.3566, + "step": 21599 + }, + { + "epoch": 2.71, + "grad_norm": 91.64486694335938, + "learning_rate": 1.927791490607874e-06, + "loss": 1.8895, + "step": 21600 + }, + { + "epoch": 2.71, + "eval_loss": 0.07624088227748871, + "eval_runtime": 95.1929, + "eval_samples_per_second": 37.209, + "eval_steps_per_second": 37.209, + "step": 21600 + }, + { + "epoch": 2.71, + "grad_norm": 8.517383575439453, + "learning_rate": 1.9269547755511862e-06, + "loss": 1.8898, + "step": 21601 + }, + { + "epoch": 2.71, + "grad_norm": 6.557233810424805, + "learning_rate": 1.9261180604944986e-06, + "loss": 0.0612, + "step": 21602 + }, + { + "epoch": 2.71, + "grad_norm": 18.834272384643555, + "learning_rate": 1.9252813454378114e-06, + "loss": 1.7287, + "step": 21603 + }, + { + "epoch": 2.71, + "grad_norm": 14.700809478759766, + "learning_rate": 1.924444630381124e-06, + "loss": 0.6263, + "step": 21604 + }, + { + "epoch": 2.71, + "grad_norm": 8.74665641784668, + "learning_rate": 1.923607915324436e-06, + "loss": 0.3854, + "step": 21605 + }, + { + "epoch": 2.71, + "grad_norm": 20.223066329956055, + "learning_rate": 1.922771200267749e-06, + "loss": 1.1869, + "step": 21606 + }, + { + "epoch": 2.71, + "grad_norm": 26.129592895507812, + "learning_rate": 1.9219344852110618e-06, + "loss": 0.783, + "step": 21607 + }, + { + "epoch": 2.71, + "grad_norm": 6.939517498016357, + "learning_rate": 1.921097770154374e-06, + "loss": 0.3401, + "step": 21608 + }, + { + "epoch": 2.71, + "grad_norm": 15.920483589172363, + "learning_rate": 1.9202610550976865e-06, + "loss": 2.7957, + "step": 21609 + }, + { + "epoch": 2.71, + "grad_norm": 11.822491645812988, + "learning_rate": 1.9194243400409993e-06, + "loss": 1.9297, + "step": 21610 + }, + { + "epoch": 2.71, + "grad_norm": 8.13525104522705, + "learning_rate": 1.9185876249843117e-06, + "loss": 0.1707, + "step": 21611 + }, + { + "epoch": 2.71, + "grad_norm": 15.861075401306152, + "learning_rate": 1.917750909927624e-06, + "loss": 0.7218, + "step": 21612 + }, + { + "epoch": 2.71, + "grad_norm": 14.167207717895508, + "learning_rate": 1.916914194870937e-06, + "loss": 0.354, + "step": 21613 + }, + { + "epoch": 2.71, + "grad_norm": 7.59569787979126, + "learning_rate": 1.9160774798142497e-06, + "loss": 0.1944, + "step": 21614 + }, + { + "epoch": 2.71, + "grad_norm": 29.29405975341797, + "learning_rate": 1.915240764757562e-06, + "loss": 0.7861, + "step": 21615 + }, + { + "epoch": 2.71, + "grad_norm": 19.714954376220703, + "learning_rate": 1.9144040497008745e-06, + "loss": 0.9029, + "step": 21616 + }, + { + "epoch": 2.71, + "grad_norm": 9.297428131103516, + "learning_rate": 1.9135673346441873e-06, + "loss": 0.291, + "step": 21617 + }, + { + "epoch": 2.71, + "grad_norm": 79.9549560546875, + "learning_rate": 1.9127306195874996e-06, + "loss": 2.0887, + "step": 21618 + }, + { + "epoch": 2.71, + "grad_norm": 21.961437225341797, + "learning_rate": 1.911893904530812e-06, + "loss": 0.6198, + "step": 21619 + }, + { + "epoch": 2.71, + "grad_norm": 4.938629627227783, + "learning_rate": 1.911057189474125e-06, + "loss": 0.0518, + "step": 21620 + }, + { + "epoch": 2.71, + "grad_norm": 99.69220733642578, + "learning_rate": 1.910220474417437e-06, + "loss": 1.0809, + "step": 21621 + }, + { + "epoch": 2.71, + "grad_norm": 26.269498825073242, + "learning_rate": 1.9093837593607496e-06, + "loss": 1.1937, + "step": 21622 + }, + { + "epoch": 2.71, + "grad_norm": 14.736011505126953, + "learning_rate": 1.9085470443040624e-06, + "loss": 0.4535, + "step": 21623 + }, + { + "epoch": 2.71, + "grad_norm": 21.275299072265625, + "learning_rate": 1.907710329247375e-06, + "loss": 0.8934, + "step": 21624 + }, + { + "epoch": 2.71, + "grad_norm": 8.874500274658203, + "learning_rate": 1.9068736141906873e-06, + "loss": 0.8592, + "step": 21625 + }, + { + "epoch": 2.71, + "grad_norm": 2.27887225151062, + "learning_rate": 1.906036899134e-06, + "loss": 0.0699, + "step": 21626 + }, + { + "epoch": 2.71, + "grad_norm": 132.70150756835938, + "learning_rate": 1.9052001840773127e-06, + "loss": 1.296, + "step": 21627 + }, + { + "epoch": 2.71, + "grad_norm": 4.818041801452637, + "learning_rate": 1.9043634690206253e-06, + "loss": 0.1964, + "step": 21628 + }, + { + "epoch": 2.71, + "grad_norm": 79.94800567626953, + "learning_rate": 1.9035267539639377e-06, + "loss": 0.5249, + "step": 21629 + }, + { + "epoch": 2.71, + "grad_norm": 5.631694793701172, + "learning_rate": 1.9026900389072503e-06, + "loss": 0.2513, + "step": 21630 + }, + { + "epoch": 2.71, + "grad_norm": 11.695740699768066, + "learning_rate": 1.9018533238505629e-06, + "loss": 1.6826, + "step": 21631 + }, + { + "epoch": 2.71, + "grad_norm": 9.75593376159668, + "learning_rate": 1.9010166087938753e-06, + "loss": 0.4342, + "step": 21632 + }, + { + "epoch": 2.71, + "grad_norm": 30.43581771850586, + "learning_rate": 1.9001798937371879e-06, + "loss": 0.7867, + "step": 21633 + }, + { + "epoch": 2.72, + "grad_norm": 34.77962112426758, + "learning_rate": 1.8993431786805007e-06, + "loss": 1.3018, + "step": 21634 + }, + { + "epoch": 2.72, + "grad_norm": 5.737854480743408, + "learning_rate": 1.8985064636238132e-06, + "loss": 1.0135, + "step": 21635 + }, + { + "epoch": 2.72, + "grad_norm": 17.90479850769043, + "learning_rate": 1.8976697485671256e-06, + "loss": 1.1262, + "step": 21636 + }, + { + "epoch": 2.72, + "grad_norm": 5.772305011749268, + "learning_rate": 1.8968330335104382e-06, + "loss": 0.7722, + "step": 21637 + }, + { + "epoch": 2.72, + "grad_norm": 20.269916534423828, + "learning_rate": 1.8959963184537508e-06, + "loss": 1.4899, + "step": 21638 + }, + { + "epoch": 2.72, + "grad_norm": 7.383659362792969, + "learning_rate": 1.8951596033970632e-06, + "loss": 0.3814, + "step": 21639 + }, + { + "epoch": 2.72, + "grad_norm": 13.869362831115723, + "learning_rate": 1.8943228883403758e-06, + "loss": 1.1662, + "step": 21640 + }, + { + "epoch": 2.72, + "grad_norm": 9.787107467651367, + "learning_rate": 1.8934861732836884e-06, + "loss": 1.9305, + "step": 21641 + }, + { + "epoch": 2.72, + "grad_norm": 16.482925415039062, + "learning_rate": 1.8926494582270012e-06, + "loss": 0.3837, + "step": 21642 + }, + { + "epoch": 2.72, + "grad_norm": 25.612733840942383, + "learning_rate": 1.8918127431703133e-06, + "loss": 0.7088, + "step": 21643 + }, + { + "epoch": 2.72, + "grad_norm": 22.032100677490234, + "learning_rate": 1.8909760281136261e-06, + "loss": 2.0197, + "step": 21644 + }, + { + "epoch": 2.72, + "grad_norm": 24.02498435974121, + "learning_rate": 1.8901393130569387e-06, + "loss": 1.9074, + "step": 21645 + }, + { + "epoch": 2.72, + "grad_norm": 30.454744338989258, + "learning_rate": 1.889302598000251e-06, + "loss": 0.9754, + "step": 21646 + }, + { + "epoch": 2.72, + "grad_norm": 82.28011322021484, + "learning_rate": 1.8884658829435637e-06, + "loss": 1.1165, + "step": 21647 + }, + { + "epoch": 2.72, + "grad_norm": 3.8160178661346436, + "learning_rate": 1.8876291678868763e-06, + "loss": 0.8781, + "step": 21648 + }, + { + "epoch": 2.72, + "grad_norm": 29.909202575683594, + "learning_rate": 1.8867924528301889e-06, + "loss": 0.7232, + "step": 21649 + }, + { + "epoch": 2.72, + "grad_norm": 6.371339321136475, + "learning_rate": 1.8859557377735013e-06, + "loss": 0.2522, + "step": 21650 + }, + { + "epoch": 2.72, + "grad_norm": 20.851184844970703, + "learning_rate": 1.885119022716814e-06, + "loss": 0.6746, + "step": 21651 + }, + { + "epoch": 2.72, + "grad_norm": 35.381099700927734, + "learning_rate": 1.8842823076601266e-06, + "loss": 0.7025, + "step": 21652 + }, + { + "epoch": 2.72, + "grad_norm": 9.720460891723633, + "learning_rate": 1.883445592603439e-06, + "loss": 0.7627, + "step": 21653 + }, + { + "epoch": 2.72, + "grad_norm": 25.254837036132812, + "learning_rate": 1.8826088775467516e-06, + "loss": 0.7075, + "step": 21654 + }, + { + "epoch": 2.72, + "grad_norm": 14.657265663146973, + "learning_rate": 1.8817721624900642e-06, + "loss": 0.5678, + "step": 21655 + }, + { + "epoch": 2.72, + "grad_norm": 12.222607612609863, + "learning_rate": 1.8809354474333766e-06, + "loss": 1.016, + "step": 21656 + }, + { + "epoch": 2.72, + "grad_norm": 11.868119239807129, + "learning_rate": 1.8800987323766892e-06, + "loss": 1.553, + "step": 21657 + }, + { + "epoch": 2.72, + "grad_norm": 35.980133056640625, + "learning_rate": 1.8792620173200018e-06, + "loss": 1.4554, + "step": 21658 + }, + { + "epoch": 2.72, + "grad_norm": 9.9580659866333, + "learning_rate": 1.8784253022633146e-06, + "loss": 0.472, + "step": 21659 + }, + { + "epoch": 2.72, + "grad_norm": 8.805882453918457, + "learning_rate": 1.877588587206627e-06, + "loss": 0.2537, + "step": 21660 + }, + { + "epoch": 2.72, + "grad_norm": 10.853120803833008, + "learning_rate": 1.8767518721499395e-06, + "loss": 0.534, + "step": 21661 + }, + { + "epoch": 2.72, + "grad_norm": 14.03399658203125, + "learning_rate": 1.8759151570932521e-06, + "loss": 1.2223, + "step": 21662 + }, + { + "epoch": 2.72, + "grad_norm": 7.437746047973633, + "learning_rate": 1.8750784420365645e-06, + "loss": 0.3798, + "step": 21663 + }, + { + "epoch": 2.72, + "grad_norm": 11.152474403381348, + "learning_rate": 1.874241726979877e-06, + "loss": 0.5058, + "step": 21664 + }, + { + "epoch": 2.72, + "grad_norm": 14.955697059631348, + "learning_rate": 1.8734050119231897e-06, + "loss": 1.0318, + "step": 21665 + }, + { + "epoch": 2.72, + "grad_norm": 19.80023765563965, + "learning_rate": 1.8725682968665025e-06, + "loss": 0.8431, + "step": 21666 + }, + { + "epoch": 2.72, + "grad_norm": 24.065574645996094, + "learning_rate": 1.8717315818098146e-06, + "loss": 0.8752, + "step": 21667 + }, + { + "epoch": 2.72, + "grad_norm": 21.037424087524414, + "learning_rate": 1.8708948667531274e-06, + "loss": 1.5721, + "step": 21668 + }, + { + "epoch": 2.72, + "grad_norm": 7.22127628326416, + "learning_rate": 1.87005815169644e-06, + "loss": 0.6915, + "step": 21669 + }, + { + "epoch": 2.72, + "grad_norm": 30.11231803894043, + "learning_rate": 1.8692214366397524e-06, + "loss": 1.088, + "step": 21670 + }, + { + "epoch": 2.72, + "grad_norm": 11.352420806884766, + "learning_rate": 1.868384721583065e-06, + "loss": 0.6094, + "step": 21671 + }, + { + "epoch": 2.72, + "grad_norm": 12.00910758972168, + "learning_rate": 1.8675480065263776e-06, + "loss": 0.8504, + "step": 21672 + }, + { + "epoch": 2.72, + "grad_norm": 13.739583015441895, + "learning_rate": 1.8667112914696902e-06, + "loss": 1.2051, + "step": 21673 + }, + { + "epoch": 2.72, + "grad_norm": 7.9926676750183105, + "learning_rate": 1.8658745764130026e-06, + "loss": 0.7858, + "step": 21674 + }, + { + "epoch": 2.72, + "grad_norm": 20.544986724853516, + "learning_rate": 1.8650378613563152e-06, + "loss": 1.1847, + "step": 21675 + }, + { + "epoch": 2.72, + "grad_norm": 19.822420120239258, + "learning_rate": 1.864201146299628e-06, + "loss": 0.983, + "step": 21676 + }, + { + "epoch": 2.72, + "grad_norm": 8.64639663696289, + "learning_rate": 1.8633644312429403e-06, + "loss": 0.7687, + "step": 21677 + }, + { + "epoch": 2.72, + "grad_norm": 10.299410820007324, + "learning_rate": 1.862527716186253e-06, + "loss": 0.4984, + "step": 21678 + }, + { + "epoch": 2.72, + "grad_norm": 9.877063751220703, + "learning_rate": 1.8616910011295655e-06, + "loss": 0.4443, + "step": 21679 + }, + { + "epoch": 2.72, + "grad_norm": 6.195727348327637, + "learning_rate": 1.8608542860728781e-06, + "loss": 0.7731, + "step": 21680 + }, + { + "epoch": 2.72, + "grad_norm": 25.56712532043457, + "learning_rate": 1.8600175710161905e-06, + "loss": 1.5167, + "step": 21681 + }, + { + "epoch": 2.72, + "grad_norm": 13.832436561584473, + "learning_rate": 1.859180855959503e-06, + "loss": 1.0589, + "step": 21682 + }, + { + "epoch": 2.72, + "grad_norm": 21.81914710998535, + "learning_rate": 1.8583441409028159e-06, + "loss": 0.9787, + "step": 21683 + }, + { + "epoch": 2.72, + "grad_norm": 11.977666854858398, + "learning_rate": 1.857507425846128e-06, + "loss": 0.8455, + "step": 21684 + }, + { + "epoch": 2.72, + "grad_norm": 28.767717361450195, + "learning_rate": 1.8566707107894408e-06, + "loss": 1.0714, + "step": 21685 + }, + { + "epoch": 2.72, + "grad_norm": 40.01023483276367, + "learning_rate": 1.8558339957327534e-06, + "loss": 0.736, + "step": 21686 + }, + { + "epoch": 2.72, + "grad_norm": 30.824186325073242, + "learning_rate": 1.854997280676066e-06, + "loss": 2.5069, + "step": 21687 + }, + { + "epoch": 2.72, + "grad_norm": 18.87152862548828, + "learning_rate": 1.8541605656193784e-06, + "loss": 0.9405, + "step": 21688 + }, + { + "epoch": 2.72, + "grad_norm": 35.233551025390625, + "learning_rate": 1.853323850562691e-06, + "loss": 1.3706, + "step": 21689 + }, + { + "epoch": 2.72, + "grad_norm": 48.823116302490234, + "learning_rate": 1.8524871355060036e-06, + "loss": 1.7343, + "step": 21690 + }, + { + "epoch": 2.72, + "grad_norm": 12.798388481140137, + "learning_rate": 1.851650420449316e-06, + "loss": 0.9201, + "step": 21691 + }, + { + "epoch": 2.72, + "grad_norm": 11.396600723266602, + "learning_rate": 1.8508137053926288e-06, + "loss": 0.4124, + "step": 21692 + }, + { + "epoch": 2.72, + "grad_norm": 7.024505138397217, + "learning_rate": 1.8499769903359414e-06, + "loss": 0.354, + "step": 21693 + }, + { + "epoch": 2.72, + "grad_norm": 7.042710304260254, + "learning_rate": 1.849140275279254e-06, + "loss": 0.1709, + "step": 21694 + }, + { + "epoch": 2.72, + "grad_norm": 37.1627082824707, + "learning_rate": 1.8483035602225663e-06, + "loss": 1.0533, + "step": 21695 + }, + { + "epoch": 2.72, + "grad_norm": 19.921981811523438, + "learning_rate": 1.847466845165879e-06, + "loss": 0.8625, + "step": 21696 + }, + { + "epoch": 2.72, + "grad_norm": 27.523574829101562, + "learning_rate": 1.8466301301091915e-06, + "loss": 2.2229, + "step": 21697 + }, + { + "epoch": 2.72, + "grad_norm": 24.74726676940918, + "learning_rate": 1.8457934150525039e-06, + "loss": 0.5465, + "step": 21698 + }, + { + "epoch": 2.72, + "grad_norm": 36.155052185058594, + "learning_rate": 1.8449566999958165e-06, + "loss": 0.1337, + "step": 21699 + }, + { + "epoch": 2.72, + "grad_norm": 5.531887054443359, + "learning_rate": 1.8441199849391293e-06, + "loss": 0.5163, + "step": 21700 + }, + { + "epoch": 2.72, + "grad_norm": 7.0997138023376465, + "learning_rate": 1.8432832698824416e-06, + "loss": 0.2779, + "step": 21701 + }, + { + "epoch": 2.72, + "grad_norm": 13.166228294372559, + "learning_rate": 1.8424465548257542e-06, + "loss": 0.447, + "step": 21702 + }, + { + "epoch": 2.72, + "grad_norm": 14.30151653289795, + "learning_rate": 1.8416098397690668e-06, + "loss": 0.8173, + "step": 21703 + }, + { + "epoch": 2.72, + "grad_norm": 18.93376350402832, + "learning_rate": 1.8407731247123794e-06, + "loss": 1.1971, + "step": 21704 + }, + { + "epoch": 2.72, + "grad_norm": 5.382608890533447, + "learning_rate": 1.8399364096556918e-06, + "loss": 0.3352, + "step": 21705 + }, + { + "epoch": 2.72, + "grad_norm": 19.910825729370117, + "learning_rate": 1.8390996945990044e-06, + "loss": 0.8676, + "step": 21706 + }, + { + "epoch": 2.72, + "grad_norm": 63.348533630371094, + "learning_rate": 1.838262979542317e-06, + "loss": 2.9701, + "step": 21707 + }, + { + "epoch": 2.72, + "grad_norm": 15.751937866210938, + "learning_rate": 1.8374262644856294e-06, + "loss": 0.9916, + "step": 21708 + }, + { + "epoch": 2.72, + "grad_norm": 8.19721508026123, + "learning_rate": 1.8365895494289422e-06, + "loss": 0.5512, + "step": 21709 + }, + { + "epoch": 2.72, + "grad_norm": 66.76535034179688, + "learning_rate": 1.8357528343722547e-06, + "loss": 1.0022, + "step": 21710 + }, + { + "epoch": 2.72, + "grad_norm": 5.930474281311035, + "learning_rate": 1.8349161193155673e-06, + "loss": 0.1584, + "step": 21711 + }, + { + "epoch": 2.72, + "grad_norm": 36.89176559448242, + "learning_rate": 1.8340794042588797e-06, + "loss": 3.0869, + "step": 21712 + }, + { + "epoch": 2.72, + "grad_norm": 33.027488708496094, + "learning_rate": 1.8332426892021923e-06, + "loss": 1.591, + "step": 21713 + }, + { + "epoch": 2.73, + "grad_norm": 12.010252952575684, + "learning_rate": 1.832405974145505e-06, + "loss": 2.052, + "step": 21714 + }, + { + "epoch": 2.73, + "grad_norm": 8.362545013427734, + "learning_rate": 1.8315692590888173e-06, + "loss": 1.0886, + "step": 21715 + }, + { + "epoch": 2.73, + "grad_norm": 11.814370155334473, + "learning_rate": 1.8307325440321299e-06, + "loss": 0.4781, + "step": 21716 + }, + { + "epoch": 2.73, + "grad_norm": 5.486781120300293, + "learning_rate": 1.8298958289754427e-06, + "loss": 0.3103, + "step": 21717 + }, + { + "epoch": 2.73, + "grad_norm": 50.36022186279297, + "learning_rate": 1.8290591139187553e-06, + "loss": 2.5289, + "step": 21718 + }, + { + "epoch": 2.73, + "grad_norm": 11.407174110412598, + "learning_rate": 1.8282223988620676e-06, + "loss": 0.5349, + "step": 21719 + }, + { + "epoch": 2.73, + "grad_norm": 25.968992233276367, + "learning_rate": 1.8273856838053802e-06, + "loss": 1.0201, + "step": 21720 + }, + { + "epoch": 2.73, + "grad_norm": 11.296031951904297, + "learning_rate": 1.8265489687486928e-06, + "loss": 0.8723, + "step": 21721 + }, + { + "epoch": 2.73, + "grad_norm": 65.60367584228516, + "learning_rate": 1.8257122536920052e-06, + "loss": 1.6013, + "step": 21722 + }, + { + "epoch": 2.73, + "grad_norm": 43.265079498291016, + "learning_rate": 1.8248755386353178e-06, + "loss": 1.2649, + "step": 21723 + }, + { + "epoch": 2.73, + "grad_norm": 37.025516510009766, + "learning_rate": 1.8240388235786306e-06, + "loss": 0.2077, + "step": 21724 + }, + { + "epoch": 2.73, + "grad_norm": 78.99784851074219, + "learning_rate": 1.8232021085219432e-06, + "loss": 1.0799, + "step": 21725 + }, + { + "epoch": 2.73, + "grad_norm": 18.098739624023438, + "learning_rate": 1.8223653934652556e-06, + "loss": 1.2167, + "step": 21726 + }, + { + "epoch": 2.73, + "grad_norm": 15.547638893127441, + "learning_rate": 1.8215286784085681e-06, + "loss": 1.6061, + "step": 21727 + }, + { + "epoch": 2.73, + "grad_norm": 9.266261100769043, + "learning_rate": 1.8206919633518807e-06, + "loss": 1.6266, + "step": 21728 + }, + { + "epoch": 2.73, + "grad_norm": 8.35934066772461, + "learning_rate": 1.8198552482951931e-06, + "loss": 2.1372, + "step": 21729 + }, + { + "epoch": 2.73, + "grad_norm": 16.34247398376465, + "learning_rate": 1.8190185332385057e-06, + "loss": 0.9927, + "step": 21730 + }, + { + "epoch": 2.73, + "grad_norm": 15.102217674255371, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.725, + "step": 21731 + }, + { + "epoch": 2.73, + "grad_norm": 61.07167053222656, + "learning_rate": 1.817345103125131e-06, + "loss": 1.3133, + "step": 21732 + }, + { + "epoch": 2.73, + "grad_norm": 2.809201240539551, + "learning_rate": 1.8165083880684435e-06, + "loss": 0.1029, + "step": 21733 + }, + { + "epoch": 2.73, + "grad_norm": 20.1793212890625, + "learning_rate": 1.815671673011756e-06, + "loss": 1.6741, + "step": 21734 + }, + { + "epoch": 2.73, + "grad_norm": 9.742021560668945, + "learning_rate": 1.8148349579550687e-06, + "loss": 0.3113, + "step": 21735 + }, + { + "epoch": 2.73, + "grad_norm": 20.455286026000977, + "learning_rate": 1.813998242898381e-06, + "loss": 1.5275, + "step": 21736 + }, + { + "epoch": 2.73, + "grad_norm": 13.391501426696777, + "learning_rate": 1.8131615278416936e-06, + "loss": 0.6146, + "step": 21737 + }, + { + "epoch": 2.73, + "grad_norm": 7.942105293273926, + "learning_rate": 1.8123248127850062e-06, + "loss": 0.6434, + "step": 21738 + }, + { + "epoch": 2.73, + "grad_norm": 49.41905975341797, + "learning_rate": 1.8114880977283188e-06, + "loss": 1.6082, + "step": 21739 + }, + { + "epoch": 2.73, + "grad_norm": 11.774738311767578, + "learning_rate": 1.8106513826716312e-06, + "loss": 0.5761, + "step": 21740 + }, + { + "epoch": 2.73, + "grad_norm": 8.68564510345459, + "learning_rate": 1.809814667614944e-06, + "loss": 1.0904, + "step": 21741 + }, + { + "epoch": 2.73, + "grad_norm": 8.028971672058105, + "learning_rate": 1.8089779525582566e-06, + "loss": 0.2022, + "step": 21742 + }, + { + "epoch": 2.73, + "grad_norm": 10.16524600982666, + "learning_rate": 1.808141237501569e-06, + "loss": 0.6065, + "step": 21743 + }, + { + "epoch": 2.73, + "grad_norm": 7.454187393188477, + "learning_rate": 1.8073045224448815e-06, + "loss": 0.3915, + "step": 21744 + }, + { + "epoch": 2.73, + "grad_norm": 11.614712715148926, + "learning_rate": 1.8064678073881941e-06, + "loss": 1.6263, + "step": 21745 + }, + { + "epoch": 2.73, + "grad_norm": 1.0575791597366333, + "learning_rate": 1.8056310923315065e-06, + "loss": 0.0253, + "step": 21746 + }, + { + "epoch": 2.73, + "grad_norm": 5.01806640625, + "learning_rate": 1.804794377274819e-06, + "loss": 0.6434, + "step": 21747 + }, + { + "epoch": 2.73, + "grad_norm": 0.5121804475784302, + "learning_rate": 1.8039576622181317e-06, + "loss": 0.0145, + "step": 21748 + }, + { + "epoch": 2.73, + "grad_norm": 36.74553298950195, + "learning_rate": 1.8031209471614445e-06, + "loss": 1.5289, + "step": 21749 + }, + { + "epoch": 2.73, + "grad_norm": 13.009232521057129, + "learning_rate": 1.8022842321047569e-06, + "loss": 0.5034, + "step": 21750 + }, + { + "epoch": 2.73, + "grad_norm": 4.547818660736084, + "learning_rate": 1.8014475170480695e-06, + "loss": 0.5641, + "step": 21751 + }, + { + "epoch": 2.73, + "grad_norm": 31.20549201965332, + "learning_rate": 1.800610801991382e-06, + "loss": 1.0199, + "step": 21752 + }, + { + "epoch": 2.73, + "grad_norm": 8.372740745544434, + "learning_rate": 1.7997740869346944e-06, + "loss": 0.7164, + "step": 21753 + }, + { + "epoch": 2.73, + "grad_norm": 7.5386834144592285, + "learning_rate": 1.798937371878007e-06, + "loss": 1.4045, + "step": 21754 + }, + { + "epoch": 2.73, + "grad_norm": 9.312443733215332, + "learning_rate": 1.7981006568213196e-06, + "loss": 0.3174, + "step": 21755 + }, + { + "epoch": 2.73, + "grad_norm": 13.363283157348633, + "learning_rate": 1.7972639417646324e-06, + "loss": 0.9533, + "step": 21756 + }, + { + "epoch": 2.73, + "grad_norm": 9.282561302185059, + "learning_rate": 1.7964272267079446e-06, + "loss": 1.0512, + "step": 21757 + }, + { + "epoch": 2.73, + "grad_norm": 12.2020845413208, + "learning_rate": 1.7955905116512574e-06, + "loss": 0.7064, + "step": 21758 + }, + { + "epoch": 2.73, + "grad_norm": 9.255086898803711, + "learning_rate": 1.79475379659457e-06, + "loss": 0.9093, + "step": 21759 + }, + { + "epoch": 2.73, + "grad_norm": 22.05156135559082, + "learning_rate": 1.7939170815378823e-06, + "loss": 0.8288, + "step": 21760 + }, + { + "epoch": 2.73, + "grad_norm": 15.910368919372559, + "learning_rate": 1.793080366481195e-06, + "loss": 0.5122, + "step": 21761 + }, + { + "epoch": 2.73, + "grad_norm": 17.45794105529785, + "learning_rate": 1.7922436514245075e-06, + "loss": 1.8548, + "step": 21762 + }, + { + "epoch": 2.73, + "grad_norm": 9.76082706451416, + "learning_rate": 1.7914069363678201e-06, + "loss": 1.6464, + "step": 21763 + }, + { + "epoch": 2.73, + "grad_norm": 36.96340560913086, + "learning_rate": 1.7905702213111325e-06, + "loss": 1.2681, + "step": 21764 + }, + { + "epoch": 2.73, + "grad_norm": 12.535592079162598, + "learning_rate": 1.7897335062544453e-06, + "loss": 1.5499, + "step": 21765 + }, + { + "epoch": 2.73, + "grad_norm": 157.78836059570312, + "learning_rate": 1.7888967911977579e-06, + "loss": 1.6722, + "step": 21766 + }, + { + "epoch": 2.73, + "grad_norm": 17.902124404907227, + "learning_rate": 1.7880600761410703e-06, + "loss": 0.9337, + "step": 21767 + }, + { + "epoch": 2.73, + "grad_norm": 13.388311386108398, + "learning_rate": 1.7872233610843829e-06, + "loss": 0.7366, + "step": 21768 + }, + { + "epoch": 2.73, + "grad_norm": 36.524497985839844, + "learning_rate": 1.7863866460276954e-06, + "loss": 1.4949, + "step": 21769 + }, + { + "epoch": 2.73, + "grad_norm": 26.989593505859375, + "learning_rate": 1.785549930971008e-06, + "loss": 0.8982, + "step": 21770 + }, + { + "epoch": 2.73, + "grad_norm": 13.714203834533691, + "learning_rate": 1.7847132159143204e-06, + "loss": 0.5994, + "step": 21771 + }, + { + "epoch": 2.73, + "grad_norm": 4.962925910949707, + "learning_rate": 1.783876500857633e-06, + "loss": 0.1842, + "step": 21772 + }, + { + "epoch": 2.73, + "grad_norm": 28.30582046508789, + "learning_rate": 1.7830397858009458e-06, + "loss": 2.5369, + "step": 21773 + }, + { + "epoch": 2.73, + "grad_norm": 7.63728666305542, + "learning_rate": 1.7822030707442582e-06, + "loss": 0.6249, + "step": 21774 + }, + { + "epoch": 2.73, + "grad_norm": 51.46284103393555, + "learning_rate": 1.7813663556875708e-06, + "loss": 1.8673, + "step": 21775 + }, + { + "epoch": 2.73, + "grad_norm": 32.90729904174805, + "learning_rate": 1.7805296406308834e-06, + "loss": 2.6009, + "step": 21776 + }, + { + "epoch": 2.73, + "grad_norm": 10.800202369689941, + "learning_rate": 1.779692925574196e-06, + "loss": 0.64, + "step": 21777 + }, + { + "epoch": 2.73, + "grad_norm": 6.670138359069824, + "learning_rate": 1.7788562105175083e-06, + "loss": 0.3723, + "step": 21778 + }, + { + "epoch": 2.73, + "grad_norm": 150.59474182128906, + "learning_rate": 1.778019495460821e-06, + "loss": 3.0521, + "step": 21779 + }, + { + "epoch": 2.73, + "grad_norm": 3.466110944747925, + "learning_rate": 1.7771827804041335e-06, + "loss": 0.205, + "step": 21780 + }, + { + "epoch": 2.73, + "grad_norm": 33.088748931884766, + "learning_rate": 1.7763460653474459e-06, + "loss": 0.7049, + "step": 21781 + }, + { + "epoch": 2.73, + "grad_norm": 27.63764190673828, + "learning_rate": 1.7755093502907587e-06, + "loss": 0.6086, + "step": 21782 + }, + { + "epoch": 2.73, + "grad_norm": 17.60592269897461, + "learning_rate": 1.7746726352340713e-06, + "loss": 1.0391, + "step": 21783 + }, + { + "epoch": 2.73, + "grad_norm": 33.07358169555664, + "learning_rate": 1.7738359201773839e-06, + "loss": 2.4924, + "step": 21784 + }, + { + "epoch": 2.73, + "grad_norm": 4.072381019592285, + "learning_rate": 1.7729992051206962e-06, + "loss": 0.0847, + "step": 21785 + }, + { + "epoch": 2.73, + "grad_norm": 16.366477966308594, + "learning_rate": 1.7721624900640088e-06, + "loss": 1.4918, + "step": 21786 + }, + { + "epoch": 2.73, + "grad_norm": 13.376606941223145, + "learning_rate": 1.7713257750073214e-06, + "loss": 1.6522, + "step": 21787 + }, + { + "epoch": 2.73, + "grad_norm": 9.831418991088867, + "learning_rate": 1.7704890599506338e-06, + "loss": 0.5395, + "step": 21788 + }, + { + "epoch": 2.73, + "grad_norm": 15.025633811950684, + "learning_rate": 1.7696523448939464e-06, + "loss": 1.3841, + "step": 21789 + }, + { + "epoch": 2.73, + "grad_norm": 10.509660720825195, + "learning_rate": 1.7688156298372592e-06, + "loss": 0.1849, + "step": 21790 + }, + { + "epoch": 2.73, + "grad_norm": 9.489770889282227, + "learning_rate": 1.7679789147805716e-06, + "loss": 0.4074, + "step": 21791 + }, + { + "epoch": 2.73, + "grad_norm": 5.313810348510742, + "learning_rate": 1.7671421997238842e-06, + "loss": 0.5014, + "step": 21792 + }, + { + "epoch": 2.73, + "grad_norm": 19.57631492614746, + "learning_rate": 1.7663054846671968e-06, + "loss": 0.7097, + "step": 21793 + }, + { + "epoch": 2.74, + "grad_norm": 28.672203063964844, + "learning_rate": 1.7654687696105093e-06, + "loss": 2.6373, + "step": 21794 + }, + { + "epoch": 2.74, + "grad_norm": 17.289979934692383, + "learning_rate": 1.7646320545538217e-06, + "loss": 1.1245, + "step": 21795 + }, + { + "epoch": 2.74, + "grad_norm": 39.91363525390625, + "learning_rate": 1.7637953394971343e-06, + "loss": 1.1635, + "step": 21796 + }, + { + "epoch": 2.74, + "grad_norm": 26.618194580078125, + "learning_rate": 1.7629586244404471e-06, + "loss": 1.1167, + "step": 21797 + }, + { + "epoch": 2.74, + "grad_norm": 302.7001953125, + "learning_rate": 1.7621219093837593e-06, + "loss": 1.2704, + "step": 21798 + }, + { + "epoch": 2.74, + "grad_norm": 4.354804992675781, + "learning_rate": 1.761285194327072e-06, + "loss": 0.2161, + "step": 21799 + }, + { + "epoch": 2.74, + "grad_norm": 33.09197998046875, + "learning_rate": 1.7604484792703847e-06, + "loss": 1.3867, + "step": 21800 + }, + { + "epoch": 2.74, + "grad_norm": 12.664949417114258, + "learning_rate": 1.7596117642136973e-06, + "loss": 0.5833, + "step": 21801 + }, + { + "epoch": 2.74, + "grad_norm": 29.650514602661133, + "learning_rate": 1.7587750491570096e-06, + "loss": 1.6012, + "step": 21802 + }, + { + "epoch": 2.74, + "grad_norm": 20.913442611694336, + "learning_rate": 1.7579383341003222e-06, + "loss": 0.822, + "step": 21803 + }, + { + "epoch": 2.74, + "grad_norm": 12.550580024719238, + "learning_rate": 1.7571016190436348e-06, + "loss": 0.2997, + "step": 21804 + }, + { + "epoch": 2.74, + "grad_norm": 5.505816459655762, + "learning_rate": 1.7562649039869472e-06, + "loss": 0.9827, + "step": 21805 + }, + { + "epoch": 2.74, + "grad_norm": 15.06021785736084, + "learning_rate": 1.75542818893026e-06, + "loss": 0.6605, + "step": 21806 + }, + { + "epoch": 2.74, + "grad_norm": 10.036102294921875, + "learning_rate": 1.7545914738735726e-06, + "loss": 0.3061, + "step": 21807 + }, + { + "epoch": 2.74, + "grad_norm": 25.392353057861328, + "learning_rate": 1.7537547588168852e-06, + "loss": 0.874, + "step": 21808 + }, + { + "epoch": 2.74, + "grad_norm": 42.024314880371094, + "learning_rate": 1.7529180437601976e-06, + "loss": 0.5915, + "step": 21809 + }, + { + "epoch": 2.74, + "grad_norm": 14.7062349319458, + "learning_rate": 1.7520813287035102e-06, + "loss": 1.5098, + "step": 21810 + }, + { + "epoch": 2.74, + "grad_norm": 20.205429077148438, + "learning_rate": 1.7512446136468227e-06, + "loss": 0.5976, + "step": 21811 + }, + { + "epoch": 2.74, + "grad_norm": 14.482434272766113, + "learning_rate": 1.7504078985901351e-06, + "loss": 0.8241, + "step": 21812 + }, + { + "epoch": 2.74, + "grad_norm": 4.756473541259766, + "learning_rate": 1.7495711835334477e-06, + "loss": 0.2505, + "step": 21813 + }, + { + "epoch": 2.74, + "grad_norm": 26.58550262451172, + "learning_rate": 1.7487344684767605e-06, + "loss": 2.0213, + "step": 21814 + }, + { + "epoch": 2.74, + "grad_norm": 15.44678783416748, + "learning_rate": 1.747897753420073e-06, + "loss": 0.9022, + "step": 21815 + }, + { + "epoch": 2.74, + "grad_norm": 66.37962341308594, + "learning_rate": 1.7470610383633855e-06, + "loss": 1.9281, + "step": 21816 + }, + { + "epoch": 2.74, + "grad_norm": 22.037757873535156, + "learning_rate": 1.746224323306698e-06, + "loss": 1.2474, + "step": 21817 + }, + { + "epoch": 2.74, + "grad_norm": 85.15798950195312, + "learning_rate": 1.7453876082500107e-06, + "loss": 2.9829, + "step": 21818 + }, + { + "epoch": 2.74, + "grad_norm": 17.27855682373047, + "learning_rate": 1.744550893193323e-06, + "loss": 0.7159, + "step": 21819 + }, + { + "epoch": 2.74, + "grad_norm": 21.975770950317383, + "learning_rate": 1.7437141781366356e-06, + "loss": 0.9852, + "step": 21820 + }, + { + "epoch": 2.74, + "grad_norm": 5.015631198883057, + "learning_rate": 1.7428774630799482e-06, + "loss": 0.7902, + "step": 21821 + }, + { + "epoch": 2.74, + "grad_norm": 19.04303741455078, + "learning_rate": 1.742040748023261e-06, + "loss": 0.6132, + "step": 21822 + }, + { + "epoch": 2.74, + "grad_norm": 14.610198974609375, + "learning_rate": 1.7412040329665734e-06, + "loss": 0.2462, + "step": 21823 + }, + { + "epoch": 2.74, + "grad_norm": 9.195034980773926, + "learning_rate": 1.740367317909886e-06, + "loss": 0.6449, + "step": 21824 + }, + { + "epoch": 2.74, + "grad_norm": 10.353257179260254, + "learning_rate": 1.7395306028531986e-06, + "loss": 0.4132, + "step": 21825 + }, + { + "epoch": 2.74, + "grad_norm": 16.202743530273438, + "learning_rate": 1.738693887796511e-06, + "loss": 0.6932, + "step": 21826 + }, + { + "epoch": 2.74, + "grad_norm": 12.876373291015625, + "learning_rate": 1.7378571727398235e-06, + "loss": 1.0852, + "step": 21827 + }, + { + "epoch": 2.74, + "grad_norm": 68.7238540649414, + "learning_rate": 1.7370204576831361e-06, + "loss": 1.1054, + "step": 21828 + }, + { + "epoch": 2.74, + "grad_norm": 13.575844764709473, + "learning_rate": 1.736183742626449e-06, + "loss": 1.2549, + "step": 21829 + }, + { + "epoch": 2.74, + "grad_norm": 29.486371994018555, + "learning_rate": 1.7353470275697611e-06, + "loss": 0.5472, + "step": 21830 + }, + { + "epoch": 2.74, + "grad_norm": 14.410101890563965, + "learning_rate": 1.734510312513074e-06, + "loss": 0.4578, + "step": 21831 + }, + { + "epoch": 2.74, + "grad_norm": 2.11220645904541, + "learning_rate": 1.7336735974563865e-06, + "loss": 0.0596, + "step": 21832 + }, + { + "epoch": 2.74, + "grad_norm": 10.411478996276855, + "learning_rate": 1.7328368823996989e-06, + "loss": 0.8638, + "step": 21833 + }, + { + "epoch": 2.74, + "grad_norm": 17.41630744934082, + "learning_rate": 1.7320001673430115e-06, + "loss": 0.7133, + "step": 21834 + }, + { + "epoch": 2.74, + "grad_norm": 7.677325248718262, + "learning_rate": 1.731163452286324e-06, + "loss": 0.7754, + "step": 21835 + }, + { + "epoch": 2.74, + "grad_norm": 9.375128746032715, + "learning_rate": 1.7303267372296364e-06, + "loss": 0.6212, + "step": 21836 + }, + { + "epoch": 2.74, + "grad_norm": 13.899431228637695, + "learning_rate": 1.729490022172949e-06, + "loss": 1.0943, + "step": 21837 + }, + { + "epoch": 2.74, + "grad_norm": 9.955965042114258, + "learning_rate": 1.7286533071162618e-06, + "loss": 0.1808, + "step": 21838 + }, + { + "epoch": 2.74, + "grad_norm": 21.23792266845703, + "learning_rate": 1.7278165920595744e-06, + "loss": 0.8996, + "step": 21839 + }, + { + "epoch": 2.74, + "grad_norm": 18.382326126098633, + "learning_rate": 1.7269798770028868e-06, + "loss": 0.8314, + "step": 21840 + }, + { + "epoch": 2.74, + "grad_norm": 18.497270584106445, + "learning_rate": 1.7261431619461994e-06, + "loss": 0.7368, + "step": 21841 + }, + { + "epoch": 2.74, + "grad_norm": 13.710346221923828, + "learning_rate": 1.725306446889512e-06, + "loss": 0.2386, + "step": 21842 + }, + { + "epoch": 2.74, + "grad_norm": 10.088309288024902, + "learning_rate": 1.7244697318328244e-06, + "loss": 0.6079, + "step": 21843 + }, + { + "epoch": 2.74, + "grad_norm": 13.157638549804688, + "learning_rate": 1.723633016776137e-06, + "loss": 0.5675, + "step": 21844 + }, + { + "epoch": 2.74, + "grad_norm": 15.954193115234375, + "learning_rate": 1.7227963017194495e-06, + "loss": 0.7772, + "step": 21845 + }, + { + "epoch": 2.74, + "grad_norm": 13.904277801513672, + "learning_rate": 1.7219595866627623e-06, + "loss": 0.7493, + "step": 21846 + }, + { + "epoch": 2.74, + "grad_norm": 33.891056060791016, + "learning_rate": 1.7211228716060747e-06, + "loss": 1.3535, + "step": 21847 + }, + { + "epoch": 2.74, + "grad_norm": 9.177570343017578, + "learning_rate": 1.7202861565493873e-06, + "loss": 0.7781, + "step": 21848 + }, + { + "epoch": 2.74, + "grad_norm": 7.171292781829834, + "learning_rate": 1.7194494414926999e-06, + "loss": 0.184, + "step": 21849 + }, + { + "epoch": 2.74, + "grad_norm": 17.63407325744629, + "learning_rate": 1.7186127264360123e-06, + "loss": 1.3083, + "step": 21850 + }, + { + "epoch": 2.74, + "grad_norm": 34.11215591430664, + "learning_rate": 1.7177760113793249e-06, + "loss": 2.8637, + "step": 21851 + }, + { + "epoch": 2.74, + "grad_norm": 19.464448928833008, + "learning_rate": 1.7169392963226375e-06, + "loss": 1.291, + "step": 21852 + }, + { + "epoch": 2.74, + "grad_norm": 17.207942962646484, + "learning_rate": 1.71610258126595e-06, + "loss": 0.7999, + "step": 21853 + }, + { + "epoch": 2.74, + "grad_norm": 9.438970565795898, + "learning_rate": 1.7152658662092624e-06, + "loss": 0.5002, + "step": 21854 + }, + { + "epoch": 2.74, + "grad_norm": 14.350874900817871, + "learning_rate": 1.7144291511525752e-06, + "loss": 0.6589, + "step": 21855 + }, + { + "epoch": 2.74, + "grad_norm": 4.654712200164795, + "learning_rate": 1.7135924360958878e-06, + "loss": 0.9977, + "step": 21856 + }, + { + "epoch": 2.74, + "grad_norm": 106.78719329833984, + "learning_rate": 1.7127557210392002e-06, + "loss": 0.518, + "step": 21857 + }, + { + "epoch": 2.74, + "grad_norm": 19.357154846191406, + "learning_rate": 1.7119190059825128e-06, + "loss": 0.889, + "step": 21858 + }, + { + "epoch": 2.74, + "grad_norm": 4.225431442260742, + "learning_rate": 1.7110822909258254e-06, + "loss": 0.2714, + "step": 21859 + }, + { + "epoch": 2.74, + "grad_norm": 15.32987117767334, + "learning_rate": 1.710245575869138e-06, + "loss": 1.762, + "step": 21860 + }, + { + "epoch": 2.74, + "grad_norm": 96.23703002929688, + "learning_rate": 1.7094088608124503e-06, + "loss": 1.4382, + "step": 21861 + }, + { + "epoch": 2.74, + "grad_norm": 180.13279724121094, + "learning_rate": 1.708572145755763e-06, + "loss": 1.5237, + "step": 21862 + }, + { + "epoch": 2.74, + "grad_norm": 70.8868408203125, + "learning_rate": 1.7077354306990757e-06, + "loss": 1.6646, + "step": 21863 + }, + { + "epoch": 2.74, + "grad_norm": 25.734073638916016, + "learning_rate": 1.7068987156423881e-06, + "loss": 0.5257, + "step": 21864 + }, + { + "epoch": 2.74, + "grad_norm": 17.142911911010742, + "learning_rate": 1.7060620005857007e-06, + "loss": 2.0709, + "step": 21865 + }, + { + "epoch": 2.74, + "grad_norm": 28.677814483642578, + "learning_rate": 1.7052252855290133e-06, + "loss": 1.3275, + "step": 21866 + }, + { + "epoch": 2.74, + "grad_norm": 10.25939655303955, + "learning_rate": 1.7043885704723259e-06, + "loss": 1.1485, + "step": 21867 + }, + { + "epoch": 2.74, + "grad_norm": 22.28053092956543, + "learning_rate": 1.7035518554156383e-06, + "loss": 1.6201, + "step": 21868 + }, + { + "epoch": 2.74, + "grad_norm": 9.923303604125977, + "learning_rate": 1.7027151403589508e-06, + "loss": 0.705, + "step": 21869 + }, + { + "epoch": 2.74, + "grad_norm": 10.079099655151367, + "learning_rate": 1.7018784253022636e-06, + "loss": 2.1897, + "step": 21870 + }, + { + "epoch": 2.74, + "grad_norm": 11.395952224731445, + "learning_rate": 1.7010417102455758e-06, + "loss": 1.1989, + "step": 21871 + }, + { + "epoch": 2.74, + "grad_norm": 19.015522003173828, + "learning_rate": 1.7002049951888886e-06, + "loss": 0.8393, + "step": 21872 + }, + { + "epoch": 2.75, + "grad_norm": 7.965001583099365, + "learning_rate": 1.6993682801322012e-06, + "loss": 1.1799, + "step": 21873 + }, + { + "epoch": 2.75, + "grad_norm": 2.6550655364990234, + "learning_rate": 1.6985315650755138e-06, + "loss": 0.0839, + "step": 21874 + }, + { + "epoch": 2.75, + "grad_norm": 7.228896141052246, + "learning_rate": 1.6976948500188262e-06, + "loss": 0.2871, + "step": 21875 + }, + { + "epoch": 2.75, + "grad_norm": 21.15328598022461, + "learning_rate": 1.6968581349621388e-06, + "loss": 0.4294, + "step": 21876 + }, + { + "epoch": 2.75, + "grad_norm": 42.655616760253906, + "learning_rate": 1.6960214199054514e-06, + "loss": 1.9843, + "step": 21877 + }, + { + "epoch": 2.75, + "grad_norm": 56.5605583190918, + "learning_rate": 1.6951847048487637e-06, + "loss": 1.3758, + "step": 21878 + }, + { + "epoch": 2.75, + "grad_norm": 15.730677604675293, + "learning_rate": 1.6943479897920765e-06, + "loss": 0.4272, + "step": 21879 + }, + { + "epoch": 2.75, + "grad_norm": 1.9887042045593262, + "learning_rate": 1.6935112747353891e-06, + "loss": 0.0324, + "step": 21880 + }, + { + "epoch": 2.75, + "grad_norm": 15.896023750305176, + "learning_rate": 1.6926745596787015e-06, + "loss": 1.0028, + "step": 21881 + }, + { + "epoch": 2.75, + "grad_norm": 11.582592010498047, + "learning_rate": 1.691837844622014e-06, + "loss": 1.0274, + "step": 21882 + }, + { + "epoch": 2.75, + "grad_norm": 13.835699081420898, + "learning_rate": 1.6910011295653267e-06, + "loss": 1.5205, + "step": 21883 + }, + { + "epoch": 2.75, + "grad_norm": 82.54322052001953, + "learning_rate": 1.6901644145086393e-06, + "loss": 1.3526, + "step": 21884 + }, + { + "epoch": 2.75, + "grad_norm": 19.077165603637695, + "learning_rate": 1.6893276994519517e-06, + "loss": 0.6383, + "step": 21885 + }, + { + "epoch": 2.75, + "grad_norm": 19.55196762084961, + "learning_rate": 1.6884909843952642e-06, + "loss": 1.022, + "step": 21886 + }, + { + "epoch": 2.75, + "grad_norm": 8.582889556884766, + "learning_rate": 1.687654269338577e-06, + "loss": 0.4232, + "step": 21887 + }, + { + "epoch": 2.75, + "grad_norm": 15.185364723205566, + "learning_rate": 1.6868175542818892e-06, + "loss": 0.5666, + "step": 21888 + }, + { + "epoch": 2.75, + "grad_norm": 37.80705642700195, + "learning_rate": 1.685980839225202e-06, + "loss": 1.2841, + "step": 21889 + }, + { + "epoch": 2.75, + "grad_norm": 18.197996139526367, + "learning_rate": 1.6851441241685146e-06, + "loss": 0.4401, + "step": 21890 + }, + { + "epoch": 2.75, + "grad_norm": 0.7867083549499512, + "learning_rate": 1.6843074091118272e-06, + "loss": 0.0157, + "step": 21891 + }, + { + "epoch": 2.75, + "grad_norm": 19.772058486938477, + "learning_rate": 1.6834706940551396e-06, + "loss": 0.5584, + "step": 21892 + }, + { + "epoch": 2.75, + "grad_norm": 11.649335861206055, + "learning_rate": 1.6826339789984522e-06, + "loss": 1.1179, + "step": 21893 + }, + { + "epoch": 2.75, + "grad_norm": 33.918922424316406, + "learning_rate": 1.6817972639417648e-06, + "loss": 2.8612, + "step": 21894 + }, + { + "epoch": 2.75, + "grad_norm": 401.5687561035156, + "learning_rate": 1.6809605488850771e-06, + "loss": 1.3614, + "step": 21895 + }, + { + "epoch": 2.75, + "grad_norm": 13.494359016418457, + "learning_rate": 1.68012383382839e-06, + "loss": 1.6599, + "step": 21896 + }, + { + "epoch": 2.75, + "grad_norm": 326.7059631347656, + "learning_rate": 1.6792871187717025e-06, + "loss": 0.3422, + "step": 21897 + }, + { + "epoch": 2.75, + "grad_norm": 65.26554107666016, + "learning_rate": 1.6784504037150151e-06, + "loss": 1.9373, + "step": 21898 + }, + { + "epoch": 2.75, + "grad_norm": 8.846159934997559, + "learning_rate": 1.6776136886583275e-06, + "loss": 0.1525, + "step": 21899 + }, + { + "epoch": 2.75, + "grad_norm": 54.872589111328125, + "learning_rate": 1.67677697360164e-06, + "loss": 0.8604, + "step": 21900 + }, + { + "epoch": 2.75, + "grad_norm": 15.160786628723145, + "learning_rate": 1.6759402585449527e-06, + "loss": 1.3821, + "step": 21901 + }, + { + "epoch": 2.75, + "grad_norm": 92.08221435546875, + "learning_rate": 1.675103543488265e-06, + "loss": 2.8299, + "step": 21902 + }, + { + "epoch": 2.75, + "grad_norm": 2.968740940093994, + "learning_rate": 1.6742668284315776e-06, + "loss": 0.1037, + "step": 21903 + }, + { + "epoch": 2.75, + "grad_norm": 2.996655225753784, + "learning_rate": 1.6734301133748904e-06, + "loss": 0.0593, + "step": 21904 + }, + { + "epoch": 2.75, + "grad_norm": 10.670302391052246, + "learning_rate": 1.672593398318203e-06, + "loss": 0.2745, + "step": 21905 + }, + { + "epoch": 2.75, + "grad_norm": 103.45101165771484, + "learning_rate": 1.6717566832615154e-06, + "loss": 1.8951, + "step": 21906 + }, + { + "epoch": 2.75, + "grad_norm": 11.243412017822266, + "learning_rate": 1.670919968204828e-06, + "loss": 0.5089, + "step": 21907 + }, + { + "epoch": 2.75, + "grad_norm": 35.79745101928711, + "learning_rate": 1.6700832531481406e-06, + "loss": 0.6931, + "step": 21908 + }, + { + "epoch": 2.75, + "grad_norm": 38.157169342041016, + "learning_rate": 1.669246538091453e-06, + "loss": 2.0333, + "step": 21909 + }, + { + "epoch": 2.75, + "grad_norm": 7.276881217956543, + "learning_rate": 1.6684098230347656e-06, + "loss": 0.3027, + "step": 21910 + }, + { + "epoch": 2.75, + "grad_norm": 10.117690086364746, + "learning_rate": 1.6675731079780784e-06, + "loss": 0.4207, + "step": 21911 + }, + { + "epoch": 2.75, + "grad_norm": 3.078430414199829, + "learning_rate": 1.666736392921391e-06, + "loss": 0.1118, + "step": 21912 + }, + { + "epoch": 2.75, + "grad_norm": 15.570343971252441, + "learning_rate": 1.6658996778647033e-06, + "loss": 0.4665, + "step": 21913 + }, + { + "epoch": 2.75, + "grad_norm": 4.487558841705322, + "learning_rate": 1.665062962808016e-06, + "loss": 0.2549, + "step": 21914 + }, + { + "epoch": 2.75, + "grad_norm": 22.4284610748291, + "learning_rate": 1.6642262477513285e-06, + "loss": 1.4474, + "step": 21915 + }, + { + "epoch": 2.75, + "grad_norm": 11.733662605285645, + "learning_rate": 1.6633895326946409e-06, + "loss": 0.7138, + "step": 21916 + }, + { + "epoch": 2.75, + "grad_norm": 26.12148666381836, + "learning_rate": 1.6625528176379535e-06, + "loss": 2.1125, + "step": 21917 + }, + { + "epoch": 2.75, + "grad_norm": 11.106646537780762, + "learning_rate": 1.661716102581266e-06, + "loss": 0.7808, + "step": 21918 + }, + { + "epoch": 2.75, + "grad_norm": 15.67684555053711, + "learning_rate": 1.6608793875245789e-06, + "loss": 0.4385, + "step": 21919 + }, + { + "epoch": 2.75, + "grad_norm": 288.1107482910156, + "learning_rate": 1.660042672467891e-06, + "loss": 1.8863, + "step": 21920 + }, + { + "epoch": 2.75, + "grad_norm": 91.8941650390625, + "learning_rate": 1.6592059574112038e-06, + "loss": 1.952, + "step": 21921 + }, + { + "epoch": 2.75, + "grad_norm": 14.114540100097656, + "learning_rate": 1.6583692423545164e-06, + "loss": 0.3145, + "step": 21922 + }, + { + "epoch": 2.75, + "grad_norm": 59.77403259277344, + "learning_rate": 1.6575325272978288e-06, + "loss": 1.2466, + "step": 21923 + }, + { + "epoch": 2.75, + "grad_norm": 32.73985290527344, + "learning_rate": 1.6566958122411414e-06, + "loss": 1.3254, + "step": 21924 + }, + { + "epoch": 2.75, + "grad_norm": 22.290546417236328, + "learning_rate": 1.655859097184454e-06, + "loss": 2.5204, + "step": 21925 + }, + { + "epoch": 2.75, + "grad_norm": 21.034669876098633, + "learning_rate": 1.6550223821277664e-06, + "loss": 2.2211, + "step": 21926 + }, + { + "epoch": 2.75, + "grad_norm": 56.31417465209961, + "learning_rate": 1.654185667071079e-06, + "loss": 1.0157, + "step": 21927 + }, + { + "epoch": 2.75, + "grad_norm": 11.791213035583496, + "learning_rate": 1.6533489520143918e-06, + "loss": 0.4559, + "step": 21928 + }, + { + "epoch": 2.75, + "grad_norm": 16.289724349975586, + "learning_rate": 1.6525122369577043e-06, + "loss": 2.1772, + "step": 21929 + }, + { + "epoch": 2.75, + "grad_norm": 6.940343856811523, + "learning_rate": 1.6516755219010167e-06, + "loss": 1.2079, + "step": 21930 + }, + { + "epoch": 2.75, + "grad_norm": 18.676956176757812, + "learning_rate": 1.6508388068443293e-06, + "loss": 1.9555, + "step": 21931 + }, + { + "epoch": 2.75, + "grad_norm": 9.926246643066406, + "learning_rate": 1.650002091787642e-06, + "loss": 2.177, + "step": 21932 + }, + { + "epoch": 2.75, + "grad_norm": 8.71130657196045, + "learning_rate": 1.6491653767309543e-06, + "loss": 0.4256, + "step": 21933 + }, + { + "epoch": 2.75, + "grad_norm": 65.7220458984375, + "learning_rate": 1.6483286616742669e-06, + "loss": 1.3896, + "step": 21934 + }, + { + "epoch": 2.75, + "grad_norm": 23.7147216796875, + "learning_rate": 1.6474919466175795e-06, + "loss": 0.8591, + "step": 21935 + }, + { + "epoch": 2.75, + "grad_norm": 13.173413276672363, + "learning_rate": 1.6466552315608923e-06, + "loss": 0.5658, + "step": 21936 + }, + { + "epoch": 2.75, + "grad_norm": 7.923994064331055, + "learning_rate": 1.6458185165042046e-06, + "loss": 1.5625, + "step": 21937 + }, + { + "epoch": 2.75, + "grad_norm": 15.859750747680664, + "learning_rate": 1.6449818014475172e-06, + "loss": 2.2383, + "step": 21938 + }, + { + "epoch": 2.75, + "grad_norm": 2.3159897327423096, + "learning_rate": 1.6441450863908298e-06, + "loss": 0.1943, + "step": 21939 + }, + { + "epoch": 2.75, + "grad_norm": 20.445648193359375, + "learning_rate": 1.6433083713341422e-06, + "loss": 1.4534, + "step": 21940 + }, + { + "epoch": 2.75, + "grad_norm": 14.416092872619629, + "learning_rate": 1.6424716562774548e-06, + "loss": 0.372, + "step": 21941 + }, + { + "epoch": 2.75, + "grad_norm": 26.19920539855957, + "learning_rate": 1.6416349412207674e-06, + "loss": 1.3652, + "step": 21942 + }, + { + "epoch": 2.75, + "grad_norm": 9.537945747375488, + "learning_rate": 1.6407982261640802e-06, + "loss": 1.5486, + "step": 21943 + }, + { + "epoch": 2.75, + "grad_norm": 13.7022705078125, + "learning_rate": 1.6399615111073923e-06, + "loss": 0.3865, + "step": 21944 + }, + { + "epoch": 2.75, + "grad_norm": 17.28717803955078, + "learning_rate": 1.6391247960507051e-06, + "loss": 1.0219, + "step": 21945 + }, + { + "epoch": 2.75, + "grad_norm": 13.585247993469238, + "learning_rate": 1.6382880809940177e-06, + "loss": 0.5534, + "step": 21946 + }, + { + "epoch": 2.75, + "grad_norm": 14.266397476196289, + "learning_rate": 1.6374513659373301e-06, + "loss": 1.9522, + "step": 21947 + }, + { + "epoch": 2.75, + "grad_norm": 2.218893051147461, + "learning_rate": 1.6366146508806427e-06, + "loss": 0.0877, + "step": 21948 + }, + { + "epoch": 2.75, + "grad_norm": 13.300233840942383, + "learning_rate": 1.6357779358239553e-06, + "loss": 1.1357, + "step": 21949 + }, + { + "epoch": 2.75, + "grad_norm": 8.30359935760498, + "learning_rate": 1.6349412207672679e-06, + "loss": 0.268, + "step": 21950 + }, + { + "epoch": 2.75, + "grad_norm": 6.496923446655273, + "learning_rate": 1.6341045057105803e-06, + "loss": 1.0002, + "step": 21951 + }, + { + "epoch": 2.75, + "grad_norm": 11.52630615234375, + "learning_rate": 1.6332677906538929e-06, + "loss": 0.5624, + "step": 21952 + }, + { + "epoch": 2.76, + "grad_norm": 8.756043434143066, + "learning_rate": 1.6324310755972057e-06, + "loss": 0.7606, + "step": 21953 + }, + { + "epoch": 2.76, + "grad_norm": 11.567495346069336, + "learning_rate": 1.631594360540518e-06, + "loss": 0.3646, + "step": 21954 + }, + { + "epoch": 2.76, + "grad_norm": 5.404824256896973, + "learning_rate": 1.6307576454838306e-06, + "loss": 0.2808, + "step": 21955 + }, + { + "epoch": 2.76, + "grad_norm": 1.953155755996704, + "learning_rate": 1.6299209304271432e-06, + "loss": 0.0587, + "step": 21956 + }, + { + "epoch": 2.76, + "grad_norm": 6.900728225708008, + "learning_rate": 1.6290842153704558e-06, + "loss": 0.3259, + "step": 21957 + }, + { + "epoch": 2.76, + "grad_norm": 21.54115104675293, + "learning_rate": 1.6282475003137682e-06, + "loss": 0.9631, + "step": 21958 + }, + { + "epoch": 2.76, + "grad_norm": 11.71082592010498, + "learning_rate": 1.6274107852570808e-06, + "loss": 0.6494, + "step": 21959 + }, + { + "epoch": 2.76, + "grad_norm": 23.017696380615234, + "learning_rate": 1.6265740702003936e-06, + "loss": 1.0764, + "step": 21960 + }, + { + "epoch": 2.76, + "grad_norm": 16.811050415039062, + "learning_rate": 1.6257373551437057e-06, + "loss": 0.562, + "step": 21961 + }, + { + "epoch": 2.76, + "grad_norm": 10.228431701660156, + "learning_rate": 1.6249006400870185e-06, + "loss": 0.4482, + "step": 21962 + }, + { + "epoch": 2.76, + "grad_norm": 12.603243827819824, + "learning_rate": 1.6240639250303311e-06, + "loss": 0.775, + "step": 21963 + }, + { + "epoch": 2.76, + "grad_norm": 11.481263160705566, + "learning_rate": 1.6232272099736437e-06, + "loss": 1.3212, + "step": 21964 + }, + { + "epoch": 2.76, + "grad_norm": 10.63491439819336, + "learning_rate": 1.622390494916956e-06, + "loss": 0.3953, + "step": 21965 + }, + { + "epoch": 2.76, + "grad_norm": 21.720531463623047, + "learning_rate": 1.6215537798602687e-06, + "loss": 2.258, + "step": 21966 + }, + { + "epoch": 2.76, + "grad_norm": 9.512266159057617, + "learning_rate": 1.6207170648035813e-06, + "loss": 0.7471, + "step": 21967 + }, + { + "epoch": 2.76, + "grad_norm": 7.566118240356445, + "learning_rate": 1.6198803497468937e-06, + "loss": 0.2139, + "step": 21968 + }, + { + "epoch": 2.76, + "grad_norm": 9.541152000427246, + "learning_rate": 1.6190436346902065e-06, + "loss": 0.6269, + "step": 21969 + }, + { + "epoch": 2.76, + "grad_norm": 19.328577041625977, + "learning_rate": 1.618206919633519e-06, + "loss": 0.5275, + "step": 21970 + }, + { + "epoch": 2.76, + "grad_norm": 32.173343658447266, + "learning_rate": 1.6173702045768314e-06, + "loss": 1.8638, + "step": 21971 + }, + { + "epoch": 2.76, + "grad_norm": 36.626922607421875, + "learning_rate": 1.616533489520144e-06, + "loss": 0.6485, + "step": 21972 + }, + { + "epoch": 2.76, + "grad_norm": 6.538182258605957, + "learning_rate": 1.6156967744634566e-06, + "loss": 0.5285, + "step": 21973 + }, + { + "epoch": 2.76, + "grad_norm": 15.379257202148438, + "learning_rate": 1.6148600594067692e-06, + "loss": 0.7523, + "step": 21974 + }, + { + "epoch": 2.76, + "grad_norm": 5.955650806427002, + "learning_rate": 1.6140233443500816e-06, + "loss": 0.3615, + "step": 21975 + }, + { + "epoch": 2.76, + "grad_norm": 17.628507614135742, + "learning_rate": 1.6131866292933942e-06, + "loss": 0.7373, + "step": 21976 + }, + { + "epoch": 2.76, + "grad_norm": 14.149693489074707, + "learning_rate": 1.612349914236707e-06, + "loss": 0.2518, + "step": 21977 + }, + { + "epoch": 2.76, + "grad_norm": 13.153942108154297, + "learning_rate": 1.6115131991800193e-06, + "loss": 1.4344, + "step": 21978 + }, + { + "epoch": 2.76, + "grad_norm": 10.648983001708984, + "learning_rate": 1.610676484123332e-06, + "loss": 1.6093, + "step": 21979 + }, + { + "epoch": 2.76, + "grad_norm": 20.930753707885742, + "learning_rate": 1.6098397690666445e-06, + "loss": 0.6113, + "step": 21980 + }, + { + "epoch": 2.76, + "grad_norm": 10.933890342712402, + "learning_rate": 1.6090030540099571e-06, + "loss": 0.8931, + "step": 21981 + }, + { + "epoch": 2.76, + "grad_norm": 9.867168426513672, + "learning_rate": 1.6081663389532695e-06, + "loss": 0.8762, + "step": 21982 + }, + { + "epoch": 2.76, + "grad_norm": 13.132024765014648, + "learning_rate": 1.607329623896582e-06, + "loss": 2.9462, + "step": 21983 + }, + { + "epoch": 2.76, + "grad_norm": 9.387228012084961, + "learning_rate": 1.6064929088398947e-06, + "loss": 0.2168, + "step": 21984 + }, + { + "epoch": 2.76, + "grad_norm": 10.731705665588379, + "learning_rate": 1.605656193783207e-06, + "loss": 0.617, + "step": 21985 + }, + { + "epoch": 2.76, + "grad_norm": 10.764081001281738, + "learning_rate": 1.6048194787265199e-06, + "loss": 0.7568, + "step": 21986 + }, + { + "epoch": 2.76, + "grad_norm": 7.572868824005127, + "learning_rate": 1.6039827636698324e-06, + "loss": 1.1191, + "step": 21987 + }, + { + "epoch": 2.76, + "grad_norm": 9.564583778381348, + "learning_rate": 1.603146048613145e-06, + "loss": 0.5623, + "step": 21988 + }, + { + "epoch": 2.76, + "grad_norm": 9.795342445373535, + "learning_rate": 1.6023093335564574e-06, + "loss": 1.1223, + "step": 21989 + }, + { + "epoch": 2.76, + "grad_norm": 16.631587982177734, + "learning_rate": 1.60147261849977e-06, + "loss": 0.9183, + "step": 21990 + }, + { + "epoch": 2.76, + "grad_norm": 3.764885425567627, + "learning_rate": 1.6006359034430826e-06, + "loss": 0.1322, + "step": 21991 + }, + { + "epoch": 2.76, + "grad_norm": 11.128994941711426, + "learning_rate": 1.599799188386395e-06, + "loss": 0.7086, + "step": 21992 + }, + { + "epoch": 2.76, + "grad_norm": 28.502336502075195, + "learning_rate": 1.5989624733297076e-06, + "loss": 2.1299, + "step": 21993 + }, + { + "epoch": 2.76, + "grad_norm": 6.179806232452393, + "learning_rate": 1.5981257582730204e-06, + "loss": 0.5095, + "step": 21994 + }, + { + "epoch": 2.76, + "grad_norm": 94.47281646728516, + "learning_rate": 1.597289043216333e-06, + "loss": 1.3763, + "step": 21995 + }, + { + "epoch": 2.76, + "grad_norm": 5.738989353179932, + "learning_rate": 1.5964523281596453e-06, + "loss": 0.2774, + "step": 21996 + }, + { + "epoch": 2.76, + "grad_norm": 9.436177253723145, + "learning_rate": 1.595615613102958e-06, + "loss": 1.0281, + "step": 21997 + }, + { + "epoch": 2.76, + "grad_norm": 28.344717025756836, + "learning_rate": 1.5947788980462705e-06, + "loss": 1.0338, + "step": 21998 + }, + { + "epoch": 2.76, + "grad_norm": 16.288572311401367, + "learning_rate": 1.5939421829895829e-06, + "loss": 0.8427, + "step": 21999 + }, + { + "epoch": 2.76, + "grad_norm": 14.561866760253906, + "learning_rate": 1.5931054679328955e-06, + "loss": 0.7721, + "step": 22000 + }, + { + "epoch": 2.76, + "eval_loss": 0.07404118776321411, + "eval_runtime": 95.399, + "eval_samples_per_second": 37.128, + "eval_steps_per_second": 37.128, + "step": 22000 + }, + { + "epoch": 2.76, + "grad_norm": 12.305974960327148, + "learning_rate": 1.5922687528762083e-06, + "loss": 1.7453, + "step": 22001 + }, + { + "epoch": 2.76, + "grad_norm": 7.190908432006836, + "learning_rate": 1.5914320378195209e-06, + "loss": 0.229, + "step": 22002 + }, + { + "epoch": 2.76, + "grad_norm": 21.130447387695312, + "learning_rate": 1.5905953227628333e-06, + "loss": 0.8447, + "step": 22003 + }, + { + "epoch": 2.76, + "grad_norm": 5.994531631469727, + "learning_rate": 1.5897586077061458e-06, + "loss": 0.2864, + "step": 22004 + }, + { + "epoch": 2.76, + "grad_norm": 12.654057502746582, + "learning_rate": 1.5889218926494584e-06, + "loss": 1.6754, + "step": 22005 + }, + { + "epoch": 2.76, + "grad_norm": 15.838152885437012, + "learning_rate": 1.5880851775927708e-06, + "loss": 0.6666, + "step": 22006 + }, + { + "epoch": 2.76, + "grad_norm": 6.544735431671143, + "learning_rate": 1.5872484625360834e-06, + "loss": 0.7359, + "step": 22007 + }, + { + "epoch": 2.76, + "grad_norm": 18.814769744873047, + "learning_rate": 1.586411747479396e-06, + "loss": 1.3601, + "step": 22008 + }, + { + "epoch": 2.76, + "grad_norm": 34.54404067993164, + "learning_rate": 1.5855750324227084e-06, + "loss": 0.6953, + "step": 22009 + }, + { + "epoch": 2.76, + "grad_norm": 60.191776275634766, + "learning_rate": 1.5847383173660212e-06, + "loss": 2.1437, + "step": 22010 + }, + { + "epoch": 2.76, + "grad_norm": 12.865761756896973, + "learning_rate": 1.5839016023093338e-06, + "loss": 0.5915, + "step": 22011 + }, + { + "epoch": 2.76, + "grad_norm": 55.058258056640625, + "learning_rate": 1.5830648872526464e-06, + "loss": 1.368, + "step": 22012 + }, + { + "epoch": 2.76, + "grad_norm": 6.351691722869873, + "learning_rate": 1.5822281721959587e-06, + "loss": 0.5392, + "step": 22013 + }, + { + "epoch": 2.76, + "grad_norm": 1.2716728448867798, + "learning_rate": 1.5813914571392713e-06, + "loss": 0.0164, + "step": 22014 + }, + { + "epoch": 2.76, + "grad_norm": 13.238324165344238, + "learning_rate": 1.580554742082584e-06, + "loss": 0.666, + "step": 22015 + }, + { + "epoch": 2.76, + "grad_norm": 10.876042366027832, + "learning_rate": 1.5797180270258963e-06, + "loss": 0.6721, + "step": 22016 + }, + { + "epoch": 2.76, + "grad_norm": 8.264463424682617, + "learning_rate": 1.5788813119692089e-06, + "loss": 0.9602, + "step": 22017 + }, + { + "epoch": 2.76, + "grad_norm": 5.361449241638184, + "learning_rate": 1.5780445969125217e-06, + "loss": 0.2794, + "step": 22018 + }, + { + "epoch": 2.76, + "grad_norm": 35.89775085449219, + "learning_rate": 1.5772078818558343e-06, + "loss": 0.8674, + "step": 22019 + }, + { + "epoch": 2.76, + "grad_norm": 10.809699058532715, + "learning_rate": 1.5763711667991466e-06, + "loss": 0.3178, + "step": 22020 + }, + { + "epoch": 2.76, + "grad_norm": 26.81633186340332, + "learning_rate": 1.5755344517424592e-06, + "loss": 1.2973, + "step": 22021 + }, + { + "epoch": 2.76, + "grad_norm": 97.02884674072266, + "learning_rate": 1.5746977366857718e-06, + "loss": 1.5536, + "step": 22022 + }, + { + "epoch": 2.76, + "grad_norm": 37.21788787841797, + "learning_rate": 1.5738610216290842e-06, + "loss": 1.3391, + "step": 22023 + }, + { + "epoch": 2.76, + "grad_norm": 29.007442474365234, + "learning_rate": 1.5730243065723968e-06, + "loss": 1.586, + "step": 22024 + }, + { + "epoch": 2.76, + "grad_norm": 12.787467956542969, + "learning_rate": 1.5721875915157094e-06, + "loss": 0.9574, + "step": 22025 + }, + { + "epoch": 2.76, + "grad_norm": 14.426722526550293, + "learning_rate": 1.5713508764590222e-06, + "loss": 0.4947, + "step": 22026 + }, + { + "epoch": 2.76, + "grad_norm": 18.206111907958984, + "learning_rate": 1.5705141614023346e-06, + "loss": 1.6256, + "step": 22027 + }, + { + "epoch": 2.76, + "grad_norm": 11.78557014465332, + "learning_rate": 1.5696774463456472e-06, + "loss": 0.2601, + "step": 22028 + }, + { + "epoch": 2.76, + "grad_norm": 9.279020309448242, + "learning_rate": 1.5688407312889597e-06, + "loss": 0.8065, + "step": 22029 + }, + { + "epoch": 2.76, + "grad_norm": 7.007302761077881, + "learning_rate": 1.5680040162322721e-06, + "loss": 0.7756, + "step": 22030 + }, + { + "epoch": 2.76, + "grad_norm": 19.91046905517578, + "learning_rate": 1.5671673011755847e-06, + "loss": 0.7541, + "step": 22031 + }, + { + "epoch": 2.76, + "grad_norm": 57.409095764160156, + "learning_rate": 1.5663305861188973e-06, + "loss": 4.0752, + "step": 22032 + }, + { + "epoch": 2.77, + "grad_norm": 12.127786636352539, + "learning_rate": 1.5654938710622101e-06, + "loss": 2.0843, + "step": 22033 + }, + { + "epoch": 2.77, + "grad_norm": 5.797756671905518, + "learning_rate": 1.5646571560055223e-06, + "loss": 0.2609, + "step": 22034 + }, + { + "epoch": 2.77, + "grad_norm": 11.170928955078125, + "learning_rate": 1.563820440948835e-06, + "loss": 0.2006, + "step": 22035 + }, + { + "epoch": 2.77, + "grad_norm": 10.267579078674316, + "learning_rate": 1.5629837258921477e-06, + "loss": 1.2304, + "step": 22036 + }, + { + "epoch": 2.77, + "grad_norm": 16.07809829711914, + "learning_rate": 1.56214701083546e-06, + "loss": 1.8852, + "step": 22037 + }, + { + "epoch": 2.77, + "grad_norm": 9.74322509765625, + "learning_rate": 1.5613102957787726e-06, + "loss": 1.6937, + "step": 22038 + }, + { + "epoch": 2.77, + "grad_norm": 9.701142311096191, + "learning_rate": 1.5604735807220852e-06, + "loss": 2.0227, + "step": 22039 + }, + { + "epoch": 2.77, + "grad_norm": 31.289939880371094, + "learning_rate": 1.5596368656653978e-06, + "loss": 2.159, + "step": 22040 + }, + { + "epoch": 2.77, + "grad_norm": 7.308387756347656, + "learning_rate": 1.5588001506087102e-06, + "loss": 1.11, + "step": 22041 + }, + { + "epoch": 2.77, + "grad_norm": 1.7404890060424805, + "learning_rate": 1.557963435552023e-06, + "loss": 0.0592, + "step": 22042 + }, + { + "epoch": 2.77, + "grad_norm": 18.605871200561523, + "learning_rate": 1.5571267204953356e-06, + "loss": 2.3566, + "step": 22043 + }, + { + "epoch": 2.77, + "grad_norm": 4.8233323097229, + "learning_rate": 1.556290005438648e-06, + "loss": 0.3014, + "step": 22044 + }, + { + "epoch": 2.77, + "grad_norm": 10.767655372619629, + "learning_rate": 1.5554532903819606e-06, + "loss": 0.9742, + "step": 22045 + }, + { + "epoch": 2.77, + "grad_norm": 8.517369270324707, + "learning_rate": 1.5546165753252731e-06, + "loss": 0.3589, + "step": 22046 + }, + { + "epoch": 2.77, + "grad_norm": 15.145514488220215, + "learning_rate": 1.5537798602685857e-06, + "loss": 1.6709, + "step": 22047 + }, + { + "epoch": 2.77, + "grad_norm": 17.403635025024414, + "learning_rate": 1.5529431452118981e-06, + "loss": 1.2472, + "step": 22048 + }, + { + "epoch": 2.77, + "grad_norm": 8.701679229736328, + "learning_rate": 1.5521064301552107e-06, + "loss": 0.2516, + "step": 22049 + }, + { + "epoch": 2.77, + "grad_norm": 17.895038604736328, + "learning_rate": 1.5512697150985235e-06, + "loss": 1.2595, + "step": 22050 + }, + { + "epoch": 2.77, + "grad_norm": 13.750988006591797, + "learning_rate": 1.5504330000418359e-06, + "loss": 0.3344, + "step": 22051 + }, + { + "epoch": 2.77, + "grad_norm": 6.85670804977417, + "learning_rate": 1.5495962849851485e-06, + "loss": 0.1961, + "step": 22052 + }, + { + "epoch": 2.77, + "grad_norm": 41.78623962402344, + "learning_rate": 1.548759569928461e-06, + "loss": 0.9157, + "step": 22053 + }, + { + "epoch": 2.77, + "grad_norm": 14.114798545837402, + "learning_rate": 1.5479228548717734e-06, + "loss": 1.2567, + "step": 22054 + }, + { + "epoch": 2.77, + "grad_norm": 7.244355201721191, + "learning_rate": 1.547086139815086e-06, + "loss": 0.1725, + "step": 22055 + }, + { + "epoch": 2.77, + "grad_norm": 8.193955421447754, + "learning_rate": 1.5462494247583986e-06, + "loss": 0.5188, + "step": 22056 + }, + { + "epoch": 2.77, + "grad_norm": 6.3143134117126465, + "learning_rate": 1.5454127097017112e-06, + "loss": 1.343, + "step": 22057 + }, + { + "epoch": 2.77, + "grad_norm": 4.948602199554443, + "learning_rate": 1.5445759946450236e-06, + "loss": 0.2202, + "step": 22058 + }, + { + "epoch": 2.77, + "grad_norm": 3.0916144847869873, + "learning_rate": 1.5437392795883364e-06, + "loss": 0.1855, + "step": 22059 + }, + { + "epoch": 2.77, + "grad_norm": 15.833561897277832, + "learning_rate": 1.542902564531649e-06, + "loss": 1.0426, + "step": 22060 + }, + { + "epoch": 2.77, + "grad_norm": 33.184932708740234, + "learning_rate": 1.5420658494749614e-06, + "loss": 4.4033, + "step": 22061 + }, + { + "epoch": 2.77, + "grad_norm": 15.196234703063965, + "learning_rate": 1.541229134418274e-06, + "loss": 0.7092, + "step": 22062 + }, + { + "epoch": 2.77, + "grad_norm": 9.446250915527344, + "learning_rate": 1.5403924193615865e-06, + "loss": 0.7741, + "step": 22063 + }, + { + "epoch": 2.77, + "grad_norm": 7.459219932556152, + "learning_rate": 1.5395557043048991e-06, + "loss": 0.8118, + "step": 22064 + }, + { + "epoch": 2.77, + "grad_norm": 9.842741012573242, + "learning_rate": 1.5387189892482115e-06, + "loss": 0.9376, + "step": 22065 + }, + { + "epoch": 2.77, + "grad_norm": 9.335184097290039, + "learning_rate": 1.537882274191524e-06, + "loss": 2.2261, + "step": 22066 + }, + { + "epoch": 2.77, + "grad_norm": 16.755369186401367, + "learning_rate": 1.537045559134837e-06, + "loss": 0.866, + "step": 22067 + }, + { + "epoch": 2.77, + "grad_norm": 20.118492126464844, + "learning_rate": 1.5362088440781493e-06, + "loss": 1.2994, + "step": 22068 + }, + { + "epoch": 2.77, + "grad_norm": 15.074871063232422, + "learning_rate": 1.5353721290214619e-06, + "loss": 0.7052, + "step": 22069 + }, + { + "epoch": 2.77, + "grad_norm": 11.162530899047852, + "learning_rate": 1.5345354139647745e-06, + "loss": 0.2569, + "step": 22070 + }, + { + "epoch": 2.77, + "grad_norm": 16.171817779541016, + "learning_rate": 1.533698698908087e-06, + "loss": 0.7483, + "step": 22071 + }, + { + "epoch": 2.77, + "grad_norm": 12.387211799621582, + "learning_rate": 1.5328619838513994e-06, + "loss": 0.3203, + "step": 22072 + }, + { + "epoch": 2.77, + "grad_norm": 49.55227279663086, + "learning_rate": 1.532025268794712e-06, + "loss": 1.2671, + "step": 22073 + }, + { + "epoch": 2.77, + "grad_norm": 14.436878204345703, + "learning_rate": 1.5311885537380248e-06, + "loss": 0.6779, + "step": 22074 + }, + { + "epoch": 2.77, + "grad_norm": 9.842228889465332, + "learning_rate": 1.530351838681337e-06, + "loss": 1.8578, + "step": 22075 + }, + { + "epoch": 2.77, + "grad_norm": 7.118929386138916, + "learning_rate": 1.5295151236246498e-06, + "loss": 0.3359, + "step": 22076 + }, + { + "epoch": 2.77, + "grad_norm": 1.044647455215454, + "learning_rate": 1.5286784085679624e-06, + "loss": 0.0278, + "step": 22077 + }, + { + "epoch": 2.77, + "grad_norm": 9.421809196472168, + "learning_rate": 1.527841693511275e-06, + "loss": 1.5462, + "step": 22078 + }, + { + "epoch": 2.77, + "grad_norm": 14.961891174316406, + "learning_rate": 1.5270049784545873e-06, + "loss": 0.8989, + "step": 22079 + }, + { + "epoch": 2.77, + "grad_norm": 30.62376594543457, + "learning_rate": 1.5261682633979e-06, + "loss": 0.742, + "step": 22080 + }, + { + "epoch": 2.77, + "grad_norm": 36.153297424316406, + "learning_rate": 1.5253315483412125e-06, + "loss": 0.8372, + "step": 22081 + }, + { + "epoch": 2.77, + "grad_norm": 64.26068878173828, + "learning_rate": 1.524494833284525e-06, + "loss": 2.5712, + "step": 22082 + }, + { + "epoch": 2.77, + "grad_norm": 14.459300994873047, + "learning_rate": 1.5236581182278377e-06, + "loss": 0.4504, + "step": 22083 + }, + { + "epoch": 2.77, + "grad_norm": 3.5654618740081787, + "learning_rate": 1.5228214031711503e-06, + "loss": 0.2064, + "step": 22084 + }, + { + "epoch": 2.77, + "grad_norm": 10.165255546569824, + "learning_rate": 1.5219846881144629e-06, + "loss": 2.0243, + "step": 22085 + }, + { + "epoch": 2.77, + "grad_norm": 26.57514762878418, + "learning_rate": 1.5211479730577753e-06, + "loss": 1.5504, + "step": 22086 + }, + { + "epoch": 2.77, + "grad_norm": 56.33488845825195, + "learning_rate": 1.5203112580010879e-06, + "loss": 2.0848, + "step": 22087 + }, + { + "epoch": 2.77, + "grad_norm": 34.85393142700195, + "learning_rate": 1.5194745429444004e-06, + "loss": 1.8452, + "step": 22088 + }, + { + "epoch": 2.77, + "grad_norm": 13.139994621276855, + "learning_rate": 1.5186378278877128e-06, + "loss": 0.3163, + "step": 22089 + }, + { + "epoch": 2.77, + "grad_norm": 10.634119987487793, + "learning_rate": 1.5178011128310254e-06, + "loss": 0.827, + "step": 22090 + }, + { + "epoch": 2.77, + "grad_norm": 18.775665283203125, + "learning_rate": 1.5169643977743382e-06, + "loss": 1.0176, + "step": 22091 + }, + { + "epoch": 2.77, + "grad_norm": 6.013891696929932, + "learning_rate": 1.5161276827176508e-06, + "loss": 0.1807, + "step": 22092 + }, + { + "epoch": 2.77, + "grad_norm": 22.8477840423584, + "learning_rate": 1.5152909676609632e-06, + "loss": 2.253, + "step": 22093 + }, + { + "epoch": 2.77, + "grad_norm": 2.667479991912842, + "learning_rate": 1.5144542526042758e-06, + "loss": 0.2037, + "step": 22094 + }, + { + "epoch": 2.77, + "grad_norm": 8.795310974121094, + "learning_rate": 1.5136175375475884e-06, + "loss": 0.4072, + "step": 22095 + }, + { + "epoch": 2.77, + "grad_norm": 8.953936576843262, + "learning_rate": 1.5127808224909007e-06, + "loss": 0.5833, + "step": 22096 + }, + { + "epoch": 2.77, + "grad_norm": 31.658977508544922, + "learning_rate": 1.5119441074342133e-06, + "loss": 2.745, + "step": 22097 + }, + { + "epoch": 2.77, + "grad_norm": 14.953171730041504, + "learning_rate": 1.511107392377526e-06, + "loss": 1.4642, + "step": 22098 + }, + { + "epoch": 2.77, + "grad_norm": 8.55324935913086, + "learning_rate": 1.5102706773208383e-06, + "loss": 0.4538, + "step": 22099 + }, + { + "epoch": 2.77, + "grad_norm": 34.371070861816406, + "learning_rate": 1.509433962264151e-06, + "loss": 1.0861, + "step": 22100 + }, + { + "epoch": 2.77, + "grad_norm": 16.9543399810791, + "learning_rate": 1.5085972472074637e-06, + "loss": 0.5991, + "step": 22101 + }, + { + "epoch": 2.77, + "grad_norm": 9.909595489501953, + "learning_rate": 1.5077605321507763e-06, + "loss": 0.4898, + "step": 22102 + }, + { + "epoch": 2.77, + "grad_norm": 82.10369110107422, + "learning_rate": 1.5069238170940887e-06, + "loss": 2.7447, + "step": 22103 + }, + { + "epoch": 2.77, + "grad_norm": 7.9112324714660645, + "learning_rate": 1.5060871020374012e-06, + "loss": 1.4893, + "step": 22104 + }, + { + "epoch": 2.77, + "grad_norm": 1.8757380247116089, + "learning_rate": 1.5052503869807138e-06, + "loss": 0.0745, + "step": 22105 + }, + { + "epoch": 2.77, + "grad_norm": 18.111446380615234, + "learning_rate": 1.5044136719240262e-06, + "loss": 1.2961, + "step": 22106 + }, + { + "epoch": 2.77, + "grad_norm": 21.040069580078125, + "learning_rate": 1.5035769568673388e-06, + "loss": 0.4997, + "step": 22107 + }, + { + "epoch": 2.77, + "grad_norm": 12.237754821777344, + "learning_rate": 1.5027402418106516e-06, + "loss": 0.9439, + "step": 22108 + }, + { + "epoch": 2.77, + "grad_norm": 38.84874725341797, + "learning_rate": 1.5019035267539642e-06, + "loss": 1.1505, + "step": 22109 + }, + { + "epoch": 2.77, + "grad_norm": 15.144665718078613, + "learning_rate": 1.5010668116972766e-06, + "loss": 0.5755, + "step": 22110 + }, + { + "epoch": 2.77, + "grad_norm": 194.444091796875, + "learning_rate": 1.5002300966405892e-06, + "loss": 1.1618, + "step": 22111 + }, + { + "epoch": 2.78, + "grad_norm": 90.46455383300781, + "learning_rate": 1.4993933815839018e-06, + "loss": 1.0779, + "step": 22112 + }, + { + "epoch": 2.78, + "grad_norm": 25.218923568725586, + "learning_rate": 1.4985566665272141e-06, + "loss": 0.7475, + "step": 22113 + }, + { + "epoch": 2.78, + "grad_norm": 14.284053802490234, + "learning_rate": 1.4977199514705267e-06, + "loss": 0.9119, + "step": 22114 + }, + { + "epoch": 2.78, + "grad_norm": 4.315624713897705, + "learning_rate": 1.4968832364138395e-06, + "loss": 0.2522, + "step": 22115 + }, + { + "epoch": 2.78, + "grad_norm": 10.262313842773438, + "learning_rate": 1.4960465213571521e-06, + "loss": 0.7692, + "step": 22116 + }, + { + "epoch": 2.78, + "grad_norm": 10.80276870727539, + "learning_rate": 1.4952098063004645e-06, + "loss": 0.2312, + "step": 22117 + }, + { + "epoch": 2.78, + "grad_norm": 11.167405128479004, + "learning_rate": 1.494373091243777e-06, + "loss": 1.6331, + "step": 22118 + }, + { + "epoch": 2.78, + "grad_norm": 5.160336971282959, + "learning_rate": 1.4935363761870897e-06, + "loss": 0.1514, + "step": 22119 + }, + { + "epoch": 2.78, + "grad_norm": 9.625428199768066, + "learning_rate": 1.492699661130402e-06, + "loss": 0.6352, + "step": 22120 + }, + { + "epoch": 2.78, + "grad_norm": 5.5974812507629395, + "learning_rate": 1.4918629460737146e-06, + "loss": 0.2554, + "step": 22121 + }, + { + "epoch": 2.78, + "grad_norm": 55.964046478271484, + "learning_rate": 1.4910262310170272e-06, + "loss": 1.9656, + "step": 22122 + }, + { + "epoch": 2.78, + "grad_norm": 33.85047149658203, + "learning_rate": 1.49018951596034e-06, + "loss": 2.8984, + "step": 22123 + }, + { + "epoch": 2.78, + "grad_norm": 20.248414993286133, + "learning_rate": 1.4893528009036524e-06, + "loss": 0.7146, + "step": 22124 + }, + { + "epoch": 2.78, + "grad_norm": 3.2281851768493652, + "learning_rate": 1.488516085846965e-06, + "loss": 0.0871, + "step": 22125 + }, + { + "epoch": 2.78, + "grad_norm": 17.509889602661133, + "learning_rate": 1.4876793707902776e-06, + "loss": 1.3073, + "step": 22126 + }, + { + "epoch": 2.78, + "grad_norm": 3.7109663486480713, + "learning_rate": 1.48684265573359e-06, + "loss": 0.177, + "step": 22127 + }, + { + "epoch": 2.78, + "grad_norm": 33.91141891479492, + "learning_rate": 1.4860059406769026e-06, + "loss": 1.4068, + "step": 22128 + }, + { + "epoch": 2.78, + "grad_norm": 5.998182773590088, + "learning_rate": 1.4851692256202152e-06, + "loss": 0.3091, + "step": 22129 + }, + { + "epoch": 2.78, + "grad_norm": 99.66908264160156, + "learning_rate": 1.4843325105635277e-06, + "loss": 4.5171, + "step": 22130 + }, + { + "epoch": 2.78, + "grad_norm": 12.617664337158203, + "learning_rate": 1.4834957955068401e-06, + "loss": 0.5151, + "step": 22131 + }, + { + "epoch": 2.78, + "grad_norm": 2.63997220993042, + "learning_rate": 1.482659080450153e-06, + "loss": 0.0602, + "step": 22132 + }, + { + "epoch": 2.78, + "grad_norm": 28.04462242126465, + "learning_rate": 1.4818223653934655e-06, + "loss": 1.3028, + "step": 22133 + }, + { + "epoch": 2.78, + "grad_norm": 15.393304824829102, + "learning_rate": 1.4809856503367779e-06, + "loss": 0.7099, + "step": 22134 + }, + { + "epoch": 2.78, + "grad_norm": 8.952462196350098, + "learning_rate": 1.4801489352800905e-06, + "loss": 0.232, + "step": 22135 + }, + { + "epoch": 2.78, + "grad_norm": 5.731597423553467, + "learning_rate": 1.479312220223403e-06, + "loss": 0.2321, + "step": 22136 + }, + { + "epoch": 2.78, + "grad_norm": 13.162818908691406, + "learning_rate": 1.4784755051667157e-06, + "loss": 0.7053, + "step": 22137 + }, + { + "epoch": 2.78, + "grad_norm": 12.984853744506836, + "learning_rate": 1.477638790110028e-06, + "loss": 0.4655, + "step": 22138 + }, + { + "epoch": 2.78, + "grad_norm": 25.313459396362305, + "learning_rate": 1.4768020750533406e-06, + "loss": 1.4556, + "step": 22139 + }, + { + "epoch": 2.78, + "grad_norm": 69.79540252685547, + "learning_rate": 1.4759653599966534e-06, + "loss": 1.2859, + "step": 22140 + }, + { + "epoch": 2.78, + "grad_norm": 22.690126419067383, + "learning_rate": 1.4751286449399658e-06, + "loss": 1.272, + "step": 22141 + }, + { + "epoch": 2.78, + "grad_norm": 3.0274555683135986, + "learning_rate": 1.4742919298832784e-06, + "loss": 0.11, + "step": 22142 + }, + { + "epoch": 2.78, + "grad_norm": 34.69709014892578, + "learning_rate": 1.473455214826591e-06, + "loss": 3.0681, + "step": 22143 + }, + { + "epoch": 2.78, + "grad_norm": 18.169361114501953, + "learning_rate": 1.4726184997699034e-06, + "loss": 0.3721, + "step": 22144 + }, + { + "epoch": 2.78, + "grad_norm": 14.725988388061523, + "learning_rate": 1.471781784713216e-06, + "loss": 0.6978, + "step": 22145 + }, + { + "epoch": 2.78, + "grad_norm": 33.30964660644531, + "learning_rate": 1.4709450696565285e-06, + "loss": 0.3039, + "step": 22146 + }, + { + "epoch": 2.78, + "grad_norm": 22.753398895263672, + "learning_rate": 1.4701083545998413e-06, + "loss": 0.7515, + "step": 22147 + }, + { + "epoch": 2.78, + "grad_norm": 11.723492622375488, + "learning_rate": 1.4692716395431535e-06, + "loss": 0.3227, + "step": 22148 + }, + { + "epoch": 2.78, + "grad_norm": 12.993480682373047, + "learning_rate": 1.4684349244864663e-06, + "loss": 0.3641, + "step": 22149 + }, + { + "epoch": 2.78, + "grad_norm": 37.284122467041016, + "learning_rate": 1.467598209429779e-06, + "loss": 1.941, + "step": 22150 + }, + { + "epoch": 2.78, + "grad_norm": 6.983309268951416, + "learning_rate": 1.4667614943730913e-06, + "loss": 0.9879, + "step": 22151 + }, + { + "epoch": 2.78, + "grad_norm": 12.566207885742188, + "learning_rate": 1.4659247793164039e-06, + "loss": 0.5651, + "step": 22152 + }, + { + "epoch": 2.78, + "grad_norm": 27.132915496826172, + "learning_rate": 1.4650880642597165e-06, + "loss": 2.1663, + "step": 22153 + }, + { + "epoch": 2.78, + "grad_norm": 105.89178466796875, + "learning_rate": 1.464251349203029e-06, + "loss": 0.6645, + "step": 22154 + }, + { + "epoch": 2.78, + "grad_norm": 15.870180130004883, + "learning_rate": 1.4634146341463414e-06, + "loss": 1.048, + "step": 22155 + }, + { + "epoch": 2.78, + "grad_norm": 101.65707397460938, + "learning_rate": 1.4625779190896542e-06, + "loss": 1.8495, + "step": 22156 + }, + { + "epoch": 2.78, + "grad_norm": 487.60479736328125, + "learning_rate": 1.4617412040329668e-06, + "loss": 0.9429, + "step": 22157 + }, + { + "epoch": 2.78, + "grad_norm": 16.204265594482422, + "learning_rate": 1.4609044889762792e-06, + "loss": 1.0264, + "step": 22158 + }, + { + "epoch": 2.78, + "grad_norm": 21.302675247192383, + "learning_rate": 1.4600677739195918e-06, + "loss": 1.2089, + "step": 22159 + }, + { + "epoch": 2.78, + "grad_norm": 18.612960815429688, + "learning_rate": 1.4592310588629044e-06, + "loss": 0.7613, + "step": 22160 + }, + { + "epoch": 2.78, + "grad_norm": 6.147254943847656, + "learning_rate": 1.458394343806217e-06, + "loss": 0.2872, + "step": 22161 + }, + { + "epoch": 2.78, + "grad_norm": 102.18993377685547, + "learning_rate": 1.4575576287495294e-06, + "loss": 1.4707, + "step": 22162 + }, + { + "epoch": 2.78, + "grad_norm": 10.75220012664795, + "learning_rate": 1.456720913692842e-06, + "loss": 0.6927, + "step": 22163 + }, + { + "epoch": 2.78, + "grad_norm": 13.363828659057617, + "learning_rate": 1.4558841986361547e-06, + "loss": 0.708, + "step": 22164 + }, + { + "epoch": 2.78, + "grad_norm": 15.768172264099121, + "learning_rate": 1.455047483579467e-06, + "loss": 1.7297, + "step": 22165 + }, + { + "epoch": 2.78, + "grad_norm": 25.005382537841797, + "learning_rate": 1.4542107685227797e-06, + "loss": 1.0066, + "step": 22166 + }, + { + "epoch": 2.78, + "grad_norm": 6.12571382522583, + "learning_rate": 1.4533740534660923e-06, + "loss": 0.1812, + "step": 22167 + }, + { + "epoch": 2.78, + "grad_norm": 18.252429962158203, + "learning_rate": 1.4525373384094049e-06, + "loss": 1.1004, + "step": 22168 + }, + { + "epoch": 2.78, + "grad_norm": 14.02508544921875, + "learning_rate": 1.4517006233527173e-06, + "loss": 0.4741, + "step": 22169 + }, + { + "epoch": 2.78, + "grad_norm": 25.932781219482422, + "learning_rate": 1.4508639082960299e-06, + "loss": 1.9042, + "step": 22170 + }, + { + "epoch": 2.78, + "grad_norm": 111.95731353759766, + "learning_rate": 1.4500271932393425e-06, + "loss": 0.8828, + "step": 22171 + }, + { + "epoch": 2.78, + "grad_norm": 71.82391357421875, + "learning_rate": 1.4491904781826548e-06, + "loss": 1.8012, + "step": 22172 + }, + { + "epoch": 2.78, + "grad_norm": 8.690864562988281, + "learning_rate": 1.4483537631259676e-06, + "loss": 0.8402, + "step": 22173 + }, + { + "epoch": 2.78, + "grad_norm": 9.595281600952148, + "learning_rate": 1.4475170480692802e-06, + "loss": 1.3443, + "step": 22174 + }, + { + "epoch": 2.78, + "grad_norm": 9.36928653717041, + "learning_rate": 1.4466803330125928e-06, + "loss": 1.1032, + "step": 22175 + }, + { + "epoch": 2.78, + "grad_norm": 15.006649017333984, + "learning_rate": 1.4458436179559052e-06, + "loss": 0.9495, + "step": 22176 + }, + { + "epoch": 2.78, + "grad_norm": 73.973388671875, + "learning_rate": 1.4450069028992178e-06, + "loss": 0.3064, + "step": 22177 + }, + { + "epoch": 2.78, + "grad_norm": 3.6623029708862305, + "learning_rate": 1.4441701878425304e-06, + "loss": 0.2425, + "step": 22178 + }, + { + "epoch": 2.78, + "grad_norm": 7.811987400054932, + "learning_rate": 1.4433334727858427e-06, + "loss": 0.4153, + "step": 22179 + }, + { + "epoch": 2.78, + "grad_norm": 92.86666107177734, + "learning_rate": 1.4424967577291553e-06, + "loss": 1.6914, + "step": 22180 + }, + { + "epoch": 2.78, + "grad_norm": 23.44353485107422, + "learning_rate": 1.4416600426724681e-06, + "loss": 0.8927, + "step": 22181 + }, + { + "epoch": 2.78, + "grad_norm": 4.478631496429443, + "learning_rate": 1.4408233276157807e-06, + "loss": 0.1989, + "step": 22182 + }, + { + "epoch": 2.78, + "grad_norm": 109.07917785644531, + "learning_rate": 1.4399866125590931e-06, + "loss": 2.1958, + "step": 22183 + }, + { + "epoch": 2.78, + "grad_norm": 10.12816047668457, + "learning_rate": 1.4391498975024057e-06, + "loss": 0.4367, + "step": 22184 + }, + { + "epoch": 2.78, + "grad_norm": 46.511898040771484, + "learning_rate": 1.4383131824457183e-06, + "loss": 1.5326, + "step": 22185 + }, + { + "epoch": 2.78, + "grad_norm": 5.351166725158691, + "learning_rate": 1.4374764673890307e-06, + "loss": 0.4717, + "step": 22186 + }, + { + "epoch": 2.78, + "grad_norm": 7.09501314163208, + "learning_rate": 1.4366397523323433e-06, + "loss": 0.5723, + "step": 22187 + }, + { + "epoch": 2.78, + "grad_norm": 4.911225318908691, + "learning_rate": 1.435803037275656e-06, + "loss": 0.9178, + "step": 22188 + }, + { + "epoch": 2.78, + "grad_norm": 36.53730392456055, + "learning_rate": 1.4349663222189682e-06, + "loss": 1.3609, + "step": 22189 + }, + { + "epoch": 2.78, + "grad_norm": 10.447989463806152, + "learning_rate": 1.434129607162281e-06, + "loss": 0.8304, + "step": 22190 + }, + { + "epoch": 2.78, + "grad_norm": 15.028526306152344, + "learning_rate": 1.4332928921055936e-06, + "loss": 1.0632, + "step": 22191 + }, + { + "epoch": 2.79, + "grad_norm": 26.80242156982422, + "learning_rate": 1.4324561770489062e-06, + "loss": 1.0022, + "step": 22192 + }, + { + "epoch": 2.79, + "grad_norm": 55.87772750854492, + "learning_rate": 1.4316194619922186e-06, + "loss": 0.5473, + "step": 22193 + }, + { + "epoch": 2.79, + "grad_norm": 25.055089950561523, + "learning_rate": 1.4307827469355312e-06, + "loss": 1.14, + "step": 22194 + }, + { + "epoch": 2.79, + "grad_norm": 13.183613777160645, + "learning_rate": 1.4299460318788438e-06, + "loss": 0.6271, + "step": 22195 + }, + { + "epoch": 2.79, + "grad_norm": 4.73961067199707, + "learning_rate": 1.4291093168221561e-06, + "loss": 0.0372, + "step": 22196 + }, + { + "epoch": 2.79, + "grad_norm": 13.73011302947998, + "learning_rate": 1.4282726017654687e-06, + "loss": 0.1936, + "step": 22197 + }, + { + "epoch": 2.79, + "grad_norm": 22.286022186279297, + "learning_rate": 1.4274358867087815e-06, + "loss": 1.1696, + "step": 22198 + }, + { + "epoch": 2.79, + "grad_norm": 31.08314323425293, + "learning_rate": 1.4265991716520941e-06, + "loss": 1.0551, + "step": 22199 + }, + { + "epoch": 2.79, + "grad_norm": 18.57979965209961, + "learning_rate": 1.4257624565954065e-06, + "loss": 1.5874, + "step": 22200 + }, + { + "epoch": 2.79, + "grad_norm": 45.38309097290039, + "learning_rate": 1.424925741538719e-06, + "loss": 1.421, + "step": 22201 + }, + { + "epoch": 2.79, + "grad_norm": 11.388998985290527, + "learning_rate": 1.4240890264820317e-06, + "loss": 0.9901, + "step": 22202 + }, + { + "epoch": 2.79, + "grad_norm": 11.183924674987793, + "learning_rate": 1.423252311425344e-06, + "loss": 0.45, + "step": 22203 + }, + { + "epoch": 2.79, + "grad_norm": 10.512621879577637, + "learning_rate": 1.4224155963686567e-06, + "loss": 0.6383, + "step": 22204 + }, + { + "epoch": 2.79, + "grad_norm": 20.923755645751953, + "learning_rate": 1.4215788813119695e-06, + "loss": 1.6652, + "step": 22205 + }, + { + "epoch": 2.79, + "grad_norm": 22.1677188873291, + "learning_rate": 1.420742166255282e-06, + "loss": 2.2149, + "step": 22206 + }, + { + "epoch": 2.79, + "grad_norm": 16.348461151123047, + "learning_rate": 1.4199054511985944e-06, + "loss": 0.9331, + "step": 22207 + }, + { + "epoch": 2.79, + "grad_norm": 13.549220085144043, + "learning_rate": 1.419068736141907e-06, + "loss": 0.3617, + "step": 22208 + }, + { + "epoch": 2.79, + "grad_norm": 32.703369140625, + "learning_rate": 1.4182320210852196e-06, + "loss": 2.7096, + "step": 22209 + }, + { + "epoch": 2.79, + "grad_norm": 11.458334922790527, + "learning_rate": 1.417395306028532e-06, + "loss": 0.5415, + "step": 22210 + }, + { + "epoch": 2.79, + "grad_norm": 31.576841354370117, + "learning_rate": 1.4165585909718446e-06, + "loss": 0.7802, + "step": 22211 + }, + { + "epoch": 2.79, + "grad_norm": 16.505142211914062, + "learning_rate": 1.4157218759151572e-06, + "loss": 1.3228, + "step": 22212 + }, + { + "epoch": 2.79, + "grad_norm": 10.127742767333984, + "learning_rate": 1.41488516085847e-06, + "loss": 0.22, + "step": 22213 + }, + { + "epoch": 2.79, + "grad_norm": 88.93213653564453, + "learning_rate": 1.4140484458017823e-06, + "loss": 1.3596, + "step": 22214 + }, + { + "epoch": 2.79, + "grad_norm": 16.036808013916016, + "learning_rate": 1.413211730745095e-06, + "loss": 1.5984, + "step": 22215 + }, + { + "epoch": 2.79, + "grad_norm": 81.969970703125, + "learning_rate": 1.4123750156884075e-06, + "loss": 1.7192, + "step": 22216 + }, + { + "epoch": 2.79, + "grad_norm": 9.316056251525879, + "learning_rate": 1.41153830063172e-06, + "loss": 0.2607, + "step": 22217 + }, + { + "epoch": 2.79, + "grad_norm": 9.6272554397583, + "learning_rate": 1.4107015855750325e-06, + "loss": 1.7587, + "step": 22218 + }, + { + "epoch": 2.79, + "grad_norm": 7.52526330947876, + "learning_rate": 1.409864870518345e-06, + "loss": 0.7524, + "step": 22219 + }, + { + "epoch": 2.79, + "grad_norm": 48.97482681274414, + "learning_rate": 1.4090281554616579e-06, + "loss": 2.1355, + "step": 22220 + }, + { + "epoch": 2.79, + "grad_norm": 15.028366088867188, + "learning_rate": 1.40819144040497e-06, + "loss": 0.4171, + "step": 22221 + }, + { + "epoch": 2.79, + "grad_norm": 14.43149185180664, + "learning_rate": 1.4073547253482828e-06, + "loss": 0.8771, + "step": 22222 + }, + { + "epoch": 2.79, + "grad_norm": 10.106249809265137, + "learning_rate": 1.4065180102915954e-06, + "loss": 0.3454, + "step": 22223 + }, + { + "epoch": 2.79, + "grad_norm": 14.374917984008789, + "learning_rate": 1.4056812952349078e-06, + "loss": 0.7194, + "step": 22224 + }, + { + "epoch": 2.79, + "grad_norm": 19.454383850097656, + "learning_rate": 1.4048445801782204e-06, + "loss": 0.8213, + "step": 22225 + }, + { + "epoch": 2.79, + "grad_norm": 10.458917617797852, + "learning_rate": 1.404007865121533e-06, + "loss": 0.3617, + "step": 22226 + }, + { + "epoch": 2.79, + "grad_norm": 11.162657737731934, + "learning_rate": 1.4031711500648456e-06, + "loss": 0.8042, + "step": 22227 + }, + { + "epoch": 2.79, + "grad_norm": 19.725736618041992, + "learning_rate": 1.402334435008158e-06, + "loss": 0.8581, + "step": 22228 + }, + { + "epoch": 2.79, + "grad_norm": 9.259628295898438, + "learning_rate": 1.4014977199514706e-06, + "loss": 0.5202, + "step": 22229 + }, + { + "epoch": 2.79, + "grad_norm": 13.855989456176758, + "learning_rate": 1.4006610048947834e-06, + "loss": 0.539, + "step": 22230 + }, + { + "epoch": 2.79, + "grad_norm": 5.412811756134033, + "learning_rate": 1.3998242898380957e-06, + "loss": 0.2478, + "step": 22231 + }, + { + "epoch": 2.79, + "grad_norm": 12.529213905334473, + "learning_rate": 1.3989875747814083e-06, + "loss": 1.2361, + "step": 22232 + }, + { + "epoch": 2.79, + "grad_norm": 16.189722061157227, + "learning_rate": 1.398150859724721e-06, + "loss": 0.3163, + "step": 22233 + }, + { + "epoch": 2.79, + "grad_norm": 25.4317626953125, + "learning_rate": 1.3973141446680333e-06, + "loss": 2.3149, + "step": 22234 + }, + { + "epoch": 2.79, + "grad_norm": 5.911479473114014, + "learning_rate": 1.3964774296113459e-06, + "loss": 0.718, + "step": 22235 + }, + { + "epoch": 2.79, + "grad_norm": 14.407179832458496, + "learning_rate": 1.3956407145546585e-06, + "loss": 0.4233, + "step": 22236 + }, + { + "epoch": 2.79, + "grad_norm": 152.7520294189453, + "learning_rate": 1.3948039994979713e-06, + "loss": 3.3073, + "step": 22237 + }, + { + "epoch": 2.79, + "grad_norm": 26.118106842041016, + "learning_rate": 1.3939672844412834e-06, + "loss": 1.4666, + "step": 22238 + }, + { + "epoch": 2.79, + "grad_norm": 7.672816276550293, + "learning_rate": 1.3931305693845962e-06, + "loss": 0.3535, + "step": 22239 + }, + { + "epoch": 2.79, + "grad_norm": 10.650636672973633, + "learning_rate": 1.3922938543279088e-06, + "loss": 0.7928, + "step": 22240 + }, + { + "epoch": 2.79, + "grad_norm": 9.059195518493652, + "learning_rate": 1.3914571392712212e-06, + "loss": 0.8067, + "step": 22241 + }, + { + "epoch": 2.79, + "grad_norm": 7.795120716094971, + "learning_rate": 1.3906204242145338e-06, + "loss": 0.4376, + "step": 22242 + }, + { + "epoch": 2.79, + "grad_norm": 16.20629119873047, + "learning_rate": 1.3897837091578464e-06, + "loss": 0.5432, + "step": 22243 + }, + { + "epoch": 2.79, + "grad_norm": 8.30618667602539, + "learning_rate": 1.388946994101159e-06, + "loss": 0.5956, + "step": 22244 + }, + { + "epoch": 2.79, + "grad_norm": 4.620741844177246, + "learning_rate": 1.3881102790444714e-06, + "loss": 0.2403, + "step": 22245 + }, + { + "epoch": 2.79, + "grad_norm": 90.4376220703125, + "learning_rate": 1.3872735639877842e-06, + "loss": 0.9716, + "step": 22246 + }, + { + "epoch": 2.79, + "grad_norm": 2.850315809249878, + "learning_rate": 1.3864368489310968e-06, + "loss": 0.3054, + "step": 22247 + }, + { + "epoch": 2.79, + "grad_norm": 8.010313034057617, + "learning_rate": 1.3856001338744091e-06, + "loss": 0.2131, + "step": 22248 + }, + { + "epoch": 2.79, + "grad_norm": 9.284424781799316, + "learning_rate": 1.3847634188177217e-06, + "loss": 0.2982, + "step": 22249 + }, + { + "epoch": 2.79, + "grad_norm": 20.03277015686035, + "learning_rate": 1.3839267037610343e-06, + "loss": 1.3185, + "step": 22250 + }, + { + "epoch": 2.79, + "grad_norm": 24.733230590820312, + "learning_rate": 1.383089988704347e-06, + "loss": 0.5493, + "step": 22251 + }, + { + "epoch": 2.79, + "grad_norm": 13.268937110900879, + "learning_rate": 1.3822532736476593e-06, + "loss": 1.1436, + "step": 22252 + }, + { + "epoch": 2.79, + "grad_norm": 14.504895210266113, + "learning_rate": 1.3814165585909719e-06, + "loss": 0.6736, + "step": 22253 + }, + { + "epoch": 2.79, + "grad_norm": 7.637774467468262, + "learning_rate": 1.3805798435342847e-06, + "loss": 0.4366, + "step": 22254 + }, + { + "epoch": 2.79, + "grad_norm": 50.78046798706055, + "learning_rate": 1.379743128477597e-06, + "loss": 0.9827, + "step": 22255 + }, + { + "epoch": 2.79, + "grad_norm": 25.723501205444336, + "learning_rate": 1.3789064134209096e-06, + "loss": 0.8503, + "step": 22256 + }, + { + "epoch": 2.79, + "grad_norm": 15.402575492858887, + "learning_rate": 1.3780696983642222e-06, + "loss": 0.5884, + "step": 22257 + }, + { + "epoch": 2.79, + "grad_norm": 23.20662498474121, + "learning_rate": 1.3772329833075348e-06, + "loss": 2.1696, + "step": 22258 + }, + { + "epoch": 2.79, + "grad_norm": 12.273051261901855, + "learning_rate": 1.3763962682508472e-06, + "loss": 2.4901, + "step": 22259 + }, + { + "epoch": 2.79, + "grad_norm": 24.371145248413086, + "learning_rate": 1.3755595531941598e-06, + "loss": 1.1444, + "step": 22260 + }, + { + "epoch": 2.79, + "grad_norm": 6.011651039123535, + "learning_rate": 1.3747228381374724e-06, + "loss": 0.4818, + "step": 22261 + }, + { + "epoch": 2.79, + "grad_norm": 16.847021102905273, + "learning_rate": 1.3738861230807848e-06, + "loss": 0.4117, + "step": 22262 + }, + { + "epoch": 2.79, + "grad_norm": 6.79587984085083, + "learning_rate": 1.3730494080240976e-06, + "loss": 0.7666, + "step": 22263 + }, + { + "epoch": 2.79, + "grad_norm": 26.817100524902344, + "learning_rate": 1.3722126929674101e-06, + "loss": 1.3122, + "step": 22264 + }, + { + "epoch": 2.79, + "grad_norm": 23.336999893188477, + "learning_rate": 1.3713759779107227e-06, + "loss": 2.0669, + "step": 22265 + }, + { + "epoch": 2.79, + "grad_norm": 7.358304500579834, + "learning_rate": 1.3705392628540351e-06, + "loss": 0.3368, + "step": 22266 + }, + { + "epoch": 2.79, + "grad_norm": 26.694063186645508, + "learning_rate": 1.3697025477973477e-06, + "loss": 0.399, + "step": 22267 + }, + { + "epoch": 2.79, + "grad_norm": 23.938920974731445, + "learning_rate": 1.3688658327406603e-06, + "loss": 1.0198, + "step": 22268 + }, + { + "epoch": 2.79, + "grad_norm": 21.425411224365234, + "learning_rate": 1.3680291176839727e-06, + "loss": 0.7446, + "step": 22269 + }, + { + "epoch": 2.79, + "grad_norm": 11.50066089630127, + "learning_rate": 1.3671924026272853e-06, + "loss": 0.9048, + "step": 22270 + }, + { + "epoch": 2.79, + "grad_norm": 9.022086143493652, + "learning_rate": 1.366355687570598e-06, + "loss": 0.205, + "step": 22271 + }, + { + "epoch": 2.8, + "grad_norm": 113.79344940185547, + "learning_rate": 1.3655189725139107e-06, + "loss": 2.931, + "step": 22272 + }, + { + "epoch": 2.8, + "grad_norm": 12.153200149536133, + "learning_rate": 1.364682257457223e-06, + "loss": 0.7875, + "step": 22273 + }, + { + "epoch": 2.8, + "grad_norm": 30.668458938598633, + "learning_rate": 1.3638455424005356e-06, + "loss": 0.8056, + "step": 22274 + }, + { + "epoch": 2.8, + "grad_norm": 39.18879318237305, + "learning_rate": 1.3630088273438482e-06, + "loss": 1.0483, + "step": 22275 + }, + { + "epoch": 2.8, + "grad_norm": 11.61953353881836, + "learning_rate": 1.3621721122871606e-06, + "loss": 0.9745, + "step": 22276 + }, + { + "epoch": 2.8, + "grad_norm": 10.363846778869629, + "learning_rate": 1.3613353972304732e-06, + "loss": 0.2994, + "step": 22277 + }, + { + "epoch": 2.8, + "grad_norm": 10.151551246643066, + "learning_rate": 1.360498682173786e-06, + "loss": 0.571, + "step": 22278 + }, + { + "epoch": 2.8, + "grad_norm": 17.394407272338867, + "learning_rate": 1.3596619671170982e-06, + "loss": 1.6727, + "step": 22279 + }, + { + "epoch": 2.8, + "grad_norm": 19.183019638061523, + "learning_rate": 1.358825252060411e-06, + "loss": 1.3546, + "step": 22280 + }, + { + "epoch": 2.8, + "grad_norm": 12.273883819580078, + "learning_rate": 1.3579885370037235e-06, + "loss": 1.5346, + "step": 22281 + }, + { + "epoch": 2.8, + "grad_norm": 11.285651206970215, + "learning_rate": 1.3571518219470361e-06, + "loss": 2.372, + "step": 22282 + }, + { + "epoch": 2.8, + "grad_norm": 27.698572158813477, + "learning_rate": 1.3563151068903485e-06, + "loss": 2.4062, + "step": 22283 + }, + { + "epoch": 2.8, + "grad_norm": 18.60379409790039, + "learning_rate": 1.355478391833661e-06, + "loss": 0.8674, + "step": 22284 + }, + { + "epoch": 2.8, + "grad_norm": 23.93038558959961, + "learning_rate": 1.3546416767769737e-06, + "loss": 1.1512, + "step": 22285 + }, + { + "epoch": 2.8, + "grad_norm": 13.560357093811035, + "learning_rate": 1.353804961720286e-06, + "loss": 1.3127, + "step": 22286 + }, + { + "epoch": 2.8, + "grad_norm": 41.231143951416016, + "learning_rate": 1.3529682466635989e-06, + "loss": 0.8257, + "step": 22287 + }, + { + "epoch": 2.8, + "grad_norm": 6.1269731521606445, + "learning_rate": 1.3521315316069115e-06, + "loss": 0.295, + "step": 22288 + }, + { + "epoch": 2.8, + "grad_norm": 38.05921936035156, + "learning_rate": 1.351294816550224e-06, + "loss": 0.5179, + "step": 22289 + }, + { + "epoch": 2.8, + "grad_norm": 7.305657863616943, + "learning_rate": 1.3504581014935364e-06, + "loss": 0.1885, + "step": 22290 + }, + { + "epoch": 2.8, + "grad_norm": 58.65455627441406, + "learning_rate": 1.349621386436849e-06, + "loss": 0.8713, + "step": 22291 + }, + { + "epoch": 2.8, + "grad_norm": 7.845381259918213, + "learning_rate": 1.3487846713801616e-06, + "loss": 0.4956, + "step": 22292 + }, + { + "epoch": 2.8, + "grad_norm": 6.775365352630615, + "learning_rate": 1.347947956323474e-06, + "loss": 0.5024, + "step": 22293 + }, + { + "epoch": 2.8, + "grad_norm": 28.21062469482422, + "learning_rate": 1.3471112412667866e-06, + "loss": 1.7029, + "step": 22294 + }, + { + "epoch": 2.8, + "grad_norm": 22.696487426757812, + "learning_rate": 1.3462745262100994e-06, + "loss": 1.1901, + "step": 22295 + }, + { + "epoch": 2.8, + "grad_norm": 8.468282699584961, + "learning_rate": 1.345437811153412e-06, + "loss": 1.3211, + "step": 22296 + }, + { + "epoch": 2.8, + "grad_norm": 8.390238761901855, + "learning_rate": 1.3446010960967243e-06, + "loss": 0.5668, + "step": 22297 + }, + { + "epoch": 2.8, + "grad_norm": 10.155052185058594, + "learning_rate": 1.343764381040037e-06, + "loss": 0.3998, + "step": 22298 + }, + { + "epoch": 2.8, + "grad_norm": 22.792644500732422, + "learning_rate": 1.3429276659833495e-06, + "loss": 0.4127, + "step": 22299 + }, + { + "epoch": 2.8, + "grad_norm": 6.965851783752441, + "learning_rate": 1.342090950926662e-06, + "loss": 0.6262, + "step": 22300 + }, + { + "epoch": 2.8, + "grad_norm": 28.035646438598633, + "learning_rate": 1.3412542358699745e-06, + "loss": 0.9244, + "step": 22301 + }, + { + "epoch": 2.8, + "grad_norm": 17.093761444091797, + "learning_rate": 1.340417520813287e-06, + "loss": 0.9921, + "step": 22302 + }, + { + "epoch": 2.8, + "grad_norm": 42.652687072753906, + "learning_rate": 1.3395808057565999e-06, + "loss": 0.8933, + "step": 22303 + }, + { + "epoch": 2.8, + "grad_norm": 8.79464054107666, + "learning_rate": 1.3387440906999123e-06, + "loss": 0.3885, + "step": 22304 + }, + { + "epoch": 2.8, + "grad_norm": 7.84977912902832, + "learning_rate": 1.3379073756432249e-06, + "loss": 0.4833, + "step": 22305 + }, + { + "epoch": 2.8, + "grad_norm": 8.520516395568848, + "learning_rate": 1.3370706605865374e-06, + "loss": 1.4362, + "step": 22306 + }, + { + "epoch": 2.8, + "grad_norm": 9.85045337677002, + "learning_rate": 1.3362339455298498e-06, + "loss": 1.3511, + "step": 22307 + }, + { + "epoch": 2.8, + "grad_norm": 9.924347877502441, + "learning_rate": 1.3353972304731624e-06, + "loss": 0.9138, + "step": 22308 + }, + { + "epoch": 2.8, + "grad_norm": 64.50171661376953, + "learning_rate": 1.334560515416475e-06, + "loss": 1.7466, + "step": 22309 + }, + { + "epoch": 2.8, + "grad_norm": 69.96508026123047, + "learning_rate": 1.3337238003597878e-06, + "loss": 1.3254, + "step": 22310 + }, + { + "epoch": 2.8, + "grad_norm": 17.21268081665039, + "learning_rate": 1.3328870853031e-06, + "loss": 1.1874, + "step": 22311 + }, + { + "epoch": 2.8, + "grad_norm": 20.13076400756836, + "learning_rate": 1.3320503702464128e-06, + "loss": 1.0865, + "step": 22312 + }, + { + "epoch": 2.8, + "grad_norm": 3.721005916595459, + "learning_rate": 1.3312136551897254e-06, + "loss": 0.1926, + "step": 22313 + }, + { + "epoch": 2.8, + "grad_norm": 14.235478401184082, + "learning_rate": 1.3303769401330377e-06, + "loss": 0.7383, + "step": 22314 + }, + { + "epoch": 2.8, + "grad_norm": 14.641931533813477, + "learning_rate": 1.3295402250763503e-06, + "loss": 1.0961, + "step": 22315 + }, + { + "epoch": 2.8, + "grad_norm": 12.710472106933594, + "learning_rate": 1.328703510019663e-06, + "loss": 0.7244, + "step": 22316 + }, + { + "epoch": 2.8, + "grad_norm": 19.97145652770996, + "learning_rate": 1.3278667949629755e-06, + "loss": 1.1615, + "step": 22317 + }, + { + "epoch": 2.8, + "grad_norm": 8.688436508178711, + "learning_rate": 1.3270300799062879e-06, + "loss": 1.3011, + "step": 22318 + }, + { + "epoch": 2.8, + "grad_norm": 6.827741622924805, + "learning_rate": 1.3261933648496007e-06, + "loss": 0.4903, + "step": 22319 + }, + { + "epoch": 2.8, + "grad_norm": 10.897211074829102, + "learning_rate": 1.3253566497929133e-06, + "loss": 0.5886, + "step": 22320 + }, + { + "epoch": 2.8, + "grad_norm": 101.98963928222656, + "learning_rate": 1.3245199347362257e-06, + "loss": 2.3322, + "step": 22321 + }, + { + "epoch": 2.8, + "grad_norm": 51.10252380371094, + "learning_rate": 1.3236832196795383e-06, + "loss": 0.8964, + "step": 22322 + }, + { + "epoch": 2.8, + "grad_norm": 11.098581314086914, + "learning_rate": 1.3228465046228508e-06, + "loss": 0.7318, + "step": 22323 + }, + { + "epoch": 2.8, + "grad_norm": 63.52899169921875, + "learning_rate": 1.3220097895661632e-06, + "loss": 1.7013, + "step": 22324 + }, + { + "epoch": 2.8, + "grad_norm": 7.581141471862793, + "learning_rate": 1.3211730745094758e-06, + "loss": 1.4585, + "step": 22325 + }, + { + "epoch": 2.8, + "grad_norm": 19.211050033569336, + "learning_rate": 1.3203363594527884e-06, + "loss": 1.2471, + "step": 22326 + }, + { + "epoch": 2.8, + "grad_norm": 6.085424900054932, + "learning_rate": 1.3194996443961012e-06, + "loss": 0.6361, + "step": 22327 + }, + { + "epoch": 2.8, + "grad_norm": 9.494768142700195, + "learning_rate": 1.3186629293394136e-06, + "loss": 1.0115, + "step": 22328 + }, + { + "epoch": 2.8, + "grad_norm": 12.25251293182373, + "learning_rate": 1.3178262142827262e-06, + "loss": 0.277, + "step": 22329 + }, + { + "epoch": 2.8, + "grad_norm": 10.640769004821777, + "learning_rate": 1.3169894992260388e-06, + "loss": 1.3467, + "step": 22330 + }, + { + "epoch": 2.8, + "grad_norm": 349.9528503417969, + "learning_rate": 1.3161527841693511e-06, + "loss": 1.6585, + "step": 22331 + }, + { + "epoch": 2.8, + "grad_norm": 32.06199645996094, + "learning_rate": 1.3153160691126637e-06, + "loss": 2.6387, + "step": 22332 + }, + { + "epoch": 2.8, + "grad_norm": 16.242982864379883, + "learning_rate": 1.3144793540559763e-06, + "loss": 0.5804, + "step": 22333 + }, + { + "epoch": 2.8, + "grad_norm": 7.629415035247803, + "learning_rate": 1.313642638999289e-06, + "loss": 0.6288, + "step": 22334 + }, + { + "epoch": 2.8, + "grad_norm": 26.868345260620117, + "learning_rate": 1.3128059239426013e-06, + "loss": 1.5236, + "step": 22335 + }, + { + "epoch": 2.8, + "grad_norm": 7.834210395812988, + "learning_rate": 1.311969208885914e-06, + "loss": 0.3088, + "step": 22336 + }, + { + "epoch": 2.8, + "grad_norm": 8.49197769165039, + "learning_rate": 1.3111324938292267e-06, + "loss": 0.5462, + "step": 22337 + }, + { + "epoch": 2.8, + "grad_norm": 15.133360862731934, + "learning_rate": 1.310295778772539e-06, + "loss": 2.2611, + "step": 22338 + }, + { + "epoch": 2.8, + "grad_norm": 3.2791247367858887, + "learning_rate": 1.3094590637158516e-06, + "loss": 0.2174, + "step": 22339 + }, + { + "epoch": 2.8, + "grad_norm": 2.5591752529144287, + "learning_rate": 1.3086223486591642e-06, + "loss": 0.1529, + "step": 22340 + }, + { + "epoch": 2.8, + "grad_norm": 24.870407104492188, + "learning_rate": 1.3077856336024768e-06, + "loss": 0.9089, + "step": 22341 + }, + { + "epoch": 2.8, + "grad_norm": 20.577219009399414, + "learning_rate": 1.3069489185457892e-06, + "loss": 1.445, + "step": 22342 + }, + { + "epoch": 2.8, + "grad_norm": 17.739303588867188, + "learning_rate": 1.3061122034891018e-06, + "loss": 0.8651, + "step": 22343 + }, + { + "epoch": 2.8, + "grad_norm": 30.139007568359375, + "learning_rate": 1.3052754884324146e-06, + "loss": 0.9499, + "step": 22344 + }, + { + "epoch": 2.8, + "grad_norm": 19.0602970123291, + "learning_rate": 1.304438773375727e-06, + "loss": 1.2218, + "step": 22345 + }, + { + "epoch": 2.8, + "grad_norm": 19.527162551879883, + "learning_rate": 1.3036020583190396e-06, + "loss": 0.6184, + "step": 22346 + }, + { + "epoch": 2.8, + "grad_norm": 7.0958099365234375, + "learning_rate": 1.3027653432623522e-06, + "loss": 0.2741, + "step": 22347 + }, + { + "epoch": 2.8, + "grad_norm": 4.984929084777832, + "learning_rate": 1.3019286282056647e-06, + "loss": 0.2121, + "step": 22348 + }, + { + "epoch": 2.8, + "grad_norm": 15.76543140411377, + "learning_rate": 1.3010919131489771e-06, + "loss": 1.2545, + "step": 22349 + }, + { + "epoch": 2.8, + "grad_norm": 4.139402389526367, + "learning_rate": 1.3002551980922897e-06, + "loss": 0.2187, + "step": 22350 + }, + { + "epoch": 2.81, + "grad_norm": 8.716383934020996, + "learning_rate": 1.2994184830356025e-06, + "loss": 1.4159, + "step": 22351 + }, + { + "epoch": 2.81, + "grad_norm": 91.85513305664062, + "learning_rate": 1.2985817679789147e-06, + "loss": 2.825, + "step": 22352 + }, + { + "epoch": 2.81, + "grad_norm": 16.891550064086914, + "learning_rate": 1.2977450529222275e-06, + "loss": 0.548, + "step": 22353 + }, + { + "epoch": 2.81, + "grad_norm": 27.4136962890625, + "learning_rate": 1.29690833786554e-06, + "loss": 0.7632, + "step": 22354 + }, + { + "epoch": 2.81, + "grad_norm": 15.752585411071777, + "learning_rate": 1.2960716228088527e-06, + "loss": 1.5803, + "step": 22355 + }, + { + "epoch": 2.81, + "grad_norm": 5.520205974578857, + "learning_rate": 1.295234907752165e-06, + "loss": 0.3901, + "step": 22356 + }, + { + "epoch": 2.81, + "grad_norm": 35.271385192871094, + "learning_rate": 1.2943981926954776e-06, + "loss": 0.5498, + "step": 22357 + }, + { + "epoch": 2.81, + "grad_norm": 11.174712181091309, + "learning_rate": 1.2935614776387902e-06, + "loss": 0.938, + "step": 22358 + }, + { + "epoch": 2.81, + "grad_norm": 6.589173793792725, + "learning_rate": 1.2927247625821026e-06, + "loss": 0.5047, + "step": 22359 + }, + { + "epoch": 2.81, + "grad_norm": 68.62174224853516, + "learning_rate": 1.2918880475254154e-06, + "loss": 0.5637, + "step": 22360 + }, + { + "epoch": 2.81, + "grad_norm": 9.69046401977539, + "learning_rate": 1.291051332468728e-06, + "loss": 0.5623, + "step": 22361 + }, + { + "epoch": 2.81, + "grad_norm": 34.01222610473633, + "learning_rate": 1.2902146174120406e-06, + "loss": 0.5374, + "step": 22362 + }, + { + "epoch": 2.81, + "grad_norm": 3.726040840148926, + "learning_rate": 1.289377902355353e-06, + "loss": 0.339, + "step": 22363 + }, + { + "epoch": 2.81, + "grad_norm": 29.50176239013672, + "learning_rate": 1.2885411872986656e-06, + "loss": 2.0108, + "step": 22364 + }, + { + "epoch": 2.81, + "grad_norm": 16.313310623168945, + "learning_rate": 1.2877044722419781e-06, + "loss": 0.6736, + "step": 22365 + }, + { + "epoch": 2.81, + "grad_norm": 7.732245445251465, + "learning_rate": 1.2868677571852905e-06, + "loss": 1.1685, + "step": 22366 + }, + { + "epoch": 2.81, + "grad_norm": 14.34288501739502, + "learning_rate": 1.2860310421286031e-06, + "loss": 0.5316, + "step": 22367 + }, + { + "epoch": 2.81, + "grad_norm": 20.638566970825195, + "learning_rate": 1.285194327071916e-06, + "loss": 1.3243, + "step": 22368 + }, + { + "epoch": 2.81, + "grad_norm": 13.029252052307129, + "learning_rate": 1.284357612015228e-06, + "loss": 1.3098, + "step": 22369 + }, + { + "epoch": 2.81, + "grad_norm": 19.74635887145996, + "learning_rate": 1.2835208969585409e-06, + "loss": 1.8711, + "step": 22370 + }, + { + "epoch": 2.81, + "grad_norm": 22.1148738861084, + "learning_rate": 1.2826841819018535e-06, + "loss": 0.9669, + "step": 22371 + }, + { + "epoch": 2.81, + "grad_norm": 17.366498947143555, + "learning_rate": 1.281847466845166e-06, + "loss": 0.7144, + "step": 22372 + }, + { + "epoch": 2.81, + "grad_norm": 9.708720207214355, + "learning_rate": 1.2810107517884784e-06, + "loss": 0.272, + "step": 22373 + }, + { + "epoch": 2.81, + "grad_norm": 14.64260482788086, + "learning_rate": 1.280174036731791e-06, + "loss": 0.9609, + "step": 22374 + }, + { + "epoch": 2.81, + "grad_norm": 38.372615814208984, + "learning_rate": 1.2793373216751036e-06, + "loss": 1.3878, + "step": 22375 + }, + { + "epoch": 2.81, + "grad_norm": 6.752889633178711, + "learning_rate": 1.278500606618416e-06, + "loss": 0.0768, + "step": 22376 + }, + { + "epoch": 2.81, + "grad_norm": 64.34168243408203, + "learning_rate": 1.2776638915617288e-06, + "loss": 0.9934, + "step": 22377 + }, + { + "epoch": 2.81, + "grad_norm": 8.978327751159668, + "learning_rate": 1.2768271765050414e-06, + "loss": 2.2121, + "step": 22378 + }, + { + "epoch": 2.81, + "grad_norm": 80.38655853271484, + "learning_rate": 1.275990461448354e-06, + "loss": 1.3939, + "step": 22379 + }, + { + "epoch": 2.81, + "grad_norm": 7.601134300231934, + "learning_rate": 1.2751537463916664e-06, + "loss": 0.8712, + "step": 22380 + }, + { + "epoch": 2.81, + "grad_norm": 12.885122299194336, + "learning_rate": 1.274317031334979e-06, + "loss": 0.5965, + "step": 22381 + }, + { + "epoch": 2.81, + "grad_norm": 60.07326126098633, + "learning_rate": 1.2734803162782915e-06, + "loss": 1.6777, + "step": 22382 + }, + { + "epoch": 2.81, + "grad_norm": 33.034019470214844, + "learning_rate": 1.272643601221604e-06, + "loss": 0.5645, + "step": 22383 + }, + { + "epoch": 2.81, + "grad_norm": 8.753458023071289, + "learning_rate": 1.2718068861649165e-06, + "loss": 0.2947, + "step": 22384 + }, + { + "epoch": 2.81, + "grad_norm": 7.3992156982421875, + "learning_rate": 1.2709701711082293e-06, + "loss": 0.2016, + "step": 22385 + }, + { + "epoch": 2.81, + "grad_norm": 31.3106689453125, + "learning_rate": 1.270133456051542e-06, + "loss": 1.7178, + "step": 22386 + }, + { + "epoch": 2.81, + "grad_norm": 17.78363037109375, + "learning_rate": 1.2692967409948543e-06, + "loss": 0.6034, + "step": 22387 + }, + { + "epoch": 2.81, + "grad_norm": 11.05273151397705, + "learning_rate": 1.2684600259381669e-06, + "loss": 1.2865, + "step": 22388 + }, + { + "epoch": 2.81, + "grad_norm": 6.84112024307251, + "learning_rate": 1.2676233108814795e-06, + "loss": 0.2924, + "step": 22389 + }, + { + "epoch": 2.81, + "grad_norm": 27.54936408996582, + "learning_rate": 1.2667865958247918e-06, + "loss": 1.6276, + "step": 22390 + }, + { + "epoch": 2.81, + "grad_norm": 40.96695327758789, + "learning_rate": 1.2659498807681044e-06, + "loss": 1.1847, + "step": 22391 + }, + { + "epoch": 2.81, + "grad_norm": 22.234130859375, + "learning_rate": 1.2651131657114172e-06, + "loss": 0.8053, + "step": 22392 + }, + { + "epoch": 2.81, + "grad_norm": 21.5286865234375, + "learning_rate": 1.2642764506547298e-06, + "loss": 2.3074, + "step": 22393 + }, + { + "epoch": 2.81, + "grad_norm": 23.00366973876953, + "learning_rate": 1.2634397355980422e-06, + "loss": 1.866, + "step": 22394 + }, + { + "epoch": 2.81, + "grad_norm": 11.963624954223633, + "learning_rate": 1.2626030205413548e-06, + "loss": 0.525, + "step": 22395 + }, + { + "epoch": 2.81, + "grad_norm": 7.500345230102539, + "learning_rate": 1.2617663054846674e-06, + "loss": 0.2791, + "step": 22396 + }, + { + "epoch": 2.81, + "grad_norm": 40.571903228759766, + "learning_rate": 1.2609295904279798e-06, + "loss": 1.9009, + "step": 22397 + }, + { + "epoch": 2.81, + "grad_norm": 10.458895683288574, + "learning_rate": 1.2600928753712923e-06, + "loss": 0.7065, + "step": 22398 + }, + { + "epoch": 2.81, + "grad_norm": 21.273595809936523, + "learning_rate": 1.259256160314605e-06, + "loss": 0.6852, + "step": 22399 + }, + { + "epoch": 2.81, + "grad_norm": 9.218423843383789, + "learning_rate": 1.2584194452579177e-06, + "loss": 0.3528, + "step": 22400 + }, + { + "epoch": 2.81, + "eval_loss": 0.07806830108165741, + "eval_runtime": 96.4114, + "eval_samples_per_second": 36.738, + "eval_steps_per_second": 36.738, + "step": 22400 + }, + { + "epoch": 2.81, + "grad_norm": 23.908506393432617, + "learning_rate": 1.25758273020123e-06, + "loss": 1.2257, + "step": 22401 + }, + { + "epoch": 2.81, + "grad_norm": 4.9419846534729, + "learning_rate": 1.2567460151445427e-06, + "loss": 0.072, + "step": 22402 + }, + { + "epoch": 2.81, + "grad_norm": 19.66534423828125, + "learning_rate": 1.2559093000878553e-06, + "loss": 2.4977, + "step": 22403 + }, + { + "epoch": 2.81, + "grad_norm": 11.625102043151855, + "learning_rate": 1.2550725850311677e-06, + "loss": 0.7169, + "step": 22404 + }, + { + "epoch": 2.81, + "grad_norm": 11.85889720916748, + "learning_rate": 1.2542358699744803e-06, + "loss": 0.22, + "step": 22405 + }, + { + "epoch": 2.81, + "grad_norm": 22.91364288330078, + "learning_rate": 1.2533991549177929e-06, + "loss": 0.5825, + "step": 22406 + }, + { + "epoch": 2.81, + "grad_norm": 14.040718078613281, + "learning_rate": 1.2525624398611054e-06, + "loss": 0.7661, + "step": 22407 + }, + { + "epoch": 2.81, + "grad_norm": 14.2266263961792, + "learning_rate": 1.2517257248044178e-06, + "loss": 1.4066, + "step": 22408 + }, + { + "epoch": 2.81, + "grad_norm": 105.7550048828125, + "learning_rate": 1.2508890097477306e-06, + "loss": 2.0166, + "step": 22409 + }, + { + "epoch": 2.81, + "grad_norm": 18.059402465820312, + "learning_rate": 1.2500522946910432e-06, + "loss": 0.7641, + "step": 22410 + }, + { + "epoch": 2.81, + "grad_norm": 8.43260669708252, + "learning_rate": 1.2492155796343556e-06, + "loss": 0.362, + "step": 22411 + }, + { + "epoch": 2.81, + "grad_norm": 20.3315372467041, + "learning_rate": 1.2483788645776682e-06, + "loss": 1.883, + "step": 22412 + }, + { + "epoch": 2.81, + "grad_norm": 134.1221923828125, + "learning_rate": 1.2475421495209808e-06, + "loss": 0.3431, + "step": 22413 + }, + { + "epoch": 2.81, + "grad_norm": 24.616275787353516, + "learning_rate": 1.2467054344642934e-06, + "loss": 1.5184, + "step": 22414 + }, + { + "epoch": 2.81, + "grad_norm": 10.436813354492188, + "learning_rate": 1.2458687194076057e-06, + "loss": 0.7298, + "step": 22415 + }, + { + "epoch": 2.81, + "grad_norm": 14.31775188446045, + "learning_rate": 1.2450320043509183e-06, + "loss": 0.9045, + "step": 22416 + }, + { + "epoch": 2.81, + "grad_norm": 29.62679672241211, + "learning_rate": 1.244195289294231e-06, + "loss": 0.5825, + "step": 22417 + }, + { + "epoch": 2.81, + "grad_norm": 3.6407570838928223, + "learning_rate": 1.2433585742375435e-06, + "loss": 0.1671, + "step": 22418 + }, + { + "epoch": 2.81, + "grad_norm": 16.576555252075195, + "learning_rate": 1.242521859180856e-06, + "loss": 0.4805, + "step": 22419 + }, + { + "epoch": 2.81, + "grad_norm": 12.0615234375, + "learning_rate": 1.2416851441241687e-06, + "loss": 0.6741, + "step": 22420 + }, + { + "epoch": 2.81, + "grad_norm": 18.188894271850586, + "learning_rate": 1.2408484290674813e-06, + "loss": 0.4955, + "step": 22421 + }, + { + "epoch": 2.81, + "grad_norm": 10.029412269592285, + "learning_rate": 1.2400117140107937e-06, + "loss": 0.853, + "step": 22422 + }, + { + "epoch": 2.81, + "grad_norm": 3.855475902557373, + "learning_rate": 1.2391749989541062e-06, + "loss": 0.3004, + "step": 22423 + }, + { + "epoch": 2.81, + "grad_norm": 111.1025619506836, + "learning_rate": 1.2383382838974188e-06, + "loss": 2.0545, + "step": 22424 + }, + { + "epoch": 2.81, + "grad_norm": 58.22159194946289, + "learning_rate": 1.2375015688407314e-06, + "loss": 1.6608, + "step": 22425 + }, + { + "epoch": 2.81, + "grad_norm": 30.532649993896484, + "learning_rate": 1.236664853784044e-06, + "loss": 1.4307, + "step": 22426 + }, + { + "epoch": 2.81, + "grad_norm": 12.674906730651855, + "learning_rate": 1.2358281387273564e-06, + "loss": 0.8895, + "step": 22427 + }, + { + "epoch": 2.81, + "grad_norm": 30.858558654785156, + "learning_rate": 1.234991423670669e-06, + "loss": 0.9602, + "step": 22428 + }, + { + "epoch": 2.81, + "grad_norm": 95.74943542480469, + "learning_rate": 1.2341547086139816e-06, + "loss": 1.8953, + "step": 22429 + }, + { + "epoch": 2.81, + "grad_norm": 7.380533218383789, + "learning_rate": 1.2333179935572942e-06, + "loss": 2.7785, + "step": 22430 + }, + { + "epoch": 2.82, + "grad_norm": 3.821453809738159, + "learning_rate": 1.2324812785006068e-06, + "loss": 0.2935, + "step": 22431 + }, + { + "epoch": 2.82, + "grad_norm": 12.30856990814209, + "learning_rate": 1.2316445634439193e-06, + "loss": 1.5786, + "step": 22432 + }, + { + "epoch": 2.82, + "grad_norm": 8.218766212463379, + "learning_rate": 1.230807848387232e-06, + "loss": 0.0842, + "step": 22433 + }, + { + "epoch": 2.82, + "grad_norm": 9.76360034942627, + "learning_rate": 1.2299711333305443e-06, + "loss": 0.7282, + "step": 22434 + }, + { + "epoch": 2.82, + "grad_norm": 11.711906433105469, + "learning_rate": 1.229134418273857e-06, + "loss": 0.5736, + "step": 22435 + }, + { + "epoch": 2.82, + "grad_norm": 4.1550116539001465, + "learning_rate": 1.2282977032171695e-06, + "loss": 0.1218, + "step": 22436 + }, + { + "epoch": 2.82, + "grad_norm": 41.38620376586914, + "learning_rate": 1.227460988160482e-06, + "loss": 1.7576, + "step": 22437 + }, + { + "epoch": 2.82, + "grad_norm": 19.813682556152344, + "learning_rate": 1.2266242731037947e-06, + "loss": 0.6346, + "step": 22438 + }, + { + "epoch": 2.82, + "grad_norm": 5.613452434539795, + "learning_rate": 1.2257875580471073e-06, + "loss": 0.4866, + "step": 22439 + }, + { + "epoch": 2.82, + "grad_norm": 18.13245391845703, + "learning_rate": 1.2249508429904196e-06, + "loss": 1.4789, + "step": 22440 + }, + { + "epoch": 2.82, + "grad_norm": 11.131356239318848, + "learning_rate": 1.2241141279337322e-06, + "loss": 0.5981, + "step": 22441 + }, + { + "epoch": 2.82, + "grad_norm": 19.741561889648438, + "learning_rate": 1.2232774128770448e-06, + "loss": 0.8035, + "step": 22442 + }, + { + "epoch": 2.82, + "grad_norm": 1.2421314716339111, + "learning_rate": 1.2224406978203574e-06, + "loss": 0.0455, + "step": 22443 + }, + { + "epoch": 2.82, + "grad_norm": 3.521859884262085, + "learning_rate": 1.22160398276367e-06, + "loss": 0.0857, + "step": 22444 + }, + { + "epoch": 2.82, + "grad_norm": 91.45506286621094, + "learning_rate": 1.2207672677069824e-06, + "loss": 2.0198, + "step": 22445 + }, + { + "epoch": 2.82, + "grad_norm": 25.695064544677734, + "learning_rate": 1.219930552650295e-06, + "loss": 0.8741, + "step": 22446 + }, + { + "epoch": 2.82, + "grad_norm": 20.033838272094727, + "learning_rate": 1.2190938375936076e-06, + "loss": 0.6432, + "step": 22447 + }, + { + "epoch": 2.82, + "grad_norm": 7.642991065979004, + "learning_rate": 1.2182571225369202e-06, + "loss": 0.6744, + "step": 22448 + }, + { + "epoch": 2.82, + "grad_norm": 14.59307861328125, + "learning_rate": 1.2174204074802327e-06, + "loss": 2.1747, + "step": 22449 + }, + { + "epoch": 2.82, + "grad_norm": 2.3779585361480713, + "learning_rate": 1.2165836924235453e-06, + "loss": 0.069, + "step": 22450 + }, + { + "epoch": 2.82, + "grad_norm": 22.025114059448242, + "learning_rate": 1.215746977366858e-06, + "loss": 0.9735, + "step": 22451 + }, + { + "epoch": 2.82, + "grad_norm": 12.39184856414795, + "learning_rate": 1.2149102623101703e-06, + "loss": 1.1784, + "step": 22452 + }, + { + "epoch": 2.82, + "grad_norm": 21.25431251525879, + "learning_rate": 1.2140735472534829e-06, + "loss": 1.6685, + "step": 22453 + }, + { + "epoch": 2.82, + "grad_norm": 17.156089782714844, + "learning_rate": 1.2132368321967955e-06, + "loss": 0.6898, + "step": 22454 + }, + { + "epoch": 2.82, + "grad_norm": 14.869438171386719, + "learning_rate": 1.212400117140108e-06, + "loss": 0.7735, + "step": 22455 + }, + { + "epoch": 2.82, + "grad_norm": 16.35450553894043, + "learning_rate": 1.2115634020834207e-06, + "loss": 1.148, + "step": 22456 + }, + { + "epoch": 2.82, + "grad_norm": 5.551679611206055, + "learning_rate": 1.210726687026733e-06, + "loss": 0.331, + "step": 22457 + }, + { + "epoch": 2.82, + "grad_norm": 333.46478271484375, + "learning_rate": 1.2098899719700458e-06, + "loss": 0.785, + "step": 22458 + }, + { + "epoch": 2.82, + "grad_norm": 10.77422046661377, + "learning_rate": 1.2090532569133582e-06, + "loss": 0.238, + "step": 22459 + }, + { + "epoch": 2.82, + "grad_norm": 9.586767196655273, + "learning_rate": 1.2082165418566708e-06, + "loss": 1.4508, + "step": 22460 + }, + { + "epoch": 2.82, + "grad_norm": 11.997692108154297, + "learning_rate": 1.2073798267999834e-06, + "loss": 2.2217, + "step": 22461 + }, + { + "epoch": 2.82, + "grad_norm": 11.284348487854004, + "learning_rate": 1.206543111743296e-06, + "loss": 0.2131, + "step": 22462 + }, + { + "epoch": 2.82, + "grad_norm": 9.041314125061035, + "learning_rate": 1.2057063966866086e-06, + "loss": 1.3704, + "step": 22463 + }, + { + "epoch": 2.82, + "grad_norm": 95.8100814819336, + "learning_rate": 1.204869681629921e-06, + "loss": 0.6106, + "step": 22464 + }, + { + "epoch": 2.82, + "grad_norm": 9.691165924072266, + "learning_rate": 1.2040329665732338e-06, + "loss": 0.3612, + "step": 22465 + }, + { + "epoch": 2.82, + "grad_norm": 9.955235481262207, + "learning_rate": 1.2031962515165461e-06, + "loss": 0.5314, + "step": 22466 + }, + { + "epoch": 2.82, + "grad_norm": 26.02245330810547, + "learning_rate": 1.2023595364598587e-06, + "loss": 3.2644, + "step": 22467 + }, + { + "epoch": 2.82, + "grad_norm": 6.988763332366943, + "learning_rate": 1.2015228214031713e-06, + "loss": 0.9097, + "step": 22468 + }, + { + "epoch": 2.82, + "grad_norm": 6.152822494506836, + "learning_rate": 1.2006861063464837e-06, + "loss": 0.4453, + "step": 22469 + }, + { + "epoch": 2.82, + "grad_norm": 21.691246032714844, + "learning_rate": 1.1998493912897965e-06, + "loss": 0.8282, + "step": 22470 + }, + { + "epoch": 2.82, + "grad_norm": 7.050600051879883, + "learning_rate": 1.1990126762331089e-06, + "loss": 0.1907, + "step": 22471 + }, + { + "epoch": 2.82, + "grad_norm": 74.11097717285156, + "learning_rate": 1.1981759611764215e-06, + "loss": 0.5265, + "step": 22472 + }, + { + "epoch": 2.82, + "grad_norm": 9.140049934387207, + "learning_rate": 1.197339246119734e-06, + "loss": 0.6186, + "step": 22473 + }, + { + "epoch": 2.82, + "grad_norm": 16.83196449279785, + "learning_rate": 1.1965025310630464e-06, + "loss": 0.4343, + "step": 22474 + }, + { + "epoch": 2.82, + "grad_norm": 19.166473388671875, + "learning_rate": 1.1956658160063592e-06, + "loss": 1.0129, + "step": 22475 + }, + { + "epoch": 2.82, + "grad_norm": 27.66779327392578, + "learning_rate": 1.1948291009496716e-06, + "loss": 0.8765, + "step": 22476 + }, + { + "epoch": 2.82, + "grad_norm": 6.122854232788086, + "learning_rate": 1.1939923858929842e-06, + "loss": 0.2845, + "step": 22477 + }, + { + "epoch": 2.82, + "grad_norm": 13.45590877532959, + "learning_rate": 1.1931556708362968e-06, + "loss": 1.699, + "step": 22478 + }, + { + "epoch": 2.82, + "grad_norm": 19.627155303955078, + "learning_rate": 1.1923189557796094e-06, + "loss": 0.7282, + "step": 22479 + }, + { + "epoch": 2.82, + "grad_norm": 7.944036483764648, + "learning_rate": 1.191482240722922e-06, + "loss": 1.7327, + "step": 22480 + }, + { + "epoch": 2.82, + "grad_norm": 19.45163345336914, + "learning_rate": 1.1906455256662344e-06, + "loss": 0.7671, + "step": 22481 + }, + { + "epoch": 2.82, + "grad_norm": 9.033167839050293, + "learning_rate": 1.1898088106095472e-06, + "loss": 0.2838, + "step": 22482 + }, + { + "epoch": 2.82, + "grad_norm": 40.726802825927734, + "learning_rate": 1.1889720955528595e-06, + "loss": 1.4657, + "step": 22483 + }, + { + "epoch": 2.82, + "grad_norm": 7.455557823181152, + "learning_rate": 1.1881353804961721e-06, + "loss": 0.4454, + "step": 22484 + }, + { + "epoch": 2.82, + "grad_norm": 9.799588203430176, + "learning_rate": 1.1872986654394847e-06, + "loss": 1.0009, + "step": 22485 + }, + { + "epoch": 2.82, + "grad_norm": 11.131878852844238, + "learning_rate": 1.186461950382797e-06, + "loss": 1.3634, + "step": 22486 + }, + { + "epoch": 2.82, + "grad_norm": 72.7829818725586, + "learning_rate": 1.1856252353261099e-06, + "loss": 1.2031, + "step": 22487 + }, + { + "epoch": 2.82, + "grad_norm": 17.33318519592285, + "learning_rate": 1.1847885202694223e-06, + "loss": 0.3565, + "step": 22488 + }, + { + "epoch": 2.82, + "grad_norm": 7.278853416442871, + "learning_rate": 1.1839518052127349e-06, + "loss": 0.326, + "step": 22489 + }, + { + "epoch": 2.82, + "grad_norm": 16.62299346923828, + "learning_rate": 1.1831150901560474e-06, + "loss": 0.5297, + "step": 22490 + }, + { + "epoch": 2.82, + "grad_norm": 24.486923217773438, + "learning_rate": 1.18227837509936e-06, + "loss": 1.6779, + "step": 22491 + }, + { + "epoch": 2.82, + "grad_norm": 14.43045711517334, + "learning_rate": 1.1814416600426726e-06, + "loss": 1.7082, + "step": 22492 + }, + { + "epoch": 2.82, + "grad_norm": 18.77744483947754, + "learning_rate": 1.180604944985985e-06, + "loss": 0.8335, + "step": 22493 + }, + { + "epoch": 2.82, + "grad_norm": 8.909994125366211, + "learning_rate": 1.1797682299292978e-06, + "loss": 1.7434, + "step": 22494 + }, + { + "epoch": 2.82, + "grad_norm": 2.27986741065979, + "learning_rate": 1.1789315148726102e-06, + "loss": 0.1914, + "step": 22495 + }, + { + "epoch": 2.82, + "grad_norm": 11.619105339050293, + "learning_rate": 1.1780947998159228e-06, + "loss": 1.1619, + "step": 22496 + }, + { + "epoch": 2.82, + "grad_norm": 34.9727783203125, + "learning_rate": 1.1772580847592354e-06, + "loss": 1.296, + "step": 22497 + }, + { + "epoch": 2.82, + "grad_norm": 7.1914167404174805, + "learning_rate": 1.1764213697025477e-06, + "loss": 0.1431, + "step": 22498 + }, + { + "epoch": 2.82, + "grad_norm": 13.69640064239502, + "learning_rate": 1.1755846546458605e-06, + "loss": 0.7162, + "step": 22499 + }, + { + "epoch": 2.82, + "grad_norm": 28.79456329345703, + "learning_rate": 1.174747939589173e-06, + "loss": 1.167, + "step": 22500 + }, + { + "epoch": 2.82, + "grad_norm": 42.08561706542969, + "learning_rate": 1.1739112245324855e-06, + "loss": 1.3198, + "step": 22501 + }, + { + "epoch": 2.82, + "grad_norm": 20.690153121948242, + "learning_rate": 1.1730745094757981e-06, + "loss": 2.3716, + "step": 22502 + }, + { + "epoch": 2.82, + "grad_norm": 48.5506591796875, + "learning_rate": 1.1722377944191107e-06, + "loss": 1.4877, + "step": 22503 + }, + { + "epoch": 2.82, + "grad_norm": 17.76114845275879, + "learning_rate": 1.1714010793624233e-06, + "loss": 0.366, + "step": 22504 + }, + { + "epoch": 2.82, + "grad_norm": 20.296478271484375, + "learning_rate": 1.1705643643057357e-06, + "loss": 1.1978, + "step": 22505 + }, + { + "epoch": 2.82, + "grad_norm": 5.036375999450684, + "learning_rate": 1.1697276492490483e-06, + "loss": 0.1301, + "step": 22506 + }, + { + "epoch": 2.82, + "grad_norm": 31.5634822845459, + "learning_rate": 1.1688909341923608e-06, + "loss": 1.8502, + "step": 22507 + }, + { + "epoch": 2.82, + "grad_norm": 14.203628540039062, + "learning_rate": 1.1680542191356734e-06, + "loss": 0.7201, + "step": 22508 + }, + { + "epoch": 2.82, + "grad_norm": 25.401737213134766, + "learning_rate": 1.167217504078986e-06, + "loss": 1.0927, + "step": 22509 + }, + { + "epoch": 2.82, + "grad_norm": 26.5362606048584, + "learning_rate": 1.1663807890222986e-06, + "loss": 1.4557, + "step": 22510 + }, + { + "epoch": 2.83, + "grad_norm": 39.88875961303711, + "learning_rate": 1.1655440739656112e-06, + "loss": 4.3725, + "step": 22511 + }, + { + "epoch": 2.83, + "grad_norm": 11.070425033569336, + "learning_rate": 1.1647073589089236e-06, + "loss": 0.9716, + "step": 22512 + }, + { + "epoch": 2.83, + "grad_norm": 31.722532272338867, + "learning_rate": 1.1638706438522362e-06, + "loss": 1.8316, + "step": 22513 + }, + { + "epoch": 2.83, + "grad_norm": 28.786523818969727, + "learning_rate": 1.1630339287955488e-06, + "loss": 2.101, + "step": 22514 + }, + { + "epoch": 2.83, + "grad_norm": 17.756635665893555, + "learning_rate": 1.1621972137388614e-06, + "loss": 1.3284, + "step": 22515 + }, + { + "epoch": 2.83, + "grad_norm": 18.357152938842773, + "learning_rate": 1.161360498682174e-06, + "loss": 1.6907, + "step": 22516 + }, + { + "epoch": 2.83, + "grad_norm": 21.013147354125977, + "learning_rate": 1.1605237836254863e-06, + "loss": 0.8034, + "step": 22517 + }, + { + "epoch": 2.83, + "grad_norm": 14.976155281066895, + "learning_rate": 1.159687068568799e-06, + "loss": 1.068, + "step": 22518 + }, + { + "epoch": 2.83, + "grad_norm": 102.81354522705078, + "learning_rate": 1.1588503535121115e-06, + "loss": 1.9243, + "step": 22519 + }, + { + "epoch": 2.83, + "grad_norm": 14.17033863067627, + "learning_rate": 1.158013638455424e-06, + "loss": 1.1028, + "step": 22520 + }, + { + "epoch": 2.83, + "grad_norm": 133.36195373535156, + "learning_rate": 1.1571769233987367e-06, + "loss": 1.0503, + "step": 22521 + }, + { + "epoch": 2.83, + "grad_norm": 18.195566177368164, + "learning_rate": 1.1563402083420493e-06, + "loss": 1.143, + "step": 22522 + }, + { + "epoch": 2.83, + "grad_norm": 15.236781120300293, + "learning_rate": 1.1555034932853619e-06, + "loss": 0.828, + "step": 22523 + }, + { + "epoch": 2.83, + "grad_norm": 23.67279624938965, + "learning_rate": 1.1546667782286742e-06, + "loss": 0.6793, + "step": 22524 + }, + { + "epoch": 2.83, + "grad_norm": 27.306930541992188, + "learning_rate": 1.1538300631719868e-06, + "loss": 1.3672, + "step": 22525 + }, + { + "epoch": 2.83, + "grad_norm": 9.656767845153809, + "learning_rate": 1.1529933481152994e-06, + "loss": 0.627, + "step": 22526 + }, + { + "epoch": 2.83, + "grad_norm": 14.961053848266602, + "learning_rate": 1.152156633058612e-06, + "loss": 1.361, + "step": 22527 + }, + { + "epoch": 2.83, + "grad_norm": 20.277849197387695, + "learning_rate": 1.1513199180019246e-06, + "loss": 1.0639, + "step": 22528 + }, + { + "epoch": 2.83, + "grad_norm": 22.297962188720703, + "learning_rate": 1.1504832029452372e-06, + "loss": 2.0858, + "step": 22529 + }, + { + "epoch": 2.83, + "grad_norm": 2.163911819458008, + "learning_rate": 1.1496464878885496e-06, + "loss": 0.1031, + "step": 22530 + }, + { + "epoch": 2.83, + "grad_norm": 15.617265701293945, + "learning_rate": 1.1488097728318622e-06, + "loss": 0.9074, + "step": 22531 + }, + { + "epoch": 2.83, + "grad_norm": 2.05405855178833, + "learning_rate": 1.1479730577751747e-06, + "loss": 0.1178, + "step": 22532 + }, + { + "epoch": 2.83, + "grad_norm": 10.521383285522461, + "learning_rate": 1.1471363427184873e-06, + "loss": 1.549, + "step": 22533 + }, + { + "epoch": 2.83, + "grad_norm": 13.366665840148926, + "learning_rate": 1.1462996276618e-06, + "loss": 0.9132, + "step": 22534 + }, + { + "epoch": 2.83, + "grad_norm": 14.899330139160156, + "learning_rate": 1.1454629126051125e-06, + "loss": 0.7934, + "step": 22535 + }, + { + "epoch": 2.83, + "grad_norm": 51.658470153808594, + "learning_rate": 1.144626197548425e-06, + "loss": 1.1244, + "step": 22536 + }, + { + "epoch": 2.83, + "grad_norm": 235.84877014160156, + "learning_rate": 1.1437894824917375e-06, + "loss": 1.9939, + "step": 22537 + }, + { + "epoch": 2.83, + "grad_norm": 11.595799446105957, + "learning_rate": 1.14295276743505e-06, + "loss": 0.2692, + "step": 22538 + }, + { + "epoch": 2.83, + "grad_norm": 8.764069557189941, + "learning_rate": 1.1421160523783627e-06, + "loss": 0.6718, + "step": 22539 + }, + { + "epoch": 2.83, + "grad_norm": 12.100363731384277, + "learning_rate": 1.1412793373216753e-06, + "loss": 0.4135, + "step": 22540 + }, + { + "epoch": 2.83, + "grad_norm": 12.80016803741455, + "learning_rate": 1.1404426222649878e-06, + "loss": 0.4654, + "step": 22541 + }, + { + "epoch": 2.83, + "grad_norm": 74.28779602050781, + "learning_rate": 1.1396059072083002e-06, + "loss": 1.9398, + "step": 22542 + }, + { + "epoch": 2.83, + "grad_norm": 13.961068153381348, + "learning_rate": 1.1387691921516128e-06, + "loss": 0.1284, + "step": 22543 + }, + { + "epoch": 2.83, + "grad_norm": 6.4551615715026855, + "learning_rate": 1.1379324770949254e-06, + "loss": 0.2141, + "step": 22544 + }, + { + "epoch": 2.83, + "grad_norm": 10.471248626708984, + "learning_rate": 1.137095762038238e-06, + "loss": 0.7794, + "step": 22545 + }, + { + "epoch": 2.83, + "grad_norm": 10.314981460571289, + "learning_rate": 1.1362590469815506e-06, + "loss": 1.3724, + "step": 22546 + }, + { + "epoch": 2.83, + "grad_norm": 31.218461990356445, + "learning_rate": 1.135422331924863e-06, + "loss": 2.968, + "step": 22547 + }, + { + "epoch": 2.83, + "grad_norm": 19.79244041442871, + "learning_rate": 1.1345856168681758e-06, + "loss": 0.7614, + "step": 22548 + }, + { + "epoch": 2.83, + "grad_norm": 8.343755722045898, + "learning_rate": 1.1337489018114881e-06, + "loss": 0.5436, + "step": 22549 + }, + { + "epoch": 2.83, + "grad_norm": 13.522882461547852, + "learning_rate": 1.1329121867548007e-06, + "loss": 0.447, + "step": 22550 + }, + { + "epoch": 2.83, + "grad_norm": 8.117951393127441, + "learning_rate": 1.1320754716981133e-06, + "loss": 0.7153, + "step": 22551 + }, + { + "epoch": 2.83, + "grad_norm": 19.595169067382812, + "learning_rate": 1.131238756641426e-06, + "loss": 0.8592, + "step": 22552 + }, + { + "epoch": 2.83, + "grad_norm": 13.410231590270996, + "learning_rate": 1.1304020415847385e-06, + "loss": 0.4868, + "step": 22553 + }, + { + "epoch": 2.83, + "grad_norm": 19.711048126220703, + "learning_rate": 1.1295653265280509e-06, + "loss": 0.6378, + "step": 22554 + }, + { + "epoch": 2.83, + "grad_norm": 20.844606399536133, + "learning_rate": 1.1287286114713635e-06, + "loss": 0.7601, + "step": 22555 + }, + { + "epoch": 2.83, + "grad_norm": 8.204733848571777, + "learning_rate": 1.127891896414676e-06, + "loss": 0.2567, + "step": 22556 + }, + { + "epoch": 2.83, + "grad_norm": 25.223560333251953, + "learning_rate": 1.1270551813579887e-06, + "loss": 2.1085, + "step": 22557 + }, + { + "epoch": 2.83, + "grad_norm": 13.40188217163086, + "learning_rate": 1.1262184663013012e-06, + "loss": 1.0065, + "step": 22558 + }, + { + "epoch": 2.83, + "grad_norm": 7.276838779449463, + "learning_rate": 1.1253817512446136e-06, + "loss": 0.7332, + "step": 22559 + }, + { + "epoch": 2.83, + "grad_norm": 105.237548828125, + "learning_rate": 1.1245450361879264e-06, + "loss": 1.9138, + "step": 22560 + }, + { + "epoch": 2.83, + "grad_norm": 19.08675765991211, + "learning_rate": 1.1237083211312388e-06, + "loss": 0.4783, + "step": 22561 + }, + { + "epoch": 2.83, + "grad_norm": 15.11357593536377, + "learning_rate": 1.1228716060745514e-06, + "loss": 1.7871, + "step": 22562 + }, + { + "epoch": 2.83, + "grad_norm": 31.744043350219727, + "learning_rate": 1.122034891017864e-06, + "loss": 2.8459, + "step": 22563 + }, + { + "epoch": 2.83, + "grad_norm": 4.599182605743408, + "learning_rate": 1.1211981759611766e-06, + "loss": 0.2267, + "step": 22564 + }, + { + "epoch": 2.83, + "grad_norm": 17.8436336517334, + "learning_rate": 1.1203614609044892e-06, + "loss": 0.8453, + "step": 22565 + }, + { + "epoch": 2.83, + "grad_norm": 13.657493591308594, + "learning_rate": 1.1195247458478015e-06, + "loss": 0.7263, + "step": 22566 + }, + { + "epoch": 2.83, + "grad_norm": 12.992782592773438, + "learning_rate": 1.1186880307911143e-06, + "loss": 0.8964, + "step": 22567 + }, + { + "epoch": 2.83, + "grad_norm": 15.952777862548828, + "learning_rate": 1.1178513157344267e-06, + "loss": 1.2787, + "step": 22568 + }, + { + "epoch": 2.83, + "grad_norm": 21.19894790649414, + "learning_rate": 1.1170146006777393e-06, + "loss": 2.0199, + "step": 22569 + }, + { + "epoch": 2.83, + "grad_norm": 17.431442260742188, + "learning_rate": 1.116177885621052e-06, + "loss": 1.0392, + "step": 22570 + }, + { + "epoch": 2.83, + "grad_norm": 2.0603222846984863, + "learning_rate": 1.1153411705643643e-06, + "loss": 0.1267, + "step": 22571 + }, + { + "epoch": 2.83, + "grad_norm": 11.604832649230957, + "learning_rate": 1.114504455507677e-06, + "loss": 1.8294, + "step": 22572 + }, + { + "epoch": 2.83, + "grad_norm": 5.897436141967773, + "learning_rate": 1.1136677404509895e-06, + "loss": 0.4455, + "step": 22573 + }, + { + "epoch": 2.83, + "grad_norm": 9.812992095947266, + "learning_rate": 1.112831025394302e-06, + "loss": 1.161, + "step": 22574 + }, + { + "epoch": 2.83, + "grad_norm": 8.65078353881836, + "learning_rate": 1.1119943103376146e-06, + "loss": 1.5561, + "step": 22575 + }, + { + "epoch": 2.83, + "grad_norm": 7.049508094787598, + "learning_rate": 1.111157595280927e-06, + "loss": 0.238, + "step": 22576 + }, + { + "epoch": 2.83, + "grad_norm": 17.952756881713867, + "learning_rate": 1.1103208802242398e-06, + "loss": 0.7611, + "step": 22577 + }, + { + "epoch": 2.83, + "grad_norm": 20.81865692138672, + "learning_rate": 1.1094841651675522e-06, + "loss": 0.9094, + "step": 22578 + }, + { + "epoch": 2.83, + "grad_norm": 30.030189514160156, + "learning_rate": 1.1086474501108648e-06, + "loss": 0.8968, + "step": 22579 + }, + { + "epoch": 2.83, + "grad_norm": 18.892066955566406, + "learning_rate": 1.1078107350541774e-06, + "loss": 0.725, + "step": 22580 + }, + { + "epoch": 2.83, + "grad_norm": 11.122385025024414, + "learning_rate": 1.10697401999749e-06, + "loss": 0.8928, + "step": 22581 + }, + { + "epoch": 2.83, + "grad_norm": 15.550915718078613, + "learning_rate": 1.1061373049408026e-06, + "loss": 0.9142, + "step": 22582 + }, + { + "epoch": 2.83, + "grad_norm": 107.0487060546875, + "learning_rate": 1.105300589884115e-06, + "loss": 0.951, + "step": 22583 + }, + { + "epoch": 2.83, + "grad_norm": 6.754660606384277, + "learning_rate": 1.1044638748274277e-06, + "loss": 0.7648, + "step": 22584 + }, + { + "epoch": 2.83, + "grad_norm": 10.755863189697266, + "learning_rate": 1.1036271597707401e-06, + "loss": 1.1334, + "step": 22585 + }, + { + "epoch": 2.83, + "grad_norm": 11.957283020019531, + "learning_rate": 1.1027904447140527e-06, + "loss": 1.8996, + "step": 22586 + }, + { + "epoch": 2.83, + "grad_norm": 13.54029655456543, + "learning_rate": 1.1019537296573653e-06, + "loss": 0.5176, + "step": 22587 + }, + { + "epoch": 2.83, + "grad_norm": 142.4005584716797, + "learning_rate": 1.1011170146006777e-06, + "loss": 1.6767, + "step": 22588 + }, + { + "epoch": 2.83, + "grad_norm": 11.815372467041016, + "learning_rate": 1.1002802995439905e-06, + "loss": 1.6228, + "step": 22589 + }, + { + "epoch": 2.84, + "grad_norm": 4.578014373779297, + "learning_rate": 1.0994435844873029e-06, + "loss": 0.1797, + "step": 22590 + }, + { + "epoch": 2.84, + "grad_norm": 10.217596054077148, + "learning_rate": 1.0986068694306154e-06, + "loss": 0.2287, + "step": 22591 + }, + { + "epoch": 2.84, + "grad_norm": 7.644987106323242, + "learning_rate": 1.097770154373928e-06, + "loss": 0.8052, + "step": 22592 + }, + { + "epoch": 2.84, + "grad_norm": 45.09164810180664, + "learning_rate": 1.0969334393172406e-06, + "loss": 2.0139, + "step": 22593 + }, + { + "epoch": 2.84, + "grad_norm": 4.5308732986450195, + "learning_rate": 1.0960967242605532e-06, + "loss": 0.1777, + "step": 22594 + }, + { + "epoch": 2.84, + "grad_norm": 13.71830940246582, + "learning_rate": 1.0952600092038656e-06, + "loss": 0.4998, + "step": 22595 + }, + { + "epoch": 2.84, + "grad_norm": 13.78543472290039, + "learning_rate": 1.0944232941471784e-06, + "loss": 0.3428, + "step": 22596 + }, + { + "epoch": 2.84, + "grad_norm": 57.147552490234375, + "learning_rate": 1.0935865790904908e-06, + "loss": 2.1839, + "step": 22597 + }, + { + "epoch": 2.84, + "grad_norm": 25.24407386779785, + "learning_rate": 1.0927498640338034e-06, + "loss": 1.0217, + "step": 22598 + }, + { + "epoch": 2.84, + "grad_norm": 10.55384349822998, + "learning_rate": 1.091913148977116e-06, + "loss": 0.4198, + "step": 22599 + }, + { + "epoch": 2.84, + "grad_norm": 8.443134307861328, + "learning_rate": 1.0910764339204283e-06, + "loss": 0.9573, + "step": 22600 + }, + { + "epoch": 2.84, + "grad_norm": 10.019526481628418, + "learning_rate": 1.0902397188637411e-06, + "loss": 1.2772, + "step": 22601 + }, + { + "epoch": 2.84, + "grad_norm": 1.7345339059829712, + "learning_rate": 1.0894030038070535e-06, + "loss": 0.1394, + "step": 22602 + }, + { + "epoch": 2.84, + "grad_norm": 24.403596878051758, + "learning_rate": 1.088566288750366e-06, + "loss": 0.8839, + "step": 22603 + }, + { + "epoch": 2.84, + "grad_norm": 9.105947494506836, + "learning_rate": 1.0877295736936787e-06, + "loss": 0.871, + "step": 22604 + }, + { + "epoch": 2.84, + "grad_norm": 6.4465413093566895, + "learning_rate": 1.0868928586369913e-06, + "loss": 0.3138, + "step": 22605 + }, + { + "epoch": 2.84, + "grad_norm": 9.141624450683594, + "learning_rate": 1.0860561435803039e-06, + "loss": 0.0722, + "step": 22606 + }, + { + "epoch": 2.84, + "grad_norm": 8.823845863342285, + "learning_rate": 1.0852194285236162e-06, + "loss": 0.8992, + "step": 22607 + }, + { + "epoch": 2.84, + "grad_norm": 10.580303192138672, + "learning_rate": 1.0843827134669288e-06, + "loss": 0.8326, + "step": 22608 + }, + { + "epoch": 2.84, + "grad_norm": 4.165613651275635, + "learning_rate": 1.0835459984102414e-06, + "loss": 1.5931, + "step": 22609 + }, + { + "epoch": 2.84, + "grad_norm": 5.843441963195801, + "learning_rate": 1.082709283353554e-06, + "loss": 0.2336, + "step": 22610 + }, + { + "epoch": 2.84, + "grad_norm": 11.892224311828613, + "learning_rate": 1.0818725682968666e-06, + "loss": 1.7391, + "step": 22611 + }, + { + "epoch": 2.84, + "grad_norm": 4.86954927444458, + "learning_rate": 1.0810358532401792e-06, + "loss": 1.0329, + "step": 22612 + }, + { + "epoch": 2.84, + "grad_norm": 12.3053560256958, + "learning_rate": 1.0801991381834918e-06, + "loss": 0.4214, + "step": 22613 + }, + { + "epoch": 2.84, + "grad_norm": 7.288525581359863, + "learning_rate": 1.0793624231268042e-06, + "loss": 0.4891, + "step": 22614 + }, + { + "epoch": 2.84, + "grad_norm": 11.251693725585938, + "learning_rate": 1.0785257080701168e-06, + "loss": 0.3047, + "step": 22615 + }, + { + "epoch": 2.84, + "grad_norm": 21.362695693969727, + "learning_rate": 1.0776889930134293e-06, + "loss": 1.0763, + "step": 22616 + }, + { + "epoch": 2.84, + "grad_norm": 5.845518589019775, + "learning_rate": 1.076852277956742e-06, + "loss": 0.798, + "step": 22617 + }, + { + "epoch": 2.84, + "grad_norm": 8.721627235412598, + "learning_rate": 1.0760155629000545e-06, + "loss": 0.2793, + "step": 22618 + }, + { + "epoch": 2.84, + "grad_norm": 16.777753829956055, + "learning_rate": 1.0751788478433671e-06, + "loss": 1.5033, + "step": 22619 + }, + { + "epoch": 2.84, + "grad_norm": 23.648386001586914, + "learning_rate": 1.0743421327866795e-06, + "loss": 2.3692, + "step": 22620 + }, + { + "epoch": 2.84, + "grad_norm": 14.79354476928711, + "learning_rate": 1.073505417729992e-06, + "loss": 0.6241, + "step": 22621 + }, + { + "epoch": 2.84, + "grad_norm": 77.8792953491211, + "learning_rate": 1.0726687026733047e-06, + "loss": 2.4211, + "step": 22622 + }, + { + "epoch": 2.84, + "grad_norm": 4.98190450668335, + "learning_rate": 1.0718319876166173e-06, + "loss": 0.2484, + "step": 22623 + }, + { + "epoch": 2.84, + "grad_norm": 10.655683517456055, + "learning_rate": 1.0709952725599299e-06, + "loss": 0.9693, + "step": 22624 + }, + { + "epoch": 2.84, + "grad_norm": 34.36240005493164, + "learning_rate": 1.0701585575032424e-06, + "loss": 1.4315, + "step": 22625 + }, + { + "epoch": 2.84, + "grad_norm": 10.894150733947754, + "learning_rate": 1.0693218424465548e-06, + "loss": 1.6335, + "step": 22626 + }, + { + "epoch": 2.84, + "grad_norm": 6.933768272399902, + "learning_rate": 1.0684851273898674e-06, + "loss": 0.5767, + "step": 22627 + }, + { + "epoch": 2.84, + "grad_norm": 18.84404945373535, + "learning_rate": 1.06764841233318e-06, + "loss": 1.5802, + "step": 22628 + }, + { + "epoch": 2.84, + "grad_norm": 34.67753219604492, + "learning_rate": 1.0668116972764926e-06, + "loss": 4.0518, + "step": 22629 + }, + { + "epoch": 2.84, + "grad_norm": 96.99250793457031, + "learning_rate": 1.0659749822198052e-06, + "loss": 3.8837, + "step": 22630 + }, + { + "epoch": 2.84, + "grad_norm": 18.811664581298828, + "learning_rate": 1.0651382671631178e-06, + "loss": 1.5722, + "step": 22631 + }, + { + "epoch": 2.84, + "grad_norm": 8.915626525878906, + "learning_rate": 1.0643015521064302e-06, + "loss": 1.6112, + "step": 22632 + }, + { + "epoch": 2.84, + "grad_norm": 34.93801498413086, + "learning_rate": 1.0634648370497427e-06, + "loss": 1.2345, + "step": 22633 + }, + { + "epoch": 2.84, + "grad_norm": 24.479141235351562, + "learning_rate": 1.0626281219930553e-06, + "loss": 1.2939, + "step": 22634 + }, + { + "epoch": 2.84, + "grad_norm": 8.075628280639648, + "learning_rate": 1.061791406936368e-06, + "loss": 1.6627, + "step": 22635 + }, + { + "epoch": 2.84, + "grad_norm": 13.343048095703125, + "learning_rate": 1.0609546918796805e-06, + "loss": 0.6281, + "step": 22636 + }, + { + "epoch": 2.84, + "grad_norm": 17.33895492553711, + "learning_rate": 1.060117976822993e-06, + "loss": 0.6769, + "step": 22637 + }, + { + "epoch": 2.84, + "grad_norm": 11.125850677490234, + "learning_rate": 1.0592812617663057e-06, + "loss": 0.9356, + "step": 22638 + }, + { + "epoch": 2.84, + "grad_norm": 18.014148712158203, + "learning_rate": 1.058444546709618e-06, + "loss": 0.3273, + "step": 22639 + }, + { + "epoch": 2.84, + "grad_norm": 8.335336685180664, + "learning_rate": 1.0576078316529307e-06, + "loss": 0.5336, + "step": 22640 + }, + { + "epoch": 2.84, + "grad_norm": 14.801827430725098, + "learning_rate": 1.0567711165962433e-06, + "loss": 0.5302, + "step": 22641 + }, + { + "epoch": 2.84, + "grad_norm": 14.088603019714355, + "learning_rate": 1.0559344015395558e-06, + "loss": 0.3378, + "step": 22642 + }, + { + "epoch": 2.84, + "grad_norm": 384.4151306152344, + "learning_rate": 1.0550976864828684e-06, + "loss": 1.3484, + "step": 22643 + }, + { + "epoch": 2.84, + "grad_norm": 11.850312232971191, + "learning_rate": 1.0542609714261808e-06, + "loss": 0.4568, + "step": 22644 + }, + { + "epoch": 2.84, + "grad_norm": 21.090885162353516, + "learning_rate": 1.0534242563694934e-06, + "loss": 1.4683, + "step": 22645 + }, + { + "epoch": 2.84, + "grad_norm": 25.674741744995117, + "learning_rate": 1.052587541312806e-06, + "loss": 1.5322, + "step": 22646 + }, + { + "epoch": 2.84, + "grad_norm": 14.398693084716797, + "learning_rate": 1.0517508262561186e-06, + "loss": 0.7668, + "step": 22647 + }, + { + "epoch": 2.84, + "grad_norm": 13.833110809326172, + "learning_rate": 1.0509141111994312e-06, + "loss": 0.4308, + "step": 22648 + }, + { + "epoch": 2.84, + "grad_norm": 28.17066764831543, + "learning_rate": 1.0500773961427435e-06, + "loss": 4.24, + "step": 22649 + }, + { + "epoch": 2.84, + "grad_norm": 51.966556549072266, + "learning_rate": 1.0492406810860563e-06, + "loss": 2.0793, + "step": 22650 + }, + { + "epoch": 2.84, + "grad_norm": 73.22601318359375, + "learning_rate": 1.0484039660293687e-06, + "loss": 2.8554, + "step": 22651 + }, + { + "epoch": 2.84, + "grad_norm": 8.28471851348877, + "learning_rate": 1.0475672509726813e-06, + "loss": 0.2304, + "step": 22652 + }, + { + "epoch": 2.84, + "grad_norm": 92.43878936767578, + "learning_rate": 1.046730535915994e-06, + "loss": 2.0176, + "step": 22653 + }, + { + "epoch": 2.84, + "grad_norm": 15.221357345581055, + "learning_rate": 1.0458938208593065e-06, + "loss": 0.5011, + "step": 22654 + }, + { + "epoch": 2.84, + "grad_norm": 5.658198356628418, + "learning_rate": 1.045057105802619e-06, + "loss": 0.3352, + "step": 22655 + }, + { + "epoch": 2.84, + "grad_norm": 19.164915084838867, + "learning_rate": 1.0442203907459315e-06, + "loss": 1.2764, + "step": 22656 + }, + { + "epoch": 2.84, + "grad_norm": 16.541826248168945, + "learning_rate": 1.0433836756892443e-06, + "loss": 0.6415, + "step": 22657 + }, + { + "epoch": 2.84, + "grad_norm": 1.7284843921661377, + "learning_rate": 1.0425469606325566e-06, + "loss": 0.0434, + "step": 22658 + }, + { + "epoch": 2.84, + "grad_norm": 10.829341888427734, + "learning_rate": 1.0417102455758692e-06, + "loss": 0.4431, + "step": 22659 + }, + { + "epoch": 2.84, + "grad_norm": 14.03040599822998, + "learning_rate": 1.0408735305191818e-06, + "loss": 1.3381, + "step": 22660 + }, + { + "epoch": 2.84, + "grad_norm": 10.154394149780273, + "learning_rate": 1.0400368154624942e-06, + "loss": 1.0825, + "step": 22661 + }, + { + "epoch": 2.84, + "grad_norm": 1.6120656728744507, + "learning_rate": 1.039200100405807e-06, + "loss": 0.0268, + "step": 22662 + }, + { + "epoch": 2.84, + "grad_norm": 22.520339965820312, + "learning_rate": 1.0383633853491194e-06, + "loss": 0.4964, + "step": 22663 + }, + { + "epoch": 2.84, + "grad_norm": 16.420804977416992, + "learning_rate": 1.037526670292432e-06, + "loss": 2.3717, + "step": 22664 + }, + { + "epoch": 2.84, + "grad_norm": 10.640663146972656, + "learning_rate": 1.0366899552357446e-06, + "loss": 1.0823, + "step": 22665 + }, + { + "epoch": 2.84, + "grad_norm": 8.779507637023926, + "learning_rate": 1.0358532401790572e-06, + "loss": 0.4319, + "step": 22666 + }, + { + "epoch": 2.84, + "grad_norm": 7.7672648429870605, + "learning_rate": 1.0350165251223697e-06, + "loss": 0.8595, + "step": 22667 + }, + { + "epoch": 2.84, + "grad_norm": 30.301015853881836, + "learning_rate": 1.0341798100656821e-06, + "loss": 0.4864, + "step": 22668 + }, + { + "epoch": 2.84, + "grad_norm": 15.309416770935059, + "learning_rate": 1.033343095008995e-06, + "loss": 0.7949, + "step": 22669 + }, + { + "epoch": 2.85, + "grad_norm": 18.696205139160156, + "learning_rate": 1.0325063799523073e-06, + "loss": 0.5772, + "step": 22670 + }, + { + "epoch": 2.85, + "grad_norm": 11.885797500610352, + "learning_rate": 1.0316696648956199e-06, + "loss": 1.4711, + "step": 22671 + }, + { + "epoch": 2.85, + "grad_norm": 14.371444702148438, + "learning_rate": 1.0308329498389325e-06, + "loss": 0.701, + "step": 22672 + }, + { + "epoch": 2.85, + "grad_norm": 10.62415885925293, + "learning_rate": 1.0299962347822449e-06, + "loss": 0.6706, + "step": 22673 + }, + { + "epoch": 2.85, + "grad_norm": 12.693033218383789, + "learning_rate": 1.0291595197255577e-06, + "loss": 1.2575, + "step": 22674 + }, + { + "epoch": 2.85, + "grad_norm": 22.45041275024414, + "learning_rate": 1.02832280466887e-06, + "loss": 0.8786, + "step": 22675 + }, + { + "epoch": 2.85, + "grad_norm": 12.779298782348633, + "learning_rate": 1.0274860896121826e-06, + "loss": 0.874, + "step": 22676 + }, + { + "epoch": 2.85, + "grad_norm": 71.88469696044922, + "learning_rate": 1.0266493745554952e-06, + "loss": 1.2904, + "step": 22677 + }, + { + "epoch": 2.85, + "grad_norm": 18.182397842407227, + "learning_rate": 1.0258126594988076e-06, + "loss": 1.5542, + "step": 22678 + }, + { + "epoch": 2.85, + "grad_norm": 23.903799057006836, + "learning_rate": 1.0249759444421204e-06, + "loss": 0.6968, + "step": 22679 + }, + { + "epoch": 2.85, + "grad_norm": 37.96636962890625, + "learning_rate": 1.0241392293854328e-06, + "loss": 0.8339, + "step": 22680 + }, + { + "epoch": 2.85, + "grad_norm": 17.603233337402344, + "learning_rate": 1.0233025143287454e-06, + "loss": 0.8701, + "step": 22681 + }, + { + "epoch": 2.85, + "grad_norm": 4.552093029022217, + "learning_rate": 1.022465799272058e-06, + "loss": 0.121, + "step": 22682 + }, + { + "epoch": 2.85, + "grad_norm": 16.59980583190918, + "learning_rate": 1.0216290842153706e-06, + "loss": 1.0351, + "step": 22683 + }, + { + "epoch": 2.85, + "grad_norm": 98.66775512695312, + "learning_rate": 1.0207923691586831e-06, + "loss": 0.7874, + "step": 22684 + }, + { + "epoch": 2.85, + "grad_norm": 18.55583953857422, + "learning_rate": 1.0199556541019955e-06, + "loss": 1.0839, + "step": 22685 + }, + { + "epoch": 2.85, + "grad_norm": 27.879377365112305, + "learning_rate": 1.0191189390453083e-06, + "loss": 0.5498, + "step": 22686 + }, + { + "epoch": 2.85, + "grad_norm": 11.789697647094727, + "learning_rate": 1.0182822239886207e-06, + "loss": 0.5332, + "step": 22687 + }, + { + "epoch": 2.85, + "grad_norm": 21.460121154785156, + "learning_rate": 1.0174455089319333e-06, + "loss": 0.9124, + "step": 22688 + }, + { + "epoch": 2.85, + "grad_norm": 13.511993408203125, + "learning_rate": 1.0166087938752459e-06, + "loss": 0.3465, + "step": 22689 + }, + { + "epoch": 2.85, + "grad_norm": 10.475018501281738, + "learning_rate": 1.0157720788185583e-06, + "loss": 0.3422, + "step": 22690 + }, + { + "epoch": 2.85, + "grad_norm": 5.866156101226807, + "learning_rate": 1.014935363761871e-06, + "loss": 0.2938, + "step": 22691 + }, + { + "epoch": 2.85, + "grad_norm": 12.232178688049316, + "learning_rate": 1.0140986487051834e-06, + "loss": 0.3215, + "step": 22692 + }, + { + "epoch": 2.85, + "grad_norm": 12.380698204040527, + "learning_rate": 1.013261933648496e-06, + "loss": 0.4729, + "step": 22693 + }, + { + "epoch": 2.85, + "grad_norm": 3.1376395225524902, + "learning_rate": 1.0124252185918086e-06, + "loss": 0.2062, + "step": 22694 + }, + { + "epoch": 2.85, + "grad_norm": 15.682345390319824, + "learning_rate": 1.0115885035351212e-06, + "loss": 0.3961, + "step": 22695 + }, + { + "epoch": 2.85, + "grad_norm": 27.296039581298828, + "learning_rate": 1.0107517884784338e-06, + "loss": 0.5643, + "step": 22696 + }, + { + "epoch": 2.85, + "grad_norm": 28.356821060180664, + "learning_rate": 1.0099150734217462e-06, + "loss": 1.7059, + "step": 22697 + }, + { + "epoch": 2.85, + "grad_norm": 24.347946166992188, + "learning_rate": 1.009078358365059e-06, + "loss": 0.7439, + "step": 22698 + }, + { + "epoch": 2.85, + "grad_norm": 25.663373947143555, + "learning_rate": 1.0082416433083714e-06, + "loss": 1.4367, + "step": 22699 + }, + { + "epoch": 2.85, + "grad_norm": 14.530847549438477, + "learning_rate": 1.007404928251684e-06, + "loss": 0.6064, + "step": 22700 + }, + { + "epoch": 2.85, + "grad_norm": 6.665618896484375, + "learning_rate": 1.0065682131949965e-06, + "loss": 0.4526, + "step": 22701 + }, + { + "epoch": 2.85, + "grad_norm": 21.13688850402832, + "learning_rate": 1.0057314981383091e-06, + "loss": 1.0899, + "step": 22702 + }, + { + "epoch": 2.85, + "grad_norm": 9.309040069580078, + "learning_rate": 1.0048947830816217e-06, + "loss": 0.8862, + "step": 22703 + }, + { + "epoch": 2.85, + "grad_norm": 8.414344787597656, + "learning_rate": 1.004058068024934e-06, + "loss": 0.2494, + "step": 22704 + }, + { + "epoch": 2.85, + "grad_norm": 8.917380332946777, + "learning_rate": 1.0032213529682467e-06, + "loss": 1.2896, + "step": 22705 + }, + { + "epoch": 2.85, + "grad_norm": 3.6235129833221436, + "learning_rate": 1.0023846379115593e-06, + "loss": 0.1717, + "step": 22706 + }, + { + "epoch": 2.85, + "grad_norm": 3.6737544536590576, + "learning_rate": 1.0015479228548719e-06, + "loss": 0.0908, + "step": 22707 + }, + { + "epoch": 2.85, + "grad_norm": 38.907203674316406, + "learning_rate": 1.0007112077981845e-06, + "loss": 4.4776, + "step": 22708 + }, + { + "epoch": 2.85, + "grad_norm": 13.30609130859375, + "learning_rate": 9.99874492741497e-07, + "loss": 0.3695, + "step": 22709 + }, + { + "epoch": 2.85, + "grad_norm": 10.25177001953125, + "learning_rate": 9.990377776848096e-07, + "loss": 0.3207, + "step": 22710 + }, + { + "epoch": 2.85, + "grad_norm": 13.686180114746094, + "learning_rate": 9.98201062628122e-07, + "loss": 0.344, + "step": 22711 + }, + { + "epoch": 2.85, + "grad_norm": 4.232295036315918, + "learning_rate": 9.973643475714346e-07, + "loss": 0.2562, + "step": 22712 + }, + { + "epoch": 2.85, + "grad_norm": 17.139768600463867, + "learning_rate": 9.965276325147472e-07, + "loss": 0.6248, + "step": 22713 + }, + { + "epoch": 2.85, + "grad_norm": 63.723995208740234, + "learning_rate": 9.956909174580598e-07, + "loss": 1.3205, + "step": 22714 + }, + { + "epoch": 2.85, + "grad_norm": 4.056235313415527, + "learning_rate": 9.948542024013724e-07, + "loss": 0.1356, + "step": 22715 + }, + { + "epoch": 2.85, + "grad_norm": 6.38409423828125, + "learning_rate": 9.940174873446848e-07, + "loss": 0.1036, + "step": 22716 + }, + { + "epoch": 2.85, + "grad_norm": 10.570390701293945, + "learning_rate": 9.931807722879973e-07, + "loss": 0.9784, + "step": 22717 + }, + { + "epoch": 2.85, + "grad_norm": 43.72049331665039, + "learning_rate": 9.9234405723131e-07, + "loss": 1.594, + "step": 22718 + }, + { + "epoch": 2.85, + "grad_norm": 7.340564727783203, + "learning_rate": 9.915073421746225e-07, + "loss": 1.9042, + "step": 22719 + }, + { + "epoch": 2.85, + "grad_norm": 11.679526329040527, + "learning_rate": 9.906706271179351e-07, + "loss": 0.519, + "step": 22720 + }, + { + "epoch": 2.85, + "grad_norm": 6.5310893058776855, + "learning_rate": 9.898339120612477e-07, + "loss": 0.2962, + "step": 22721 + }, + { + "epoch": 2.85, + "grad_norm": 18.232358932495117, + "learning_rate": 9.8899719700456e-07, + "loss": 0.885, + "step": 22722 + }, + { + "epoch": 2.85, + "grad_norm": 10.807291030883789, + "learning_rate": 9.881604819478727e-07, + "loss": 0.3318, + "step": 22723 + }, + { + "epoch": 2.85, + "grad_norm": 9.099920272827148, + "learning_rate": 9.873237668911853e-07, + "loss": 0.8537, + "step": 22724 + }, + { + "epoch": 2.85, + "grad_norm": 8.183937072753906, + "learning_rate": 9.864870518344978e-07, + "loss": 0.4723, + "step": 22725 + }, + { + "epoch": 2.85, + "grad_norm": 6.987006187438965, + "learning_rate": 9.856503367778104e-07, + "loss": 0.5541, + "step": 22726 + }, + { + "epoch": 2.85, + "grad_norm": 22.417749404907227, + "learning_rate": 9.84813621721123e-07, + "loss": 1.1988, + "step": 22727 + }, + { + "epoch": 2.85, + "grad_norm": 2.788111925125122, + "learning_rate": 9.839769066644356e-07, + "loss": 0.1956, + "step": 22728 + }, + { + "epoch": 2.85, + "grad_norm": 14.666352272033691, + "learning_rate": 9.83140191607748e-07, + "loss": 0.6124, + "step": 22729 + }, + { + "epoch": 2.85, + "grad_norm": 16.536130905151367, + "learning_rate": 9.823034765510606e-07, + "loss": 0.6982, + "step": 22730 + }, + { + "epoch": 2.85, + "grad_norm": 20.315258026123047, + "learning_rate": 9.814667614943732e-07, + "loss": 0.4652, + "step": 22731 + }, + { + "epoch": 2.85, + "grad_norm": 7.736295223236084, + "learning_rate": 9.806300464376858e-07, + "loss": 0.4856, + "step": 22732 + }, + { + "epoch": 2.85, + "grad_norm": 6.603104591369629, + "learning_rate": 9.797933313809984e-07, + "loss": 0.2176, + "step": 22733 + }, + { + "epoch": 2.85, + "grad_norm": 29.130878448486328, + "learning_rate": 9.789566163243107e-07, + "loss": 1.2782, + "step": 22734 + }, + { + "epoch": 2.85, + "grad_norm": 26.576641082763672, + "learning_rate": 9.781199012676233e-07, + "loss": 0.2887, + "step": 22735 + }, + { + "epoch": 2.85, + "grad_norm": 9.380056381225586, + "learning_rate": 9.77283186210936e-07, + "loss": 1.7271, + "step": 22736 + }, + { + "epoch": 2.85, + "grad_norm": 110.59015655517578, + "learning_rate": 9.764464711542485e-07, + "loss": 0.7674, + "step": 22737 + }, + { + "epoch": 2.85, + "grad_norm": 32.303466796875, + "learning_rate": 9.75609756097561e-07, + "loss": 1.5897, + "step": 22738 + }, + { + "epoch": 2.85, + "grad_norm": 59.81349563598633, + "learning_rate": 9.747730410408737e-07, + "loss": 2.2013, + "step": 22739 + }, + { + "epoch": 2.85, + "grad_norm": 321.8476257324219, + "learning_rate": 9.739363259841863e-07, + "loss": 1.7899, + "step": 22740 + }, + { + "epoch": 2.85, + "grad_norm": 11.349199295043945, + "learning_rate": 9.730996109274987e-07, + "loss": 0.5554, + "step": 22741 + }, + { + "epoch": 2.85, + "grad_norm": 13.518997192382812, + "learning_rate": 9.722628958708112e-07, + "loss": 0.8175, + "step": 22742 + }, + { + "epoch": 2.85, + "grad_norm": 12.089187622070312, + "learning_rate": 9.714261808141238e-07, + "loss": 0.2888, + "step": 22743 + }, + { + "epoch": 2.85, + "grad_norm": 20.820369720458984, + "learning_rate": 9.705894657574364e-07, + "loss": 0.88, + "step": 22744 + }, + { + "epoch": 2.85, + "grad_norm": 35.164302825927734, + "learning_rate": 9.69752750700749e-07, + "loss": 2.1835, + "step": 22745 + }, + { + "epoch": 2.85, + "grad_norm": 9.61741828918457, + "learning_rate": 9.689160356440614e-07, + "loss": 0.7723, + "step": 22746 + }, + { + "epoch": 2.85, + "grad_norm": 9.48019027709961, + "learning_rate": 9.680793205873742e-07, + "loss": 0.3809, + "step": 22747 + }, + { + "epoch": 2.85, + "grad_norm": 12.2240571975708, + "learning_rate": 9.672426055306866e-07, + "loss": 0.3868, + "step": 22748 + }, + { + "epoch": 2.85, + "grad_norm": 3.512730836868286, + "learning_rate": 9.664058904739992e-07, + "loss": 0.1358, + "step": 22749 + }, + { + "epoch": 2.86, + "grad_norm": 38.790340423583984, + "learning_rate": 9.655691754173118e-07, + "loss": 1.7435, + "step": 22750 + }, + { + "epoch": 2.86, + "grad_norm": 10.633916854858398, + "learning_rate": 9.647324603606241e-07, + "loss": 0.7846, + "step": 22751 + }, + { + "epoch": 2.86, + "grad_norm": 0.6371921300888062, + "learning_rate": 9.63895745303937e-07, + "loss": 0.0126, + "step": 22752 + }, + { + "epoch": 2.86, + "grad_norm": 17.8377742767334, + "learning_rate": 9.630590302472493e-07, + "loss": 0.9767, + "step": 22753 + }, + { + "epoch": 2.86, + "grad_norm": 36.62273025512695, + "learning_rate": 9.62222315190562e-07, + "loss": 2.6855, + "step": 22754 + }, + { + "epoch": 2.86, + "grad_norm": 15.418661117553711, + "learning_rate": 9.613856001338745e-07, + "loss": 0.4603, + "step": 22755 + }, + { + "epoch": 2.86, + "grad_norm": 3.746643304824829, + "learning_rate": 9.60548885077187e-07, + "loss": 0.1215, + "step": 22756 + }, + { + "epoch": 2.86, + "grad_norm": 9.70486831665039, + "learning_rate": 9.597121700204997e-07, + "loss": 0.3539, + "step": 22757 + }, + { + "epoch": 2.86, + "grad_norm": 182.56591796875, + "learning_rate": 9.58875454963812e-07, + "loss": 1.801, + "step": 22758 + }, + { + "epoch": 2.86, + "grad_norm": 22.018207550048828, + "learning_rate": 9.580387399071249e-07, + "loss": 1.7881, + "step": 22759 + }, + { + "epoch": 2.86, + "grad_norm": 21.515419006347656, + "learning_rate": 9.572020248504372e-07, + "loss": 0.8439, + "step": 22760 + }, + { + "epoch": 2.86, + "grad_norm": 12.654033660888672, + "learning_rate": 9.563653097937498e-07, + "loss": 1.161, + "step": 22761 + }, + { + "epoch": 2.86, + "grad_norm": 83.0149917602539, + "learning_rate": 9.555285947370624e-07, + "loss": 1.7713, + "step": 22762 + }, + { + "epoch": 2.86, + "grad_norm": 10.134322166442871, + "learning_rate": 9.546918796803748e-07, + "loss": 1.1104, + "step": 22763 + }, + { + "epoch": 2.86, + "grad_norm": 6.967130184173584, + "learning_rate": 9.538551646236876e-07, + "loss": 0.4005, + "step": 22764 + }, + { + "epoch": 2.86, + "grad_norm": 10.932229042053223, + "learning_rate": 9.53018449567e-07, + "loss": 0.4291, + "step": 22765 + }, + { + "epoch": 2.86, + "grad_norm": 13.56113338470459, + "learning_rate": 9.521817345103127e-07, + "loss": 0.5479, + "step": 22766 + }, + { + "epoch": 2.86, + "grad_norm": 8.211050033569336, + "learning_rate": 9.513450194536251e-07, + "loss": 0.2898, + "step": 22767 + }, + { + "epoch": 2.86, + "grad_norm": 42.85634994506836, + "learning_rate": 9.505083043969376e-07, + "loss": 2.0145, + "step": 22768 + }, + { + "epoch": 2.86, + "grad_norm": 23.32672691345215, + "learning_rate": 9.496715893402503e-07, + "loss": 1.4944, + "step": 22769 + }, + { + "epoch": 2.86, + "grad_norm": 23.611173629760742, + "learning_rate": 9.488348742835628e-07, + "loss": 1.2648, + "step": 22770 + }, + { + "epoch": 2.86, + "grad_norm": 18.620038986206055, + "learning_rate": 9.479981592268754e-07, + "loss": 1.098, + "step": 22771 + }, + { + "epoch": 2.86, + "grad_norm": 22.640104293823242, + "learning_rate": 9.471614441701879e-07, + "loss": 2.1549, + "step": 22772 + }, + { + "epoch": 2.86, + "grad_norm": 23.428749084472656, + "learning_rate": 9.463247291135006e-07, + "loss": 1.4812, + "step": 22773 + }, + { + "epoch": 2.86, + "grad_norm": 7.496427536010742, + "learning_rate": 9.454880140568131e-07, + "loss": 0.3486, + "step": 22774 + }, + { + "epoch": 2.86, + "grad_norm": 14.45265007019043, + "learning_rate": 9.446512990001256e-07, + "loss": 1.7724, + "step": 22775 + }, + { + "epoch": 2.86, + "grad_norm": 13.942972183227539, + "learning_rate": 9.438145839434381e-07, + "loss": 0.9157, + "step": 22776 + }, + { + "epoch": 2.86, + "grad_norm": 30.090429306030273, + "learning_rate": 9.429778688867506e-07, + "loss": 0.9834, + "step": 22777 + }, + { + "epoch": 2.86, + "grad_norm": 9.631400108337402, + "learning_rate": 9.421411538300633e-07, + "loss": 1.4044, + "step": 22778 + }, + { + "epoch": 2.86, + "grad_norm": 11.13482666015625, + "learning_rate": 9.413044387733758e-07, + "loss": 0.7159, + "step": 22779 + }, + { + "epoch": 2.86, + "grad_norm": 29.129552841186523, + "learning_rate": 9.404677237166883e-07, + "loss": 1.5577, + "step": 22780 + }, + { + "epoch": 2.86, + "grad_norm": 34.3688850402832, + "learning_rate": 9.396310086600009e-07, + "loss": 1.8399, + "step": 22781 + }, + { + "epoch": 2.86, + "grad_norm": 4.2850165367126465, + "learning_rate": 9.387942936033135e-07, + "loss": 0.0655, + "step": 22782 + }, + { + "epoch": 2.86, + "grad_norm": 21.041540145874023, + "learning_rate": 9.379575785466261e-07, + "loss": 0.3858, + "step": 22783 + }, + { + "epoch": 2.86, + "grad_norm": 8.379650115966797, + "learning_rate": 9.371208634899385e-07, + "loss": 0.6699, + "step": 22784 + }, + { + "epoch": 2.86, + "grad_norm": 16.045167922973633, + "learning_rate": 9.362841484332512e-07, + "loss": 0.9175, + "step": 22785 + }, + { + "epoch": 2.86, + "grad_norm": 36.10418701171875, + "learning_rate": 9.354474333765637e-07, + "loss": 0.7872, + "step": 22786 + }, + { + "epoch": 2.86, + "grad_norm": 2.9977262020111084, + "learning_rate": 9.346107183198762e-07, + "loss": 0.1647, + "step": 22787 + }, + { + "epoch": 2.86, + "grad_norm": 12.359587669372559, + "learning_rate": 9.337740032631888e-07, + "loss": 0.7004, + "step": 22788 + }, + { + "epoch": 2.86, + "grad_norm": 23.762327194213867, + "learning_rate": 9.329372882065013e-07, + "loss": 1.3242, + "step": 22789 + }, + { + "epoch": 2.86, + "grad_norm": 24.016380310058594, + "learning_rate": 9.32100573149814e-07, + "loss": 0.9295, + "step": 22790 + }, + { + "epoch": 2.86, + "grad_norm": 340.8974304199219, + "learning_rate": 9.312638580931265e-07, + "loss": 1.8105, + "step": 22791 + }, + { + "epoch": 2.86, + "grad_norm": 12.684603691101074, + "learning_rate": 9.304271430364391e-07, + "loss": 0.2586, + "step": 22792 + }, + { + "epoch": 2.86, + "grad_norm": 49.162139892578125, + "learning_rate": 9.295904279797515e-07, + "loss": 1.6199, + "step": 22793 + }, + { + "epoch": 2.86, + "grad_norm": 11.601898193359375, + "learning_rate": 9.28753712923064e-07, + "loss": 0.3166, + "step": 22794 + }, + { + "epoch": 2.86, + "grad_norm": 16.88113021850586, + "learning_rate": 9.279169978663767e-07, + "loss": 0.3754, + "step": 22795 + }, + { + "epoch": 2.86, + "grad_norm": 21.27098846435547, + "learning_rate": 9.270802828096892e-07, + "loss": 1.9876, + "step": 22796 + }, + { + "epoch": 2.86, + "grad_norm": 17.28947639465332, + "learning_rate": 9.262435677530018e-07, + "loss": 1.0031, + "step": 22797 + }, + { + "epoch": 2.86, + "grad_norm": 11.878498077392578, + "learning_rate": 9.254068526963144e-07, + "loss": 0.6122, + "step": 22798 + }, + { + "epoch": 2.86, + "grad_norm": 24.625106811523438, + "learning_rate": 9.24570137639627e-07, + "loss": 0.6079, + "step": 22799 + }, + { + "epoch": 2.86, + "grad_norm": 11.562703132629395, + "learning_rate": 9.237334225829395e-07, + "loss": 1.325, + "step": 22800 + }, + { + "epoch": 2.86, + "eval_loss": 0.07697702199220657, + "eval_runtime": 95.4787, + "eval_samples_per_second": 37.097, + "eval_steps_per_second": 37.097, + "step": 22800 + }, + { + "epoch": 2.86, + "grad_norm": 45.269893646240234, + "learning_rate": 9.228967075262519e-07, + "loss": 2.3619, + "step": 22801 + }, + { + "epoch": 2.86, + "grad_norm": 28.223426818847656, + "learning_rate": 9.220599924695646e-07, + "loss": 1.3014, + "step": 22802 + }, + { + "epoch": 2.86, + "grad_norm": 15.729945182800293, + "learning_rate": 9.212232774128771e-07, + "loss": 1.9507, + "step": 22803 + }, + { + "epoch": 2.86, + "grad_norm": 53.05438232421875, + "learning_rate": 9.203865623561897e-07, + "loss": 1.2993, + "step": 22804 + }, + { + "epoch": 2.86, + "grad_norm": 10.385889053344727, + "learning_rate": 9.195498472995022e-07, + "loss": 1.204, + "step": 22805 + }, + { + "epoch": 2.86, + "grad_norm": 21.43562889099121, + "learning_rate": 9.187131322428147e-07, + "loss": 0.6698, + "step": 22806 + }, + { + "epoch": 2.86, + "grad_norm": 6.3664093017578125, + "learning_rate": 9.178764171861274e-07, + "loss": 1.0888, + "step": 22807 + }, + { + "epoch": 2.86, + "grad_norm": 14.827553749084473, + "learning_rate": 9.170397021294399e-07, + "loss": 0.4481, + "step": 22808 + }, + { + "epoch": 2.86, + "grad_norm": 5.577608108520508, + "learning_rate": 9.162029870727524e-07, + "loss": 0.5304, + "step": 22809 + }, + { + "epoch": 2.86, + "grad_norm": 13.780040740966797, + "learning_rate": 9.153662720160649e-07, + "loss": 0.5358, + "step": 22810 + }, + { + "epoch": 2.86, + "grad_norm": 9.930581092834473, + "learning_rate": 9.145295569593776e-07, + "loss": 0.6939, + "step": 22811 + }, + { + "epoch": 2.86, + "grad_norm": 27.25922393798828, + "learning_rate": 9.136928419026901e-07, + "loss": 1.7797, + "step": 22812 + }, + { + "epoch": 2.86, + "grad_norm": 24.871353149414062, + "learning_rate": 9.128561268460026e-07, + "loss": 1.393, + "step": 22813 + }, + { + "epoch": 2.86, + "grad_norm": 6.4324212074279785, + "learning_rate": 9.120194117893153e-07, + "loss": 0.2362, + "step": 22814 + }, + { + "epoch": 2.86, + "grad_norm": 44.0561408996582, + "learning_rate": 9.111826967326278e-07, + "loss": 2.3997, + "step": 22815 + }, + { + "epoch": 2.86, + "grad_norm": 26.62943458557129, + "learning_rate": 9.103459816759404e-07, + "loss": 0.4729, + "step": 22816 + }, + { + "epoch": 2.86, + "grad_norm": 9.564854621887207, + "learning_rate": 9.095092666192529e-07, + "loss": 0.3948, + "step": 22817 + }, + { + "epoch": 2.86, + "grad_norm": 6.366318225860596, + "learning_rate": 9.086725515625655e-07, + "loss": 0.403, + "step": 22818 + }, + { + "epoch": 2.86, + "grad_norm": 6.157698154449463, + "learning_rate": 9.07835836505878e-07, + "loss": 0.1733, + "step": 22819 + }, + { + "epoch": 2.86, + "grad_norm": 5.42472505569458, + "learning_rate": 9.069991214491905e-07, + "loss": 0.3922, + "step": 22820 + }, + { + "epoch": 2.86, + "grad_norm": 9.176149368286133, + "learning_rate": 9.061624063925031e-07, + "loss": 0.6081, + "step": 22821 + }, + { + "epoch": 2.86, + "grad_norm": 24.528722763061523, + "learning_rate": 9.053256913358156e-07, + "loss": 2.3687, + "step": 22822 + }, + { + "epoch": 2.86, + "grad_norm": 13.149932861328125, + "learning_rate": 9.044889762791283e-07, + "loss": 1.6374, + "step": 22823 + }, + { + "epoch": 2.86, + "grad_norm": 12.362624168395996, + "learning_rate": 9.036522612224408e-07, + "loss": 0.5837, + "step": 22824 + }, + { + "epoch": 2.86, + "grad_norm": 9.579484939575195, + "learning_rate": 9.028155461657533e-07, + "loss": 1.7579, + "step": 22825 + }, + { + "epoch": 2.86, + "grad_norm": 15.601700782775879, + "learning_rate": 9.019788311090658e-07, + "loss": 0.5001, + "step": 22826 + }, + { + "epoch": 2.86, + "grad_norm": 4.9154582023620605, + "learning_rate": 9.011421160523784e-07, + "loss": 0.0562, + "step": 22827 + }, + { + "epoch": 2.86, + "grad_norm": 22.330734252929688, + "learning_rate": 9.00305400995691e-07, + "loss": 1.3774, + "step": 22828 + }, + { + "epoch": 2.86, + "grad_norm": 26.706493377685547, + "learning_rate": 8.994686859390035e-07, + "loss": 1.2342, + "step": 22829 + }, + { + "epoch": 2.87, + "grad_norm": 31.059598922729492, + "learning_rate": 8.986319708823162e-07, + "loss": 0.5336, + "step": 22830 + }, + { + "epoch": 2.87, + "grad_norm": 4.273304462432861, + "learning_rate": 8.977952558256287e-07, + "loss": 0.2168, + "step": 22831 + }, + { + "epoch": 2.87, + "grad_norm": 6.656005859375, + "learning_rate": 8.969585407689412e-07, + "loss": 0.3318, + "step": 22832 + }, + { + "epoch": 2.87, + "grad_norm": 33.49717330932617, + "learning_rate": 8.961218257122538e-07, + "loss": 0.4419, + "step": 22833 + }, + { + "epoch": 2.87, + "grad_norm": 12.423048973083496, + "learning_rate": 8.952851106555662e-07, + "loss": 1.1112, + "step": 22834 + }, + { + "epoch": 2.87, + "grad_norm": 32.052955627441406, + "learning_rate": 8.944483955988789e-07, + "loss": 1.0875, + "step": 22835 + }, + { + "epoch": 2.87, + "grad_norm": 29.69832420349121, + "learning_rate": 8.936116805421914e-07, + "loss": 1.3687, + "step": 22836 + }, + { + "epoch": 2.87, + "grad_norm": 4.517812252044678, + "learning_rate": 8.92774965485504e-07, + "loss": 1.0006, + "step": 22837 + }, + { + "epoch": 2.87, + "grad_norm": 66.99334716796875, + "learning_rate": 8.919382504288165e-07, + "loss": 1.8536, + "step": 22838 + }, + { + "epoch": 2.87, + "grad_norm": 13.752928733825684, + "learning_rate": 8.911015353721291e-07, + "loss": 0.5339, + "step": 22839 + }, + { + "epoch": 2.87, + "grad_norm": 16.232959747314453, + "learning_rate": 8.902648203154417e-07, + "loss": 1.1483, + "step": 22840 + }, + { + "epoch": 2.87, + "grad_norm": 64.5910873413086, + "learning_rate": 8.894281052587542e-07, + "loss": 1.1875, + "step": 22841 + }, + { + "epoch": 2.87, + "grad_norm": 13.074568748474121, + "learning_rate": 8.885913902020668e-07, + "loss": 1.5426, + "step": 22842 + }, + { + "epoch": 2.87, + "grad_norm": 12.445037841796875, + "learning_rate": 8.877546751453793e-07, + "loss": 0.3852, + "step": 22843 + }, + { + "epoch": 2.87, + "grad_norm": 13.778498649597168, + "learning_rate": 8.869179600886919e-07, + "loss": 1.0955, + "step": 22844 + }, + { + "epoch": 2.87, + "grad_norm": 5.992719650268555, + "learning_rate": 8.860812450320044e-07, + "loss": 0.3819, + "step": 22845 + }, + { + "epoch": 2.87, + "grad_norm": 15.028879165649414, + "learning_rate": 8.852445299753169e-07, + "loss": 1.5503, + "step": 22846 + }, + { + "epoch": 2.87, + "grad_norm": 14.219990730285645, + "learning_rate": 8.844078149186296e-07, + "loss": 0.4838, + "step": 22847 + }, + { + "epoch": 2.87, + "grad_norm": 9.950181007385254, + "learning_rate": 8.835710998619421e-07, + "loss": 0.681, + "step": 22848 + }, + { + "epoch": 2.87, + "grad_norm": 13.02139663696289, + "learning_rate": 8.827343848052547e-07, + "loss": 0.2376, + "step": 22849 + }, + { + "epoch": 2.87, + "grad_norm": 15.152532577514648, + "learning_rate": 8.818976697485672e-07, + "loss": 0.9848, + "step": 22850 + }, + { + "epoch": 2.87, + "grad_norm": 108.89427185058594, + "learning_rate": 8.810609546918796e-07, + "loss": 1.4961, + "step": 22851 + }, + { + "epoch": 2.87, + "grad_norm": 33.63383102416992, + "learning_rate": 8.802242396351923e-07, + "loss": 1.314, + "step": 22852 + }, + { + "epoch": 2.87, + "grad_norm": 11.617888450622559, + "learning_rate": 8.793875245785048e-07, + "loss": 0.256, + "step": 22853 + }, + { + "epoch": 2.87, + "grad_norm": 41.808021545410156, + "learning_rate": 8.785508095218174e-07, + "loss": 2.0645, + "step": 22854 + }, + { + "epoch": 2.87, + "grad_norm": 3.3971142768859863, + "learning_rate": 8.7771409446513e-07, + "loss": 0.0616, + "step": 22855 + }, + { + "epoch": 2.87, + "grad_norm": 16.903583526611328, + "learning_rate": 8.768773794084426e-07, + "loss": 0.4121, + "step": 22856 + }, + { + "epoch": 2.87, + "grad_norm": 14.798884391784668, + "learning_rate": 8.760406643517551e-07, + "loss": 0.5728, + "step": 22857 + }, + { + "epoch": 2.87, + "grad_norm": 16.237289428710938, + "learning_rate": 8.752039492950676e-07, + "loss": 0.5617, + "step": 22858 + }, + { + "epoch": 2.87, + "grad_norm": 14.962675094604492, + "learning_rate": 8.743672342383803e-07, + "loss": 0.3719, + "step": 22859 + }, + { + "epoch": 2.87, + "grad_norm": 12.828699111938477, + "learning_rate": 8.735305191816927e-07, + "loss": 0.892, + "step": 22860 + }, + { + "epoch": 2.87, + "grad_norm": 2.6358883380889893, + "learning_rate": 8.726938041250053e-07, + "loss": 0.0822, + "step": 22861 + }, + { + "epoch": 2.87, + "grad_norm": 16.86612319946289, + "learning_rate": 8.718570890683178e-07, + "loss": 0.7652, + "step": 22862 + }, + { + "epoch": 2.87, + "grad_norm": 11.26570987701416, + "learning_rate": 8.710203740116305e-07, + "loss": 1.0248, + "step": 22863 + }, + { + "epoch": 2.87, + "grad_norm": 3.653964042663574, + "learning_rate": 8.70183658954943e-07, + "loss": 0.8155, + "step": 22864 + }, + { + "epoch": 2.87, + "grad_norm": 13.583930969238281, + "learning_rate": 8.693469438982555e-07, + "loss": 0.9328, + "step": 22865 + }, + { + "epoch": 2.87, + "grad_norm": 12.397669792175293, + "learning_rate": 8.685102288415681e-07, + "loss": 1.7724, + "step": 22866 + }, + { + "epoch": 2.87, + "grad_norm": 16.839582443237305, + "learning_rate": 8.676735137848806e-07, + "loss": 0.7151, + "step": 22867 + }, + { + "epoch": 2.87, + "grad_norm": 17.016077041625977, + "learning_rate": 8.668367987281932e-07, + "loss": 1.2868, + "step": 22868 + }, + { + "epoch": 2.87, + "grad_norm": 39.06998062133789, + "learning_rate": 8.660000836715057e-07, + "loss": 1.0109, + "step": 22869 + }, + { + "epoch": 2.87, + "grad_norm": 2.039919853210449, + "learning_rate": 8.651633686148182e-07, + "loss": 0.0619, + "step": 22870 + }, + { + "epoch": 2.87, + "grad_norm": 27.89397430419922, + "learning_rate": 8.643266535581309e-07, + "loss": 0.8545, + "step": 22871 + }, + { + "epoch": 2.87, + "grad_norm": 8.406256675720215, + "learning_rate": 8.634899385014434e-07, + "loss": 0.553, + "step": 22872 + }, + { + "epoch": 2.87, + "grad_norm": 12.104752540588379, + "learning_rate": 8.62653223444756e-07, + "loss": 0.4784, + "step": 22873 + }, + { + "epoch": 2.87, + "grad_norm": 13.156915664672852, + "learning_rate": 8.618165083880685e-07, + "loss": 0.8673, + "step": 22874 + }, + { + "epoch": 2.87, + "grad_norm": 150.2317657470703, + "learning_rate": 8.609797933313812e-07, + "loss": 2.7136, + "step": 22875 + }, + { + "epoch": 2.87, + "grad_norm": 11.70964241027832, + "learning_rate": 8.601430782746937e-07, + "loss": 1.5755, + "step": 22876 + }, + { + "epoch": 2.87, + "grad_norm": 25.56842803955078, + "learning_rate": 8.593063632180061e-07, + "loss": 1.2766, + "step": 22877 + }, + { + "epoch": 2.87, + "grad_norm": 4.928918838500977, + "learning_rate": 8.584696481613187e-07, + "loss": 0.391, + "step": 22878 + }, + { + "epoch": 2.87, + "grad_norm": 5.155158042907715, + "learning_rate": 8.576329331046312e-07, + "loss": 0.2252, + "step": 22879 + }, + { + "epoch": 2.87, + "grad_norm": 37.66831588745117, + "learning_rate": 8.567962180479439e-07, + "loss": 1.4692, + "step": 22880 + }, + { + "epoch": 2.87, + "grad_norm": 50.934024810791016, + "learning_rate": 8.559595029912564e-07, + "loss": 2.9425, + "step": 22881 + }, + { + "epoch": 2.87, + "grad_norm": 37.8398323059082, + "learning_rate": 8.55122787934569e-07, + "loss": 1.5301, + "step": 22882 + }, + { + "epoch": 2.87, + "grad_norm": 16.43904685974121, + "learning_rate": 8.542860728778815e-07, + "loss": 0.9197, + "step": 22883 + }, + { + "epoch": 2.87, + "grad_norm": 29.151037216186523, + "learning_rate": 8.534493578211941e-07, + "loss": 3.2912, + "step": 22884 + }, + { + "epoch": 2.87, + "grad_norm": 3.3625638484954834, + "learning_rate": 8.526126427645066e-07, + "loss": 0.0577, + "step": 22885 + }, + { + "epoch": 2.87, + "grad_norm": 17.012426376342773, + "learning_rate": 8.517759277078191e-07, + "loss": 0.5314, + "step": 22886 + }, + { + "epoch": 2.87, + "grad_norm": 6.2187628746032715, + "learning_rate": 8.509392126511318e-07, + "loss": 0.7742, + "step": 22887 + }, + { + "epoch": 2.87, + "grad_norm": 2.5364348888397217, + "learning_rate": 8.501024975944443e-07, + "loss": 0.0683, + "step": 22888 + }, + { + "epoch": 2.87, + "grad_norm": 10.555633544921875, + "learning_rate": 8.492657825377569e-07, + "loss": 1.6642, + "step": 22889 + }, + { + "epoch": 2.87, + "grad_norm": 24.966533660888672, + "learning_rate": 8.484290674810694e-07, + "loss": 0.5993, + "step": 22890 + }, + { + "epoch": 2.87, + "grad_norm": 131.3794403076172, + "learning_rate": 8.475923524243819e-07, + "loss": 1.2467, + "step": 22891 + }, + { + "epoch": 2.87, + "grad_norm": 5.629092693328857, + "learning_rate": 8.467556373676946e-07, + "loss": 0.6006, + "step": 22892 + }, + { + "epoch": 2.87, + "grad_norm": 11.908555030822754, + "learning_rate": 8.45918922311007e-07, + "loss": 0.2282, + "step": 22893 + }, + { + "epoch": 2.87, + "grad_norm": 50.8621940612793, + "learning_rate": 8.450822072543196e-07, + "loss": 4.1995, + "step": 22894 + }, + { + "epoch": 2.87, + "grad_norm": 5.015515327453613, + "learning_rate": 8.442454921976321e-07, + "loss": 0.1471, + "step": 22895 + }, + { + "epoch": 2.87, + "grad_norm": 279.75604248046875, + "learning_rate": 8.434087771409446e-07, + "loss": 1.1202, + "step": 22896 + }, + { + "epoch": 2.87, + "grad_norm": 15.043505668640137, + "learning_rate": 8.425720620842573e-07, + "loss": 0.406, + "step": 22897 + }, + { + "epoch": 2.87, + "grad_norm": 9.919713973999023, + "learning_rate": 8.417353470275698e-07, + "loss": 0.3008, + "step": 22898 + }, + { + "epoch": 2.87, + "grad_norm": 11.66871166229248, + "learning_rate": 8.408986319708824e-07, + "loss": 0.3378, + "step": 22899 + }, + { + "epoch": 2.87, + "grad_norm": 9.860762596130371, + "learning_rate": 8.40061916914195e-07, + "loss": 0.2331, + "step": 22900 + }, + { + "epoch": 2.87, + "grad_norm": 21.140554428100586, + "learning_rate": 8.392252018575076e-07, + "loss": 2.2822, + "step": 22901 + }, + { + "epoch": 2.87, + "grad_norm": 6.3714823722839355, + "learning_rate": 8.3838848680082e-07, + "loss": 1.3731, + "step": 22902 + }, + { + "epoch": 2.87, + "grad_norm": 8.593062400817871, + "learning_rate": 8.375517717441325e-07, + "loss": 0.923, + "step": 22903 + }, + { + "epoch": 2.87, + "grad_norm": 6.22232723236084, + "learning_rate": 8.367150566874452e-07, + "loss": 0.1529, + "step": 22904 + }, + { + "epoch": 2.87, + "grad_norm": 18.07470703125, + "learning_rate": 8.358783416307577e-07, + "loss": 0.7162, + "step": 22905 + }, + { + "epoch": 2.87, + "grad_norm": 12.35638427734375, + "learning_rate": 8.350416265740703e-07, + "loss": 0.2988, + "step": 22906 + }, + { + "epoch": 2.87, + "grad_norm": 11.684820175170898, + "learning_rate": 8.342049115173828e-07, + "loss": 0.5593, + "step": 22907 + }, + { + "epoch": 2.87, + "grad_norm": 39.3046760559082, + "learning_rate": 8.333681964606955e-07, + "loss": 1.6821, + "step": 22908 + }, + { + "epoch": 2.88, + "grad_norm": 12.850509643554688, + "learning_rate": 8.32531481404008e-07, + "loss": 0.6752, + "step": 22909 + }, + { + "epoch": 2.88, + "grad_norm": 15.1165189743042, + "learning_rate": 8.316947663473204e-07, + "loss": 0.1947, + "step": 22910 + }, + { + "epoch": 2.88, + "grad_norm": 13.694976806640625, + "learning_rate": 8.30858051290633e-07, + "loss": 1.8173, + "step": 22911 + }, + { + "epoch": 2.88, + "grad_norm": 76.21894836425781, + "learning_rate": 8.300213362339455e-07, + "loss": 1.1363, + "step": 22912 + }, + { + "epoch": 2.88, + "grad_norm": 20.433887481689453, + "learning_rate": 8.291846211772582e-07, + "loss": 1.649, + "step": 22913 + }, + { + "epoch": 2.88, + "grad_norm": 16.72854232788086, + "learning_rate": 8.283479061205707e-07, + "loss": 0.7471, + "step": 22914 + }, + { + "epoch": 2.88, + "grad_norm": 41.515411376953125, + "learning_rate": 8.275111910638832e-07, + "loss": 0.8162, + "step": 22915 + }, + { + "epoch": 2.88, + "grad_norm": 10.91340160369873, + "learning_rate": 8.266744760071959e-07, + "loss": 0.2465, + "step": 22916 + }, + { + "epoch": 2.88, + "grad_norm": 1.5930448770523071, + "learning_rate": 8.258377609505084e-07, + "loss": 0.0162, + "step": 22917 + }, + { + "epoch": 2.88, + "grad_norm": 24.409202575683594, + "learning_rate": 8.25001045893821e-07, + "loss": 1.3316, + "step": 22918 + }, + { + "epoch": 2.88, + "grad_norm": 15.13890552520752, + "learning_rate": 8.241643308371334e-07, + "loss": 1.8263, + "step": 22919 + }, + { + "epoch": 2.88, + "grad_norm": 29.22232437133789, + "learning_rate": 8.233276157804461e-07, + "loss": 1.3149, + "step": 22920 + }, + { + "epoch": 2.88, + "grad_norm": 22.16623878479004, + "learning_rate": 8.224909007237586e-07, + "loss": 1.1153, + "step": 22921 + }, + { + "epoch": 2.88, + "grad_norm": 5.999701023101807, + "learning_rate": 8.216541856670711e-07, + "loss": 0.2013, + "step": 22922 + }, + { + "epoch": 2.88, + "grad_norm": 34.92793273925781, + "learning_rate": 8.208174706103837e-07, + "loss": 1.1674, + "step": 22923 + }, + { + "epoch": 2.88, + "grad_norm": 10.559051513671875, + "learning_rate": 8.199807555536962e-07, + "loss": 0.1636, + "step": 22924 + }, + { + "epoch": 2.88, + "grad_norm": 12.62888240814209, + "learning_rate": 8.191440404970089e-07, + "loss": 0.9471, + "step": 22925 + }, + { + "epoch": 2.88, + "grad_norm": 34.148990631103516, + "learning_rate": 8.183073254403214e-07, + "loss": 1.0717, + "step": 22926 + }, + { + "epoch": 2.88, + "grad_norm": 13.00924301147461, + "learning_rate": 8.174706103836339e-07, + "loss": 0.7507, + "step": 22927 + }, + { + "epoch": 2.88, + "grad_norm": 15.056406021118164, + "learning_rate": 8.166338953269464e-07, + "loss": 0.2511, + "step": 22928 + }, + { + "epoch": 2.88, + "grad_norm": 15.062324523925781, + "learning_rate": 8.15797180270259e-07, + "loss": 0.6409, + "step": 22929 + }, + { + "epoch": 2.88, + "grad_norm": 18.487070083618164, + "learning_rate": 8.149604652135716e-07, + "loss": 1.1899, + "step": 22930 + }, + { + "epoch": 2.88, + "grad_norm": 15.771196365356445, + "learning_rate": 8.141237501568841e-07, + "loss": 1.087, + "step": 22931 + }, + { + "epoch": 2.88, + "grad_norm": 6.037031173706055, + "learning_rate": 8.132870351001968e-07, + "loss": 0.303, + "step": 22932 + }, + { + "epoch": 2.88, + "grad_norm": 26.775182723999023, + "learning_rate": 8.124503200435093e-07, + "loss": 1.1372, + "step": 22933 + }, + { + "epoch": 2.88, + "grad_norm": 15.645440101623535, + "learning_rate": 8.116136049868219e-07, + "loss": 0.6334, + "step": 22934 + }, + { + "epoch": 2.88, + "grad_norm": 15.815534591674805, + "learning_rate": 8.107768899301343e-07, + "loss": 0.7138, + "step": 22935 + }, + { + "epoch": 2.88, + "grad_norm": 7.859343528747559, + "learning_rate": 8.099401748734468e-07, + "loss": 0.801, + "step": 22936 + }, + { + "epoch": 2.88, + "grad_norm": 2.0807950496673584, + "learning_rate": 8.091034598167595e-07, + "loss": 0.0397, + "step": 22937 + }, + { + "epoch": 2.88, + "grad_norm": 25.227819442749023, + "learning_rate": 8.08266744760072e-07, + "loss": 1.1612, + "step": 22938 + }, + { + "epoch": 2.88, + "grad_norm": 8.388067245483398, + "learning_rate": 8.074300297033846e-07, + "loss": 1.2875, + "step": 22939 + }, + { + "epoch": 2.88, + "grad_norm": 20.248489379882812, + "learning_rate": 8.065933146466971e-07, + "loss": 0.9424, + "step": 22940 + }, + { + "epoch": 2.88, + "grad_norm": 16.774770736694336, + "learning_rate": 8.057565995900097e-07, + "loss": 0.7271, + "step": 22941 + }, + { + "epoch": 2.88, + "grad_norm": 86.81754302978516, + "learning_rate": 8.049198845333223e-07, + "loss": 0.5, + "step": 22942 + }, + { + "epoch": 2.88, + "grad_norm": 11.418489456176758, + "learning_rate": 8.040831694766347e-07, + "loss": 0.5448, + "step": 22943 + }, + { + "epoch": 2.88, + "grad_norm": 4.479661464691162, + "learning_rate": 8.032464544199473e-07, + "loss": 0.0863, + "step": 22944 + }, + { + "epoch": 2.88, + "grad_norm": 10.140596389770508, + "learning_rate": 8.024097393632599e-07, + "loss": 0.3461, + "step": 22945 + }, + { + "epoch": 2.88, + "grad_norm": 10.982282638549805, + "learning_rate": 8.015730243065725e-07, + "loss": 0.435, + "step": 22946 + }, + { + "epoch": 2.88, + "grad_norm": 33.08202362060547, + "learning_rate": 8.00736309249885e-07, + "loss": 1.4527, + "step": 22947 + }, + { + "epoch": 2.88, + "grad_norm": 23.6079044342041, + "learning_rate": 7.998995941931975e-07, + "loss": 0.8683, + "step": 22948 + }, + { + "epoch": 2.88, + "grad_norm": 6.480509281158447, + "learning_rate": 7.990628791365102e-07, + "loss": 0.8845, + "step": 22949 + }, + { + "epoch": 2.88, + "grad_norm": 16.13175392150879, + "learning_rate": 7.982261640798227e-07, + "loss": 0.5201, + "step": 22950 + }, + { + "epoch": 2.88, + "grad_norm": 4.348287582397461, + "learning_rate": 7.973894490231353e-07, + "loss": 0.1425, + "step": 22951 + }, + { + "epoch": 2.88, + "grad_norm": 8.483288764953613, + "learning_rate": 7.965527339664477e-07, + "loss": 0.8687, + "step": 22952 + }, + { + "epoch": 2.88, + "grad_norm": 23.86382484436035, + "learning_rate": 7.957160189097604e-07, + "loss": 0.444, + "step": 22953 + }, + { + "epoch": 2.88, + "grad_norm": 12.939167976379395, + "learning_rate": 7.948793038530729e-07, + "loss": 0.7317, + "step": 22954 + }, + { + "epoch": 2.88, + "grad_norm": 14.58227825164795, + "learning_rate": 7.940425887963854e-07, + "loss": 0.3941, + "step": 22955 + }, + { + "epoch": 2.88, + "grad_norm": 56.66604232788086, + "learning_rate": 7.93205873739698e-07, + "loss": 2.5979, + "step": 22956 + }, + { + "epoch": 2.88, + "grad_norm": 13.80915355682373, + "learning_rate": 7.923691586830106e-07, + "loss": 1.4544, + "step": 22957 + }, + { + "epoch": 2.88, + "grad_norm": 3.105476140975952, + "learning_rate": 7.915324436263232e-07, + "loss": 0.1329, + "step": 22958 + }, + { + "epoch": 2.88, + "grad_norm": 29.131650924682617, + "learning_rate": 7.906957285696357e-07, + "loss": 0.6263, + "step": 22959 + }, + { + "epoch": 2.88, + "grad_norm": 9.802521705627441, + "learning_rate": 7.898590135129481e-07, + "loss": 0.592, + "step": 22960 + }, + { + "epoch": 2.88, + "grad_norm": 9.273344039916992, + "learning_rate": 7.890222984562608e-07, + "loss": 0.2079, + "step": 22961 + }, + { + "epoch": 2.88, + "grad_norm": 5.975650787353516, + "learning_rate": 7.881855833995733e-07, + "loss": 0.6229, + "step": 22962 + }, + { + "epoch": 2.88, + "grad_norm": 126.74982452392578, + "learning_rate": 7.873488683428859e-07, + "loss": 2.1825, + "step": 22963 + }, + { + "epoch": 2.88, + "grad_norm": 11.358890533447266, + "learning_rate": 7.865121532861984e-07, + "loss": 0.4745, + "step": 22964 + }, + { + "epoch": 2.88, + "grad_norm": 24.60085105895996, + "learning_rate": 7.856754382295111e-07, + "loss": 1.0467, + "step": 22965 + }, + { + "epoch": 2.88, + "grad_norm": 7.778239727020264, + "learning_rate": 7.848387231728236e-07, + "loss": 0.6853, + "step": 22966 + }, + { + "epoch": 2.88, + "grad_norm": 26.256319046020508, + "learning_rate": 7.840020081161361e-07, + "loss": 1.8415, + "step": 22967 + }, + { + "epoch": 2.88, + "grad_norm": 11.514891624450684, + "learning_rate": 7.831652930594487e-07, + "loss": 0.3687, + "step": 22968 + }, + { + "epoch": 2.88, + "grad_norm": 554.5360717773438, + "learning_rate": 7.823285780027611e-07, + "loss": 1.308, + "step": 22969 + }, + { + "epoch": 2.88, + "grad_norm": 14.025053024291992, + "learning_rate": 7.814918629460738e-07, + "loss": 0.9177, + "step": 22970 + }, + { + "epoch": 2.88, + "grad_norm": 16.900421142578125, + "learning_rate": 7.806551478893863e-07, + "loss": 0.9049, + "step": 22971 + }, + { + "epoch": 2.88, + "grad_norm": 19.033496856689453, + "learning_rate": 7.798184328326989e-07, + "loss": 0.6428, + "step": 22972 + }, + { + "epoch": 2.88, + "grad_norm": 8.236845970153809, + "learning_rate": 7.789817177760115e-07, + "loss": 0.4224, + "step": 22973 + }, + { + "epoch": 2.88, + "grad_norm": 18.759428024291992, + "learning_rate": 7.78145002719324e-07, + "loss": 0.9096, + "step": 22974 + }, + { + "epoch": 2.88, + "grad_norm": 7.645739555358887, + "learning_rate": 7.773082876626366e-07, + "loss": 0.509, + "step": 22975 + }, + { + "epoch": 2.88, + "grad_norm": 21.72773551940918, + "learning_rate": 7.764715726059491e-07, + "loss": 1.2064, + "step": 22976 + }, + { + "epoch": 2.88, + "grad_norm": 4.316758155822754, + "learning_rate": 7.756348575492618e-07, + "loss": 0.2389, + "step": 22977 + }, + { + "epoch": 2.88, + "grad_norm": 13.951539993286133, + "learning_rate": 7.747981424925742e-07, + "loss": 0.4588, + "step": 22978 + }, + { + "epoch": 2.88, + "grad_norm": 5.033867359161377, + "learning_rate": 7.739614274358867e-07, + "loss": 0.769, + "step": 22979 + }, + { + "epoch": 2.88, + "grad_norm": 19.416301727294922, + "learning_rate": 7.731247123791993e-07, + "loss": 3.3767, + "step": 22980 + }, + { + "epoch": 2.88, + "grad_norm": 18.113895416259766, + "learning_rate": 7.722879973225118e-07, + "loss": 0.5824, + "step": 22981 + }, + { + "epoch": 2.88, + "grad_norm": 23.813232421875, + "learning_rate": 7.714512822658245e-07, + "loss": 0.7504, + "step": 22982 + }, + { + "epoch": 2.88, + "grad_norm": 19.223020553588867, + "learning_rate": 7.70614567209137e-07, + "loss": 1.0433, + "step": 22983 + }, + { + "epoch": 2.88, + "grad_norm": 6.926052093505859, + "learning_rate": 7.697778521524496e-07, + "loss": 0.5064, + "step": 22984 + }, + { + "epoch": 2.88, + "grad_norm": 9.047198295593262, + "learning_rate": 7.68941137095762e-07, + "loss": 1.1277, + "step": 22985 + }, + { + "epoch": 2.88, + "grad_norm": 5.469225883483887, + "learning_rate": 7.681044220390746e-07, + "loss": 0.258, + "step": 22986 + }, + { + "epoch": 2.88, + "grad_norm": 10.57030200958252, + "learning_rate": 7.672677069823872e-07, + "loss": 0.8368, + "step": 22987 + }, + { + "epoch": 2.88, + "grad_norm": 21.639989852905273, + "learning_rate": 7.664309919256997e-07, + "loss": 2.6035, + "step": 22988 + }, + { + "epoch": 2.89, + "grad_norm": 13.677913665771484, + "learning_rate": 7.655942768690124e-07, + "loss": 0.5595, + "step": 22989 + }, + { + "epoch": 2.89, + "grad_norm": 1.7012652158737183, + "learning_rate": 7.647575618123249e-07, + "loss": 0.0864, + "step": 22990 + }, + { + "epoch": 2.89, + "grad_norm": 22.131879806518555, + "learning_rate": 7.639208467556375e-07, + "loss": 2.1319, + "step": 22991 + }, + { + "epoch": 2.89, + "grad_norm": 20.789060592651367, + "learning_rate": 7.6308413169895e-07, + "loss": 0.8272, + "step": 22992 + }, + { + "epoch": 2.89, + "grad_norm": 13.723470687866211, + "learning_rate": 7.622474166422625e-07, + "loss": 0.8894, + "step": 22993 + }, + { + "epoch": 2.89, + "grad_norm": 16.123985290527344, + "learning_rate": 7.614107015855751e-07, + "loss": 0.7385, + "step": 22994 + }, + { + "epoch": 2.89, + "grad_norm": 6.176941871643066, + "learning_rate": 7.605739865288876e-07, + "loss": 1.6784, + "step": 22995 + }, + { + "epoch": 2.89, + "grad_norm": 20.015087127685547, + "learning_rate": 7.597372714722002e-07, + "loss": 1.2517, + "step": 22996 + }, + { + "epoch": 2.89, + "grad_norm": 24.278839111328125, + "learning_rate": 7.589005564155127e-07, + "loss": 1.9441, + "step": 22997 + }, + { + "epoch": 2.89, + "grad_norm": 8.983095169067383, + "learning_rate": 7.580638413588254e-07, + "loss": 0.687, + "step": 22998 + }, + { + "epoch": 2.89, + "grad_norm": 4.301553726196289, + "learning_rate": 7.572271263021379e-07, + "loss": 0.4347, + "step": 22999 + }, + { + "epoch": 2.89, + "grad_norm": 15.785185813903809, + "learning_rate": 7.563904112454504e-07, + "loss": 1.4665, + "step": 23000 + }, + { + "epoch": 2.89, + "grad_norm": 30.19902801513672, + "learning_rate": 7.55553696188763e-07, + "loss": 1.1447, + "step": 23001 + }, + { + "epoch": 2.89, + "grad_norm": 12.137067794799805, + "learning_rate": 7.547169811320755e-07, + "loss": 0.5485, + "step": 23002 + }, + { + "epoch": 2.89, + "grad_norm": 25.211074829101562, + "learning_rate": 7.538802660753881e-07, + "loss": 2.0282, + "step": 23003 + }, + { + "epoch": 2.89, + "grad_norm": 9.797856330871582, + "learning_rate": 7.530435510187006e-07, + "loss": 0.5698, + "step": 23004 + }, + { + "epoch": 2.89, + "grad_norm": 20.488197326660156, + "learning_rate": 7.522068359620131e-07, + "loss": 0.782, + "step": 23005 + }, + { + "epoch": 2.89, + "grad_norm": 35.42162322998047, + "learning_rate": 7.513701209053258e-07, + "loss": 1.8208, + "step": 23006 + }, + { + "epoch": 2.89, + "grad_norm": 12.245278358459473, + "learning_rate": 7.505334058486383e-07, + "loss": 1.6764, + "step": 23007 + }, + { + "epoch": 2.89, + "grad_norm": 29.070436477661133, + "learning_rate": 7.496966907919509e-07, + "loss": 1.7391, + "step": 23008 + }, + { + "epoch": 2.89, + "grad_norm": 485.6443176269531, + "learning_rate": 7.488599757352634e-07, + "loss": 1.8613, + "step": 23009 + }, + { + "epoch": 2.89, + "grad_norm": 6.379110813140869, + "learning_rate": 7.480232606785761e-07, + "loss": 0.3097, + "step": 23010 + }, + { + "epoch": 2.89, + "grad_norm": 21.31460189819336, + "learning_rate": 7.471865456218885e-07, + "loss": 2.8207, + "step": 23011 + }, + { + "epoch": 2.89, + "grad_norm": 20.664499282836914, + "learning_rate": 7.46349830565201e-07, + "loss": 2.4824, + "step": 23012 + }, + { + "epoch": 2.89, + "grad_norm": 40.1668815612793, + "learning_rate": 7.455131155085136e-07, + "loss": 3.3693, + "step": 23013 + }, + { + "epoch": 2.89, + "grad_norm": 21.943397521972656, + "learning_rate": 7.446764004518262e-07, + "loss": 0.3427, + "step": 23014 + }, + { + "epoch": 2.89, + "grad_norm": 9.698728561401367, + "learning_rate": 7.438396853951388e-07, + "loss": 0.5873, + "step": 23015 + }, + { + "epoch": 2.89, + "grad_norm": 17.50680160522461, + "learning_rate": 7.430029703384513e-07, + "loss": 1.2329, + "step": 23016 + }, + { + "epoch": 2.89, + "grad_norm": 20.397890090942383, + "learning_rate": 7.421662552817639e-07, + "loss": 2.4361, + "step": 23017 + }, + { + "epoch": 2.89, + "grad_norm": 6.906713962554932, + "learning_rate": 7.413295402250765e-07, + "loss": 0.2127, + "step": 23018 + }, + { + "epoch": 2.89, + "grad_norm": 7.076891899108887, + "learning_rate": 7.404928251683889e-07, + "loss": 1.7859, + "step": 23019 + }, + { + "epoch": 2.89, + "grad_norm": 14.63764476776123, + "learning_rate": 7.396561101117015e-07, + "loss": 0.3831, + "step": 23020 + }, + { + "epoch": 2.89, + "grad_norm": 6.63559627532959, + "learning_rate": 7.38819395055014e-07, + "loss": 1.1545, + "step": 23021 + }, + { + "epoch": 2.89, + "grad_norm": 20.352502822875977, + "learning_rate": 7.379826799983267e-07, + "loss": 0.8208, + "step": 23022 + }, + { + "epoch": 2.89, + "grad_norm": 15.528678894042969, + "learning_rate": 7.371459649416392e-07, + "loss": 0.539, + "step": 23023 + }, + { + "epoch": 2.89, + "grad_norm": 2.7121853828430176, + "learning_rate": 7.363092498849517e-07, + "loss": 0.1615, + "step": 23024 + }, + { + "epoch": 2.89, + "grad_norm": 14.02507209777832, + "learning_rate": 7.354725348282643e-07, + "loss": 0.5469, + "step": 23025 + }, + { + "epoch": 2.89, + "grad_norm": 31.693456649780273, + "learning_rate": 7.346358197715768e-07, + "loss": 1.0745, + "step": 23026 + }, + { + "epoch": 2.89, + "grad_norm": 35.96394348144531, + "learning_rate": 7.337991047148895e-07, + "loss": 1.2756, + "step": 23027 + }, + { + "epoch": 2.89, + "grad_norm": 13.627449989318848, + "learning_rate": 7.329623896582019e-07, + "loss": 0.9638, + "step": 23028 + }, + { + "epoch": 2.89, + "grad_norm": 14.99082088470459, + "learning_rate": 7.321256746015145e-07, + "loss": 1.5079, + "step": 23029 + }, + { + "epoch": 2.89, + "grad_norm": 11.57680606842041, + "learning_rate": 7.312889595448271e-07, + "loss": 0.282, + "step": 23030 + }, + { + "epoch": 2.89, + "grad_norm": 8.11372184753418, + "learning_rate": 7.304522444881396e-07, + "loss": 0.6722, + "step": 23031 + }, + { + "epoch": 2.89, + "grad_norm": 17.07073402404785, + "learning_rate": 7.296155294314522e-07, + "loss": 1.3778, + "step": 23032 + }, + { + "epoch": 2.89, + "grad_norm": 19.17386817932129, + "learning_rate": 7.287788143747647e-07, + "loss": 1.5862, + "step": 23033 + }, + { + "epoch": 2.89, + "grad_norm": 30.804306030273438, + "learning_rate": 7.279420993180774e-07, + "loss": 1.9452, + "step": 23034 + }, + { + "epoch": 2.89, + "grad_norm": 6.854452133178711, + "learning_rate": 7.271053842613899e-07, + "loss": 0.231, + "step": 23035 + }, + { + "epoch": 2.89, + "grad_norm": 9.98362922668457, + "learning_rate": 7.262686692047024e-07, + "loss": 0.3267, + "step": 23036 + }, + { + "epoch": 2.89, + "grad_norm": 10.96481704711914, + "learning_rate": 7.254319541480149e-07, + "loss": 0.6027, + "step": 23037 + }, + { + "epoch": 2.89, + "grad_norm": 21.294233322143555, + "learning_rate": 7.245952390913274e-07, + "loss": 0.8939, + "step": 23038 + }, + { + "epoch": 2.89, + "grad_norm": 15.081424713134766, + "learning_rate": 7.237585240346401e-07, + "loss": 0.5742, + "step": 23039 + }, + { + "epoch": 2.89, + "grad_norm": 11.576356887817383, + "learning_rate": 7.229218089779526e-07, + "loss": 1.1172, + "step": 23040 + }, + { + "epoch": 2.89, + "grad_norm": 306.48291015625, + "learning_rate": 7.220850939212652e-07, + "loss": 3.5268, + "step": 23041 + }, + { + "epoch": 2.89, + "grad_norm": 13.041169166564941, + "learning_rate": 7.212483788645777e-07, + "loss": 0.3596, + "step": 23042 + }, + { + "epoch": 2.89, + "grad_norm": 3.398914337158203, + "learning_rate": 7.204116638078904e-07, + "loss": 0.2012, + "step": 23043 + }, + { + "epoch": 2.89, + "grad_norm": 9.716375350952148, + "learning_rate": 7.195749487512028e-07, + "loss": 0.3987, + "step": 23044 + }, + { + "epoch": 2.89, + "grad_norm": 19.549253463745117, + "learning_rate": 7.187382336945153e-07, + "loss": 1.277, + "step": 23045 + }, + { + "epoch": 2.89, + "grad_norm": 27.09036636352539, + "learning_rate": 7.17901518637828e-07, + "loss": 1.2798, + "step": 23046 + }, + { + "epoch": 2.89, + "grad_norm": 5.830048561096191, + "learning_rate": 7.170648035811405e-07, + "loss": 0.2446, + "step": 23047 + }, + { + "epoch": 2.89, + "grad_norm": 126.25982666015625, + "learning_rate": 7.162280885244531e-07, + "loss": 1.1631, + "step": 23048 + }, + { + "epoch": 2.89, + "grad_norm": 38.02354049682617, + "learning_rate": 7.153913734677656e-07, + "loss": 0.8995, + "step": 23049 + }, + { + "epoch": 2.89, + "grad_norm": 9.047268867492676, + "learning_rate": 7.145546584110781e-07, + "loss": 0.3694, + "step": 23050 + }, + { + "epoch": 2.89, + "grad_norm": 21.358596801757812, + "learning_rate": 7.137179433543908e-07, + "loss": 1.0105, + "step": 23051 + }, + { + "epoch": 2.89, + "grad_norm": 9.097389221191406, + "learning_rate": 7.128812282977033e-07, + "loss": 0.5755, + "step": 23052 + }, + { + "epoch": 2.89, + "grad_norm": 25.98590850830078, + "learning_rate": 7.120445132410158e-07, + "loss": 0.4946, + "step": 23053 + }, + { + "epoch": 2.89, + "grad_norm": 10.612198829650879, + "learning_rate": 7.112077981843283e-07, + "loss": 0.684, + "step": 23054 + }, + { + "epoch": 2.89, + "grad_norm": 10.165337562561035, + "learning_rate": 7.10371083127641e-07, + "loss": 0.9295, + "step": 23055 + }, + { + "epoch": 2.89, + "grad_norm": 3.873164415359497, + "learning_rate": 7.095343680709535e-07, + "loss": 0.1756, + "step": 23056 + }, + { + "epoch": 2.89, + "grad_norm": 16.203018188476562, + "learning_rate": 7.08697653014266e-07, + "loss": 1.1577, + "step": 23057 + }, + { + "epoch": 2.89, + "grad_norm": 6.412531852722168, + "learning_rate": 7.078609379575786e-07, + "loss": 0.2554, + "step": 23058 + }, + { + "epoch": 2.89, + "grad_norm": 45.4237174987793, + "learning_rate": 7.070242229008912e-07, + "loss": 1.6425, + "step": 23059 + }, + { + "epoch": 2.89, + "grad_norm": 15.77699089050293, + "learning_rate": 7.061875078442038e-07, + "loss": 0.9989, + "step": 23060 + }, + { + "epoch": 2.89, + "grad_norm": 7.485088348388672, + "learning_rate": 7.053507927875162e-07, + "loss": 0.3462, + "step": 23061 + }, + { + "epoch": 2.89, + "grad_norm": 11.70754337310791, + "learning_rate": 7.045140777308289e-07, + "loss": 0.6414, + "step": 23062 + }, + { + "epoch": 2.89, + "grad_norm": 22.87289047241211, + "learning_rate": 7.036773626741414e-07, + "loss": 0.3964, + "step": 23063 + }, + { + "epoch": 2.89, + "grad_norm": 9.068693161010742, + "learning_rate": 7.028406476174539e-07, + "loss": 0.6969, + "step": 23064 + }, + { + "epoch": 2.89, + "grad_norm": 302.4141845703125, + "learning_rate": 7.020039325607665e-07, + "loss": 1.5057, + "step": 23065 + }, + { + "epoch": 2.89, + "grad_norm": 8.26516056060791, + "learning_rate": 7.01167217504079e-07, + "loss": 0.3393, + "step": 23066 + }, + { + "epoch": 2.89, + "grad_norm": 13.177416801452637, + "learning_rate": 7.003305024473917e-07, + "loss": 1.6422, + "step": 23067 + }, + { + "epoch": 2.89, + "grad_norm": 18.07768440246582, + "learning_rate": 6.994937873907042e-07, + "loss": 0.8876, + "step": 23068 + }, + { + "epoch": 2.9, + "grad_norm": 25.200332641601562, + "learning_rate": 6.986570723340166e-07, + "loss": 0.5052, + "step": 23069 + }, + { + "epoch": 2.9, + "grad_norm": 8.668069839477539, + "learning_rate": 6.978203572773292e-07, + "loss": 0.3231, + "step": 23070 + }, + { + "epoch": 2.9, + "grad_norm": 12.362847328186035, + "learning_rate": 6.969836422206417e-07, + "loss": 0.9623, + "step": 23071 + }, + { + "epoch": 2.9, + "grad_norm": 7.626944065093994, + "learning_rate": 6.961469271639544e-07, + "loss": 0.6111, + "step": 23072 + }, + { + "epoch": 2.9, + "grad_norm": 8.494346618652344, + "learning_rate": 6.953102121072669e-07, + "loss": 0.2686, + "step": 23073 + }, + { + "epoch": 2.9, + "grad_norm": 39.15300369262695, + "learning_rate": 6.944734970505795e-07, + "loss": 0.6873, + "step": 23074 + }, + { + "epoch": 2.9, + "grad_norm": 11.612045288085938, + "learning_rate": 6.936367819938921e-07, + "loss": 1.364, + "step": 23075 + }, + { + "epoch": 2.9, + "grad_norm": 11.220239639282227, + "learning_rate": 6.928000669372046e-07, + "loss": 0.6776, + "step": 23076 + }, + { + "epoch": 2.9, + "grad_norm": 17.403223037719727, + "learning_rate": 6.919633518805172e-07, + "loss": 0.8078, + "step": 23077 + }, + { + "epoch": 2.9, + "grad_norm": 9.634208679199219, + "learning_rate": 6.911266368238296e-07, + "loss": 0.2619, + "step": 23078 + }, + { + "epoch": 2.9, + "grad_norm": 16.1423282623291, + "learning_rate": 6.902899217671423e-07, + "loss": 0.6613, + "step": 23079 + }, + { + "epoch": 2.9, + "grad_norm": 15.611533164978027, + "learning_rate": 6.894532067104548e-07, + "loss": 0.9895, + "step": 23080 + }, + { + "epoch": 2.9, + "grad_norm": 29.376800537109375, + "learning_rate": 6.886164916537674e-07, + "loss": 1.5025, + "step": 23081 + }, + { + "epoch": 2.9, + "grad_norm": 22.50613021850586, + "learning_rate": 6.877797765970799e-07, + "loss": 1.1317, + "step": 23082 + }, + { + "epoch": 2.9, + "grad_norm": 296.834716796875, + "learning_rate": 6.869430615403924e-07, + "loss": 1.7144, + "step": 23083 + }, + { + "epoch": 2.9, + "grad_norm": 7.580447673797607, + "learning_rate": 6.861063464837051e-07, + "loss": 1.5265, + "step": 23084 + }, + { + "epoch": 2.9, + "grad_norm": 26.02106475830078, + "learning_rate": 6.852696314270176e-07, + "loss": 1.6396, + "step": 23085 + }, + { + "epoch": 2.9, + "grad_norm": 5.650911331176758, + "learning_rate": 6.844329163703301e-07, + "loss": 0.1539, + "step": 23086 + }, + { + "epoch": 2.9, + "grad_norm": 15.165826797485352, + "learning_rate": 6.835962013136426e-07, + "loss": 0.5427, + "step": 23087 + }, + { + "epoch": 2.9, + "grad_norm": 8.462101936340332, + "learning_rate": 6.827594862569553e-07, + "loss": 0.9209, + "step": 23088 + }, + { + "epoch": 2.9, + "grad_norm": 18.40264892578125, + "learning_rate": 6.819227712002678e-07, + "loss": 1.3245, + "step": 23089 + }, + { + "epoch": 2.9, + "grad_norm": 10.905312538146973, + "learning_rate": 6.810860561435803e-07, + "loss": 0.5452, + "step": 23090 + }, + { + "epoch": 2.9, + "grad_norm": 29.667407989501953, + "learning_rate": 6.80249341086893e-07, + "loss": 1.4181, + "step": 23091 + }, + { + "epoch": 2.9, + "grad_norm": 6.203871726989746, + "learning_rate": 6.794126260302055e-07, + "loss": 0.3344, + "step": 23092 + }, + { + "epoch": 2.9, + "grad_norm": 21.73906707763672, + "learning_rate": 6.785759109735181e-07, + "loss": 0.9113, + "step": 23093 + }, + { + "epoch": 2.9, + "grad_norm": 23.47955322265625, + "learning_rate": 6.777391959168306e-07, + "loss": 0.0693, + "step": 23094 + }, + { + "epoch": 2.9, + "grad_norm": 9.022835731506348, + "learning_rate": 6.76902480860143e-07, + "loss": 1.6314, + "step": 23095 + }, + { + "epoch": 2.9, + "grad_norm": 14.276798248291016, + "learning_rate": 6.760657658034557e-07, + "loss": 0.7597, + "step": 23096 + }, + { + "epoch": 2.9, + "grad_norm": 11.97426986694336, + "learning_rate": 6.752290507467682e-07, + "loss": 0.2431, + "step": 23097 + }, + { + "epoch": 2.9, + "grad_norm": 7.717976093292236, + "learning_rate": 6.743923356900808e-07, + "loss": 0.8563, + "step": 23098 + }, + { + "epoch": 2.9, + "grad_norm": 15.67991828918457, + "learning_rate": 6.735556206333933e-07, + "loss": 1.0141, + "step": 23099 + }, + { + "epoch": 2.9, + "grad_norm": 5.017505645751953, + "learning_rate": 6.72718905576706e-07, + "loss": 0.6121, + "step": 23100 + }, + { + "epoch": 2.9, + "grad_norm": 6.8837151527404785, + "learning_rate": 6.718821905200185e-07, + "loss": 0.2891, + "step": 23101 + }, + { + "epoch": 2.9, + "grad_norm": 14.142083168029785, + "learning_rate": 6.71045475463331e-07, + "loss": 1.5944, + "step": 23102 + }, + { + "epoch": 2.9, + "grad_norm": 18.120759963989258, + "learning_rate": 6.702087604066435e-07, + "loss": 1.2532, + "step": 23103 + }, + { + "epoch": 2.9, + "grad_norm": 16.809154510498047, + "learning_rate": 6.693720453499561e-07, + "loss": 1.4368, + "step": 23104 + }, + { + "epoch": 2.9, + "grad_norm": 15.845349311828613, + "learning_rate": 6.685353302932687e-07, + "loss": 0.829, + "step": 23105 + }, + { + "epoch": 2.9, + "grad_norm": 15.640596389770508, + "learning_rate": 6.676986152365812e-07, + "loss": 1.0846, + "step": 23106 + }, + { + "epoch": 2.9, + "grad_norm": 11.92232894897461, + "learning_rate": 6.668619001798939e-07, + "loss": 0.4998, + "step": 23107 + }, + { + "epoch": 2.9, + "grad_norm": 14.522953987121582, + "learning_rate": 6.660251851232064e-07, + "loss": 0.672, + "step": 23108 + }, + { + "epoch": 2.9, + "grad_norm": 18.471738815307617, + "learning_rate": 6.651884700665189e-07, + "loss": 0.7952, + "step": 23109 + }, + { + "epoch": 2.9, + "grad_norm": 11.424614906311035, + "learning_rate": 6.643517550098315e-07, + "loss": 0.6084, + "step": 23110 + }, + { + "epoch": 2.9, + "grad_norm": 16.51778221130371, + "learning_rate": 6.635150399531439e-07, + "loss": 2.2651, + "step": 23111 + }, + { + "epoch": 2.9, + "grad_norm": 23.68822479248047, + "learning_rate": 6.626783248964566e-07, + "loss": 0.3113, + "step": 23112 + }, + { + "epoch": 2.9, + "grad_norm": 9.63133430480957, + "learning_rate": 6.618416098397691e-07, + "loss": 0.1978, + "step": 23113 + }, + { + "epoch": 2.9, + "grad_norm": 45.005104064941406, + "learning_rate": 6.610048947830816e-07, + "loss": 1.5576, + "step": 23114 + }, + { + "epoch": 2.9, + "grad_norm": 15.488126754760742, + "learning_rate": 6.601681797263942e-07, + "loss": 0.7878, + "step": 23115 + }, + { + "epoch": 2.9, + "grad_norm": 1.7126505374908447, + "learning_rate": 6.593314646697068e-07, + "loss": 0.1323, + "step": 23116 + }, + { + "epoch": 2.9, + "grad_norm": 6.407670497894287, + "learning_rate": 6.584947496130194e-07, + "loss": 0.1611, + "step": 23117 + }, + { + "epoch": 2.9, + "grad_norm": 34.24800491333008, + "learning_rate": 6.576580345563319e-07, + "loss": 0.7007, + "step": 23118 + }, + { + "epoch": 2.9, + "grad_norm": 231.00238037109375, + "learning_rate": 6.568213194996445e-07, + "loss": 2.245, + "step": 23119 + }, + { + "epoch": 2.9, + "grad_norm": 34.10979080200195, + "learning_rate": 6.55984604442957e-07, + "loss": 2.3559, + "step": 23120 + }, + { + "epoch": 2.9, + "grad_norm": 11.2767972946167, + "learning_rate": 6.551478893862695e-07, + "loss": 1.0564, + "step": 23121 + }, + { + "epoch": 2.9, + "grad_norm": 26.728120803833008, + "learning_rate": 6.543111743295821e-07, + "loss": 1.647, + "step": 23122 + }, + { + "epoch": 2.9, + "grad_norm": 13.866838455200195, + "learning_rate": 6.534744592728946e-07, + "loss": 0.576, + "step": 23123 + }, + { + "epoch": 2.9, + "grad_norm": 10.672852516174316, + "learning_rate": 6.526377442162073e-07, + "loss": 1.7061, + "step": 23124 + }, + { + "epoch": 2.9, + "grad_norm": 6.957767009735107, + "learning_rate": 6.518010291595198e-07, + "loss": 0.4779, + "step": 23125 + }, + { + "epoch": 2.9, + "grad_norm": 21.99214744567871, + "learning_rate": 6.509643141028324e-07, + "loss": 1.5281, + "step": 23126 + }, + { + "epoch": 2.9, + "grad_norm": 25.584535598754883, + "learning_rate": 6.501275990461449e-07, + "loss": 0.9848, + "step": 23127 + }, + { + "epoch": 2.9, + "grad_norm": 8.245368003845215, + "learning_rate": 6.492908839894573e-07, + "loss": 0.0568, + "step": 23128 + }, + { + "epoch": 2.9, + "grad_norm": 16.42379379272461, + "learning_rate": 6.4845416893277e-07, + "loss": 1.4548, + "step": 23129 + }, + { + "epoch": 2.9, + "grad_norm": 12.99228286743164, + "learning_rate": 6.476174538760825e-07, + "loss": 0.3139, + "step": 23130 + }, + { + "epoch": 2.9, + "grad_norm": 3.258230447769165, + "learning_rate": 6.467807388193951e-07, + "loss": 0.1501, + "step": 23131 + }, + { + "epoch": 2.9, + "grad_norm": 23.172542572021484, + "learning_rate": 6.459440237627077e-07, + "loss": 0.8088, + "step": 23132 + }, + { + "epoch": 2.9, + "grad_norm": 3.2802579402923584, + "learning_rate": 6.451073087060203e-07, + "loss": 0.0514, + "step": 23133 + }, + { + "epoch": 2.9, + "grad_norm": 11.514467239379883, + "learning_rate": 6.442705936493328e-07, + "loss": 0.3247, + "step": 23134 + }, + { + "epoch": 2.9, + "grad_norm": 8.98300552368164, + "learning_rate": 6.434338785926453e-07, + "loss": 1.1936, + "step": 23135 + }, + { + "epoch": 2.9, + "grad_norm": 26.330307006835938, + "learning_rate": 6.42597163535958e-07, + "loss": 1.6227, + "step": 23136 + }, + { + "epoch": 2.9, + "grad_norm": 17.73244857788086, + "learning_rate": 6.417604484792704e-07, + "loss": 0.9954, + "step": 23137 + }, + { + "epoch": 2.9, + "grad_norm": 15.236470222473145, + "learning_rate": 6.40923733422583e-07, + "loss": 0.7773, + "step": 23138 + }, + { + "epoch": 2.9, + "grad_norm": 68.36768341064453, + "learning_rate": 6.400870183658955e-07, + "loss": 2.8554, + "step": 23139 + }, + { + "epoch": 2.9, + "grad_norm": 8.454971313476562, + "learning_rate": 6.39250303309208e-07, + "loss": 0.7537, + "step": 23140 + }, + { + "epoch": 2.9, + "grad_norm": 10.701157569885254, + "learning_rate": 6.384135882525207e-07, + "loss": 0.7045, + "step": 23141 + }, + { + "epoch": 2.9, + "grad_norm": 13.48745059967041, + "learning_rate": 6.375768731958332e-07, + "loss": 0.2927, + "step": 23142 + }, + { + "epoch": 2.9, + "grad_norm": 32.00803756713867, + "learning_rate": 6.367401581391458e-07, + "loss": 3.1151, + "step": 23143 + }, + { + "epoch": 2.9, + "grad_norm": 10.901165962219238, + "learning_rate": 6.359034430824583e-07, + "loss": 0.5296, + "step": 23144 + }, + { + "epoch": 2.9, + "grad_norm": 14.928459167480469, + "learning_rate": 6.35066728025771e-07, + "loss": 1.511, + "step": 23145 + }, + { + "epoch": 2.9, + "grad_norm": 7.383054733276367, + "learning_rate": 6.342300129690834e-07, + "loss": 0.3177, + "step": 23146 + }, + { + "epoch": 2.9, + "grad_norm": 7.198001861572266, + "learning_rate": 6.333932979123959e-07, + "loss": 0.2922, + "step": 23147 + }, + { + "epoch": 2.91, + "grad_norm": 6.210714340209961, + "learning_rate": 6.325565828557086e-07, + "loss": 0.8739, + "step": 23148 + }, + { + "epoch": 2.91, + "grad_norm": 43.29275894165039, + "learning_rate": 6.317198677990211e-07, + "loss": 1.1105, + "step": 23149 + }, + { + "epoch": 2.91, + "grad_norm": 15.920175552368164, + "learning_rate": 6.308831527423337e-07, + "loss": 1.0372, + "step": 23150 + }, + { + "epoch": 2.91, + "grad_norm": 8.938474655151367, + "learning_rate": 6.300464376856462e-07, + "loss": 0.8599, + "step": 23151 + }, + { + "epoch": 2.91, + "grad_norm": 13.253548622131348, + "learning_rate": 6.292097226289589e-07, + "loss": 1.8187, + "step": 23152 + }, + { + "epoch": 2.91, + "grad_norm": 14.02438735961914, + "learning_rate": 6.283730075722714e-07, + "loss": 0.7358, + "step": 23153 + }, + { + "epoch": 2.91, + "grad_norm": 12.823583602905273, + "learning_rate": 6.275362925155838e-07, + "loss": 0.6888, + "step": 23154 + }, + { + "epoch": 2.91, + "grad_norm": 18.59247398376465, + "learning_rate": 6.266995774588964e-07, + "loss": 0.6411, + "step": 23155 + }, + { + "epoch": 2.91, + "grad_norm": 12.513319969177246, + "learning_rate": 6.258628624022089e-07, + "loss": 1.2862, + "step": 23156 + }, + { + "epoch": 2.91, + "grad_norm": 185.5263671875, + "learning_rate": 6.250261473455216e-07, + "loss": 2.3999, + "step": 23157 + }, + { + "epoch": 2.91, + "grad_norm": 11.595245361328125, + "learning_rate": 6.241894322888341e-07, + "loss": 1.5035, + "step": 23158 + }, + { + "epoch": 2.91, + "grad_norm": 17.20775604248047, + "learning_rate": 6.233527172321467e-07, + "loss": 0.8866, + "step": 23159 + }, + { + "epoch": 2.91, + "grad_norm": 68.4419174194336, + "learning_rate": 6.225160021754592e-07, + "loss": 0.2079, + "step": 23160 + }, + { + "epoch": 2.91, + "grad_norm": 6.003311634063721, + "learning_rate": 6.216792871187718e-07, + "loss": 0.1097, + "step": 23161 + }, + { + "epoch": 2.91, + "grad_norm": 55.0208854675293, + "learning_rate": 6.208425720620843e-07, + "loss": 2.2685, + "step": 23162 + }, + { + "epoch": 2.91, + "grad_norm": 11.7134428024292, + "learning_rate": 6.200058570053968e-07, + "loss": 0.8954, + "step": 23163 + }, + { + "epoch": 2.91, + "grad_norm": 19.41765022277832, + "learning_rate": 6.191691419487094e-07, + "loss": 2.6211, + "step": 23164 + }, + { + "epoch": 2.91, + "grad_norm": 14.811298370361328, + "learning_rate": 6.18332426892022e-07, + "loss": 0.5761, + "step": 23165 + }, + { + "epoch": 2.91, + "grad_norm": 7.661105632781982, + "learning_rate": 6.174957118353345e-07, + "loss": 0.5413, + "step": 23166 + }, + { + "epoch": 2.91, + "grad_norm": 11.37485408782959, + "learning_rate": 6.166589967786471e-07, + "loss": 0.409, + "step": 23167 + }, + { + "epoch": 2.91, + "grad_norm": 7.339240550994873, + "learning_rate": 6.158222817219597e-07, + "loss": 0.8778, + "step": 23168 + }, + { + "epoch": 2.91, + "grad_norm": 13.293191909790039, + "learning_rate": 6.149855666652722e-07, + "loss": 0.8464, + "step": 23169 + }, + { + "epoch": 2.91, + "grad_norm": 5.022184371948242, + "learning_rate": 6.141488516085847e-07, + "loss": 0.1938, + "step": 23170 + }, + { + "epoch": 2.91, + "grad_norm": 11.445906639099121, + "learning_rate": 6.133121365518973e-07, + "loss": 1.5553, + "step": 23171 + }, + { + "epoch": 2.91, + "grad_norm": 17.587867736816406, + "learning_rate": 6.124754214952098e-07, + "loss": 0.9976, + "step": 23172 + }, + { + "epoch": 2.91, + "grad_norm": 26.690670013427734, + "learning_rate": 6.116387064385224e-07, + "loss": 0.836, + "step": 23173 + }, + { + "epoch": 2.91, + "grad_norm": 20.364336013793945, + "learning_rate": 6.10801991381835e-07, + "loss": 1.8699, + "step": 23174 + }, + { + "epoch": 2.91, + "grad_norm": 15.160350799560547, + "learning_rate": 6.099652763251475e-07, + "loss": 0.4064, + "step": 23175 + }, + { + "epoch": 2.91, + "grad_norm": 288.4300842285156, + "learning_rate": 6.091285612684601e-07, + "loss": 0.857, + "step": 23176 + }, + { + "epoch": 2.91, + "grad_norm": 4.267444610595703, + "learning_rate": 6.082918462117727e-07, + "loss": 0.3047, + "step": 23177 + }, + { + "epoch": 2.91, + "grad_norm": 23.117055892944336, + "learning_rate": 6.074551311550851e-07, + "loss": 0.9334, + "step": 23178 + }, + { + "epoch": 2.91, + "grad_norm": 4.963648796081543, + "learning_rate": 6.066184160983977e-07, + "loss": 0.639, + "step": 23179 + }, + { + "epoch": 2.91, + "grad_norm": 81.25936126708984, + "learning_rate": 6.057817010417103e-07, + "loss": 1.1255, + "step": 23180 + }, + { + "epoch": 2.91, + "grad_norm": 205.37783813476562, + "learning_rate": 6.049449859850229e-07, + "loss": 1.3728, + "step": 23181 + }, + { + "epoch": 2.91, + "grad_norm": 13.395986557006836, + "learning_rate": 6.041082709283354e-07, + "loss": 1.2588, + "step": 23182 + }, + { + "epoch": 2.91, + "grad_norm": 14.80759334564209, + "learning_rate": 6.03271555871648e-07, + "loss": 1.804, + "step": 23183 + }, + { + "epoch": 2.91, + "grad_norm": 8.79738712310791, + "learning_rate": 6.024348408149605e-07, + "loss": 0.223, + "step": 23184 + }, + { + "epoch": 2.91, + "grad_norm": 18.706571578979492, + "learning_rate": 6.015981257582731e-07, + "loss": 0.8289, + "step": 23185 + }, + { + "epoch": 2.91, + "grad_norm": 13.420476913452148, + "learning_rate": 6.007614107015857e-07, + "loss": 1.9154, + "step": 23186 + }, + { + "epoch": 2.91, + "grad_norm": 10.114333152770996, + "learning_rate": 5.999246956448982e-07, + "loss": 0.3806, + "step": 23187 + }, + { + "epoch": 2.91, + "grad_norm": 5.580753803253174, + "learning_rate": 5.990879805882107e-07, + "loss": 0.219, + "step": 23188 + }, + { + "epoch": 2.91, + "grad_norm": 15.963912963867188, + "learning_rate": 5.982512655315232e-07, + "loss": 0.7323, + "step": 23189 + }, + { + "epoch": 2.91, + "grad_norm": 14.737431526184082, + "learning_rate": 5.974145504748358e-07, + "loss": 0.672, + "step": 23190 + }, + { + "epoch": 2.91, + "grad_norm": 10.397017478942871, + "learning_rate": 5.965778354181484e-07, + "loss": 0.2714, + "step": 23191 + }, + { + "epoch": 2.91, + "grad_norm": 78.07597351074219, + "learning_rate": 5.95741120361461e-07, + "loss": 1.9645, + "step": 23192 + }, + { + "epoch": 2.91, + "grad_norm": 10.356441497802734, + "learning_rate": 5.949044053047736e-07, + "loss": 0.6972, + "step": 23193 + }, + { + "epoch": 2.91, + "grad_norm": 174.6509552001953, + "learning_rate": 5.940676902480861e-07, + "loss": 1.2032, + "step": 23194 + }, + { + "epoch": 2.91, + "grad_norm": 19.284364700317383, + "learning_rate": 5.932309751913985e-07, + "loss": 1.3186, + "step": 23195 + }, + { + "epoch": 2.91, + "grad_norm": 3.266484260559082, + "learning_rate": 5.923942601347111e-07, + "loss": 0.0663, + "step": 23196 + }, + { + "epoch": 2.91, + "grad_norm": 11.210550308227539, + "learning_rate": 5.915575450780237e-07, + "loss": 0.6201, + "step": 23197 + }, + { + "epoch": 2.91, + "grad_norm": 99.94712829589844, + "learning_rate": 5.907208300213363e-07, + "loss": 2.5873, + "step": 23198 + }, + { + "epoch": 2.91, + "grad_norm": 72.69721984863281, + "learning_rate": 5.898841149646489e-07, + "loss": 0.947, + "step": 23199 + }, + { + "epoch": 2.91, + "grad_norm": 1.3036818504333496, + "learning_rate": 5.890473999079614e-07, + "loss": 0.0282, + "step": 23200 + }, + { + "epoch": 2.91, + "eval_loss": 0.07849589735269547, + "eval_runtime": 96.2443, + "eval_samples_per_second": 36.802, + "eval_steps_per_second": 36.802, + "step": 23200 } ], "logging_steps": 1, - "max_steps": 9978, + "max_steps": 23904, "num_input_tokens_seen": 0, "num_train_epochs": 3, - "save_steps": 120, - "total_flos": 435510745758720.0, + "save_steps": 400, + "total_flos": 3277316217323520.0, "train_batch_size": 1, "trial_name": null, "trial_params": null